1 """Comparison tools for Zinnia
2 Based on clustered_models app"""
3 from math import sqrt
4
5 from zinnia.settings import F_MIN
6 from zinnia.settings import F_MAX
7
8
10 """Compute the pearson score between 2 lists of vectors"""
# NOTE(review): the `def` line owning this body is not visible in this view;
# `list1` and `list2` are presumably its two parameters - confirm.
# Per-list totals and totals of squared values.
11 sum1 = sum(list1)
12 sum2 = sum(list2)
13 sum_sq1 = sum([pow(l, 2) for l in list1])
14 sum_sq2 = sum([pow(l, 2) for l in list2])
15
# Sum of element-wise products. Indexing runs over len(list1), so list2 must
# be at least as long as list1, otherwise an IndexError is raised.
16 prod_sum = sum([list1[i] * list2[i] for i in range(len(list1))])
17
# The divisions by len(list1) / len(list2) raise ZeroDivisionError on empty
# input. NOTE(review): `/` acts on whatever numeric types the inputs hold;
# with Python 2 semantics integer operands would truncate - confirm callers.
18 num = prod_sum - (sum1 * sum2 / len(list1))
19 den = sqrt((sum_sq1 - pow(sum1, 2) / len(list1)) *
20 (sum_sq2 - pow(sum2, 2) / len(list2)))
# A zero denominator is reported as 0.0 instead of dividing by zero.
21 if den == 0:
22 return 0.0
# The result is 1.0 minus the num/den ratio, not the ratio itself: a ratio
# of 1 gives 0.0, a ratio of -1 gives 2.0.
23 return 1.0 - num / den
24
25
27 """Wrapper around Model class
28 building a dataset of instances"""
29
# Both settings are read from `info_dict` with fallbacks: an empty list for
# the queryset and ['id'] for the field names.
# NOTE(review): `self.queryset.filter()` is called elsewhere in this file;
# the `[]` fallback has no `.filter()` method, so that call would fail
# whenever 'queryset' is omitted - confirm that callers always supply it.
31 self.queryset = info_dict.get('queryset', [])
32 self.fields = info_dict.get('fields', ['id'])
33
35 """Generate a dataset with the queryset
36 and specified fields"""
# Returns a dict mapping each item of the queryset to a single string: the
# values of `self.fields` for that item, joined with one space.
37 dataset = {}
# NOTE(review): `.filter()` is called without arguments - presumably to
# obtain a fresh, unevaluated copy of the queryset; confirm.
38 for item in self.queryset.filter():
# Values are looked up in the instance `__dict__`, so a name in `self.fields`
# that is not stored there raises KeyError. `unicode` is the Python 2 text
# type; this line does not run unchanged on Python 3.
39 dataset[item] = ' '.join([unicode(item.__dict__[field])
40 for field in self.fields])
41 return dataset
42
43
45 """Build a list of vectors based on datasets"""
46
53
55 """Generate whole dataset"""
# data:        instance -> {word: occurrences of the word in that instance}
# words_total: word -> occurrences of the word summed over all instances
56 data = {}
57 words_total = {}
58
59 for clustered_model in self.clustered_models:
60 model_data = clustered_model.dataset()
# Each value of the model dataset is split on whitespace into words.
61 for instance, words in model_data.items():
62 words_item_total = {}
63 for word in words.split():
64 words_total.setdefault(word, 0)
65 words_item_total.setdefault(word, 0)
66 words_total[word] += 1
67 words_item_total[word] += 1
68 data[instance] = words_item_total
69
# Keep the words whose ratio lies strictly between F_MIN and F_MAX. The
# ratio is total occurrences divided by the number of instances, so a word
# repeated inside one instance counts several times and the ratio can
# exceed 1.
70 top_words = []
71 for word, count in words_total.items():
72 frequency = float(count) / len(data)
73 if frequency > F_MIN and frequency < F_MAX:
74 top_words.append(word)
75
# Publish the result on the instance: `columns` is the list of retained
# words and `dataset` maps each instance to its vector of per-word counts,
# in column order, with 0 for words the instance does not contain.
76 self.dataset = {}
77 self.columns = top_words
78 for instance in data.keys():
79 self.dataset[instance] = [data[instance].get(word, 0)
80 for word in top_words]
# The key is recomputed at the end of every build.
81 self.key = self.generate_key()
82
84 """Generate key for this list of vectors"""
# The key is the item count of each clustered model's queryset, rendered as
# strings and joined with '-'. One `.count()` call is issued per model.
85 return '-'.join([str(c.queryset.filter().count())
86 for c in self.clustered_models])
87
92
96