1
"""Comparison tools for Zinnia
2
Based on clustered_models app"""
5
from zinnia.settings import F_MIN
6
from zinnia.settings import F_MAX
9
def pearson_score(list1, list2):
    """Compute the pearson score between 2 lists of vectors

    The value returned is ``1.0 - r``, where ``r`` is the Pearson
    correlation coefficient of the two lists: 0.0 means perfectly
    correlated, 2.0 means perfectly anti-correlated.

    Both lists are expected to hold numbers and to have the same length.
    When the coefficient is undefined (empty input, or one of the lists
    has no variance) 0.0 is returned instead of dividing by zero.
    """
    # Imported locally so the function does not rely on a module-level
    # import being present.
    from math import sqrt

    size1 = len(list1)
    size2 = len(list2)
    if not size1 or not size2:
        return 0.0

    sum1 = sum(list1)
    sum2 = sum(list2)
    sum_sq1 = sum(pow(value, 2) for value in list1)
    sum_sq2 = sum(pow(value, 2) for value in list2)

    prod_sum = sum(left * right for left, right in zip(list1, list2))

    # float() forces a true division: with integer vectors a plain "/"
    # truncates on Python 2 and skews the score.
    num = prod_sum - (sum1 * sum2 / float(size1))
    spread = ((sum_sq1 - pow(sum1, 2) / float(size1)) *
              (sum_sq2 - pow(sum2, 2) / float(size2)))

    # Rounding errors can push a null spread slightly below zero,
    # which sqrt() would reject.
    den = sqrt(max(spread, 0.0))
    if not den:
        return 0.0

    return 1.0 - num / den
26
class ClusteredModel(object):
    """Wrapper around Model class
    building a dataset of instances"""

    def __init__(self, info_dict):
        """Read the configuration of the wrapped model

        ``info_dict`` may provide a 'queryset' (an empty list when
        missing) and the names of the 'fields' to extract (['id'] when
        missing).
        """
        # NOTE(review): dataset() calls .filter() on the queryset, which
        # the default empty list does not provide -- confirm that callers
        # always supply a 'queryset'.
        self.queryset = info_dict.get('queryset', [])
        self.fields = info_dict.get('fields', ['id'])

    def dataset(self):
        """Generate a dataset with the queryset
        and specified fields

        Return a dict mapping every item of the queryset to a string
        made of the values of its configured fields, joined by spaces.
        """
        try:
            to_text = unicode  # noqa: F821 -- text type on Python 2
        except NameError:
            to_text = str  # Python 3: str is already the text type

        dataset = {}
        # NOTE(review): .filter() without arguments presumably yields a
        # fresh copy of a Django queryset -- confirm.
        for item in self.queryset.filter():
            # Values are read from the instance __dict__, so only
            # attributes stored on the instance itself are reachable.
            dataset[item] = ' '.join([to_text(item.__dict__[field])
                                      for field in self.fields])
        return dataset
44
class VectorBuilder(object):
    """Build a list of vectors based on datasets"""

    def __init__(self, *models_conf):
        """Wrap every configuration into a ClusteredModel and build
        the initial dataset

        Each positional argument is a configuration dict handed over
        to ClusteredModel.
        """
        self.key = ''
        self.columns = []
        self.dataset = {}
        self.clustered_models = [ClusteredModel(conf) for conf in models_conf]
        self.build_dataset()

    def build_dataset(self):
        """Generate whole dataset

        Fill ``self.columns`` with the words whose frequency lies
        strictly between F_MIN and F_MAX, ``self.dataset`` with one
        vector of word counts per instance (ordered like the columns),
        and refresh ``self.key``.
        """
        data = {}
        words_total = {}

        for clustered_model in self.clustered_models:
            model_data = clustered_model.dataset()
            for instance, words in model_data.items():
                # Word counts restricted to this single instance.
                words_item_total = {}
                for word in words.split():
                    words_total[word] = words_total.get(word, 0) + 1
                    words_item_total[word] = words_item_total.get(word, 0) + 1
                data[instance] = words_item_total

        # Frequency is the number of occurrences of a word across all
        # instances divided by the number of instances. The loop is not
        # entered when there is no data, so len(data) cannot be zero here.
        top_words = []
        for word, count in words_total.items():
            frequency = float(count) / len(data)
            if F_MIN < frequency < F_MAX:
                top_words.append(word)

        # Start from a new dict so instances that disappeared since the
        # previous build do not linger.
        self.dataset = {}
        self.columns = top_words
        for instance, word_counts in data.items():
            self.dataset[instance] = [word_counts.get(word, 0)
                                      for word in top_words]
        self.key = self.generate_key()

    def generate_key(self):
        """Generate key for this list of vectors

        The key joins with '-' the number of items found in the
        queryset of every clustered model.
        """
        return '-'.join([str(c.queryset.filter().count())
                         for c in self.clustered_models])

    def flush(self):
        """Flush the dataset

        The dataset is rebuilt only when the freshly generated key
        differs from the one stored by the last build.
        """
        if self.key != self.generate_key():
            self.build_dataset()

    def __call__(self):
        """Return the columns and the dataset, rebuilding them
        beforehand when the key has changed"""
        self.flush()
        return self.columns, self.dataset