zinnia.comparison
Covered: 76 lines
Missed: 0 lines
Skipped 20 lines
Percent: 100 %
 1
"""Comparison tools for Zinnia
 2
Based on clustered_models app"""
 3
from math import sqrt
 5
from zinnia.settings import F_MIN
 6
from zinnia.settings import F_MAX
 9
def pearson_score(list1, list2):
    """Compute the Pearson distance between 2 lists of vectors.

    Returns ``1.0 - r`` where ``r`` is the Pearson correlation
    coefficient: identical lists score 0.0, perfectly anti-correlated
    lists score 2.0.  Both lists are expected to have the same length.
    When the denominator is zero (a constant input list), 0.0 is
    returned instead of raising.
    """
    size = len(list1)
    sum1 = sum(list1)
    sum2 = sum(list2)
    # Sums of squares and of pairwise products, computed lazily
    # (no intermediate lists) and paired with zip instead of
    # indexing through range(len(...)).
    sum_sq1 = sum(x * x for x in list1)
    sum_sq2 = sum(y * y for y in list2)
    prod_sum = sum(x * y for x, y in zip(list1, list2))

    num = prod_sum - (sum1 * sum2 / size)
    den = sqrt((sum_sq1 - pow(sum1, 2) / size) *
               (sum_sq2 - pow(sum2, 2) / size))
    if den == 0:
        return 0.0
    return 1.0 - num / den
26
class ClusteredModel(object):
    """Wrap a model so its instances can be flattened into a
    text dataset (one string of field values per instance)."""

    def __init__(self, info_dict):
        """Store the queryset and the list of field names to extract.

        Defaults: an empty queryset and the single field ``'id'``.
        """
        self.queryset = info_dict.get('queryset', [])
        self.fields = info_dict.get('fields', ['id'])

    def dataset(self):
        """Map each instance of the queryset to one string made of
        the space-joined values of the selected fields."""
        result = {}
        for entry in self.queryset.filter():
            attrs = entry.__dict__
            result[entry] = ' '.join(unicode(attrs[name])
                                     for name in self.fields)
        return result
44
class VectorBuilder(object):
    """Build a list of vectors based on datasets"""

    def __init__(self, *models_conf):
        """Create one ClusteredModel per configuration dict,
        then compute the initial dataset immediately."""
        self.key = ''
        self.columns = []
        self.dataset = {}
        self.clustered_models = [ClusteredModel(conf)
                                 for conf in models_conf]
        self.build_dataset()

    def build_dataset(self):
        """Generate the whole dataset.

        Counts word occurrences per instance and globally, keeps as
        columns the words whose document frequency lies strictly
        between F_MIN and F_MAX, then stores one count vector per
        instance over those columns.
        """
        per_instance = {}
        global_counts = {}

        for model in self.clustered_models:
            for instance, text in model.dataset().items():
                local_counts = {}
                for token in text.split():
                    global_counts[token] = global_counts.get(token, 0) + 1
                    local_counts[token] = local_counts.get(token, 0) + 1
                per_instance[instance] = local_counts

        # Keep only words that are neither too rare nor too frequent
        # across all collected instances.
        top_words = [token for token, total in global_counts.items()
                     if F_MIN < float(total) / len(per_instance) < F_MAX]

        self.dataset = {}
        self.columns = top_words
        for instance, counts in per_instance.items():
            self.dataset[instance] = [counts.get(token, 0)
                                      for token in top_words]
        self.key = self.generate_key()

    def generate_key(self):
        """Generate key for this list of vectors: the dash-joined
        queryset counts of every clustered model."""
        counts = (str(model.queryset.filter().count())
                  for model in self.clustered_models)
        return '-'.join(counts)

    def flush(self):
        """Rebuild the dataset when the queryset counts changed."""
        if self.generate_key() != self.key:
            self.build_dataset()

    def __call__(self):
        # Refresh if stale, then expose the columns and vectors.
        self.flush()
        return self.columns, self.dataset