Package zinnia :: Module comparison
[hide private]

Source Code for Module zinnia.comparison

 1  """Comparison tools for Zinnia 
 2  Based on clustered_models app""" 
 3  from math import sqrt 
 4   
 5  from zinnia.settings import F_MIN 
 6  from zinnia.settings import F_MAX 
 7   
 8   
def pearson_score(list1, list2):
    """Compute the Pearson distance between two equal-length lists
    of numbers: ``1.0 - r`` where ``r`` is the Pearson correlation
    coefficient (so identical lists score 0.0, perfectly
    anti-correlated lists score 2.0).

    Returns 0.0 when the lists are empty or when either list has
    zero variance (the correlation is undefined in both cases).
    """
    n = len(list1)
    if not n:
        # Empty input: correlation is undefined; treat like the
        # zero-denominator case below instead of crashing.
        return 0.0
    sum1 = sum(list1)
    sum2 = sum(list2)
    sum_sq1 = sum(pow(l, 2) for l in list1)
    sum_sq2 = sum(pow(l, 2) for l in list2)

    # Sum of pairwise products; zip pairs the vectors directly
    # instead of indexing by position.
    prod_sum = sum(x * y for x, y in zip(list1, list2))

    num = prod_sum - (sum1 * sum2 / n)
    den = sqrt((sum_sq1 - pow(sum1, 2) / n) *
               (sum_sq2 - pow(sum2, 2) / len(list2)))
    if den == 0:
        # At least one list is constant: no variance, no correlation.
        return 0.0
    return 1.0 - num / den
24 25
class ClusteredModel(object):
    """Wrapper around a Model class that exposes its instances as a
    dataset mapping each instance to a whitespace-joined string of
    its selected field values."""

    def __init__(self, info_dict):
        # 'queryset' holds the instances to wrap; 'fields' lists the
        # attribute names used to describe each one (default: 'id').
        self.queryset = info_dict.get('queryset', [])
        self.fields = info_dict.get('fields', ['id'])

    def dataset(self):
        """Generate a dataset with the queryset
        and specified fields"""
        result = {}
        for instance in self.queryset.filter():
            values = [unicode(instance.__dict__[field])
                      for field in self.fields]
            result[instance] = ' '.join(values)
        return result
42 43
class VectorBuilder(object):
    """Build a list of vectors based on the datasets of one or more
    ClusteredModel configurations.

    Each model instance is mapped to a vector of word counts,
    restricted to the words whose document frequency lies strictly
    between F_MIN and F_MAX.
    """

    def __init__(self, *models_conf):
        # key: fingerprint of the querysets used to build the dataset.
        # columns: the retained vocabulary, one word per vector column.
        # dataset: instance -> word-count vector over `columns`.
        self.key = ''
        self.columns = []
        self.dataset = {}
        self.clustered_models = [ClusteredModel(conf)
                                 for conf in models_conf]
        self.build_dataset()

    def build_dataset(self):
        """Generate the whole dataset: count words per instance and
        globally, keep only words of intermediate frequency, then
        project each instance onto that vocabulary."""
        data = {}
        words_total = {}

        for clustered_model in self.clustered_models:
            model_data = clustered_model.dataset()
            for instance, words in model_data.items():
                words_item_total = {}
                for word in words.split():
                    words_total.setdefault(word, 0)
                    words_item_total.setdefault(word, 0)
                    words_total[word] += 1
                    words_item_total[word] += 1
                data[instance] = words_item_total

        top_words = []
        if data:  # guard: empty querysets would divide by zero below
            for word, count in words_total.items():
                # Fraction of instances the word appears in; keep only
                # words that are neither too rare nor too common.
                frequency = float(count) / len(data)
                if F_MIN < frequency < F_MAX:
                    top_words.append(word)

        self.dataset = {}
        self.columns = top_words
        for instance, counts in data.items():
            self.dataset[instance] = [counts.get(word, 0)
                                      for word in top_words]
        self.key = self.generate_key()

    def generate_key(self):
        """Generate a key for this list of vectors, based on the
        current object count of each wrapped queryset."""
        return '-'.join([str(c.queryset.filter().count())
                         for c in self.clustered_models])

    def flush(self):
        """Rebuild the dataset if the underlying querysets changed
        since it was last built."""
        if self.key != self.generate_key():
            self.build_dataset()

    def __call__(self):
        # Callable interface: refresh if stale, then return the
        # vocabulary and the vectors.
        self.flush()
        return self.columns, self.dataset
96