# Making recommendations
from math import sqrt
# Collaborative filtering
## Collecting preferences
# Nested mapping: critic name -> {movie title -> rating}.
# Commented-out titles are entries that are left out for that critic.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.0,
        'The Night Listener': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        # 'Just My Luck': 1.5,
        'Superman Returns': 3.5,
        # 'You, Me and Dupree': 3.0,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Lady in the Water': 4.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 4.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'You, Me and Dupree': 2.0,
        'The Night Listener': 3.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        # 'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
        'The Night Listener': 3.0,
    },
    'Toby': {
        # 'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.5,
        # 'Just My Luck': 1.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 1.0,
        # 'The Night Listener': 3.5,
    },
}
## Finding similar users
# Metrics that can be used for the comparison:
# http://en.wikipedia.org/wiki/Metric_%28mathematics%29#Examples
### Euclidean distance score
def sim_distance(prefs, person1, person2):
    '''Return a distance-based similarity score for person1 and person2.

    The score is

        1 / (1 + sqrt(sum((p1[item] - p2[item]) ** 2 for shared items)))

    so identical ratings on the shared items give 1.0 and the score
    shrinks towards 0 as the ratings move apart.

    prefs   -- mapping of person -> {item: rating}
    person1 -- key into prefs
    person2 -- key into prefs

    Returns 0 when the two people have no rated item in common.
    Raises KeyError if either person is missing from prefs.
    '''
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]
    # items rated by both people
    shared = [item for item in ratings1 if item in ratings2]
    # With nothing in common there is no basis for comparison; without this
    # guard the empty sum below would yield a perfect score of 1.
    if not shared:
        return 0
    # add up the squares of all the differences
    sum_of_squares = sum(pow(ratings1[item] - ratings2[item], 2)
                         for item in shared)
    return 1 / (1 + sqrt(sum_of_squares))
### Pearson correlation score
def sim_pearson(prefs, p1, p2):
    '''Return the Pearson correlation coefficient for p1 and p2.

    Only the items rated by both people are used. With x and y the two
    people's ratings of those items and n the number of shared items:

        num = sum(x * y) - sum(x) * sum(y) / n
        den = sqrt((sum(x ** 2) - sum(x) ** 2 / n)
                   * (sum(y ** 2) - sum(y) ** 2 / n))
        r   = num / den

    prefs -- mapping of person -> {item: rating}
    p1    -- key into prefs
    p2    -- key into prefs

    Returns 0 when there are no shared items, or when the denominator is
    zero (e.g. one person gave every shared item the same rating).
    Raises KeyError if either person is missing from prefs.
    '''
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]
    # the mutually rated items
    shared = [it for it in ratings1 if it in ratings2]
    # if there are no ratings in common, return 0
    if not shared:
        return 0
    # float so that integer ratings do not floor-divide on Python 2
    n = float(len(shared))
    # add up all the preferences
    sum1 = sum(ratings1[it] for it in shared)
    sum2 = sum(ratings2[it] for it in shared)
    # sum up the squares
    sum1Sq = sum(pow(ratings1[it], 2) for it in shared)
    sum2Sq = sum(pow(ratings2[it], 2) for it in shared)
    # sum up the products
    pSum = sum(ratings1[it] * ratings2[it] for it in shared)
    # calculate the pearson score
    num = pSum - (sum1 * sum2 / n)
    den_sq = (sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n)
    # Mathematically den_sq >= 0, but floating-point rounding can leave it
    # marginally negative, and sqrt() of a negative raises ValueError.
    if den_sq <= 0:
        return 0
    return num / sqrt(den_sq)
### Ranking the critics
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    '''Return the best matches for person from the prefs dictionary.

    The number of results (n) and the similarity function are optional.
    The result is a list of (score, other) tuples, highest score first,
    holding at most n entries; person itself is never included.
    '''
    ranked = []
    for other in prefs:
        if other != person:
            ranked.append((similarity(prefs, person, other), other))
    # highest scores first
    ranked = sorted(ranked, reverse=True)
    return ranked[:n]
## Recommending items
def getRecommendations(prefs, person, similarity=sim_pearson):
    '''User-based recommendations for person.

    Each candidate item is scored with a weighted average of every other
    user's rating of it, the weight being that user's similarity to person.
    Returns a list of (score, item) tuples sorted from highest to lowest.
    '''
    weighted = {}   # item -> sum of rating * similarity
    weights = {}    # item -> sum of similarities
    for other in prefs:
        # never compare person with themselves
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # users with a zero or negative similarity contribute nothing
        if sim <= 0:
            continue
        for item in prefs[other]:
            # skip anything person has already given a non-zero rating
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            weighted[item] = weighted.get(item, 0) + prefs[other][item] * sim
            weights[item] = weights.get(item, 0) + sim
    # normalise each total by the similarities that went into it
    rankings = []
    for item, total in weighted.items():
        rankings.append((total / weights[item], item))
    rankings.sort(reverse=True)
    return rankings
## Matching products -- finding out which items are close to each other
# Builds, for every item, the set of ratings each person gave it.
def transformPrefs(prefs):
    '''Flip a person -> {item: rating} mapping into item -> {person: rating}.'''
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # swap the roles of item and person
            flipped.setdefault(item, {})[person] = rating
    return flipped
## 构建一个基于del.icio.us的链接推荐系统 -- building a del.icio.us link recommender
### the del.icio.us API
### building the dataset
### recommending neighbors and links
## Item-Based Filtering
# `s` holds a free-form note (kept verbatim, in Chinese). In English it reads:
#   Overall idea: precompute, for every item, the other items most similar
#   to it. Then, when we want to make recommendations for a user, we look at
#   the items that user has rated, pick the top-ranked ones, and build a
#   weighted list containing the other items most similar to those picks.
s = '''
总体思路:为每件物品预先计算好最为相近的其他物品。
然后,当我们想为某位用户提供推荐时,就可以查看他曾经评过分的物品,
并从中选出排位靠前者,在构造出一个加权列表,
其中包含了与这些选中物品最为相近的其他物品
'''
### Building the item comparison dataset
def calculateSimilarItems(prefs, n=10):
    '''Map every item to the other items it is most similar to.

    prefs -- mapping of person -> {item: rating}
    n     -- maximum number of similar items kept per item (default 10)

    Returns a dict of item -> list of (score, other_item) tuples as produced
    by topMatches using sim_distance on the item-centric matrix.
    A progress line "<done> / <total>" is printed every 100 items.
    '''
    result = {}
    # invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)
    for count, item in enumerate(itemPrefs, 1):
        # status updates for large datasets; the parenthesised single-argument
        # form prints the same text on Python 2 and Python 3
        if count % 100 == 0:
            print('%d / %d' % (count, total))
        # find the most similar items to this one
        result[item] = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
    return result
### Getting recommendations
def getRecommendedItems(prefs, itemMatch, user):
    '''Item-based recommendations for user.

    prefs     -- mapping of person -> {item: rating}
    itemMatch -- mapping of item -> list of (similarity, other_item) tuples
    user      -- key into prefs

    For every item the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings of the items that
    list it as a match. Returns a list of (predicted_rating, item) tuples
    sorted from highest to lowest. Candidates whose similarities add up to
    zero cannot be averaged and are left out.

    Raises KeyError if user is missing from prefs or a rated item is
    missing from itemMatch.
    '''
    userRatings = prefs[user]
    scores = {}     # candidate -> sum of similarity * rating
    totalSim = {}   # candidate -> sum of similarities
    # loop over items rated by this user
    for item, rating in userRatings.items():
        # loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            scores[item2] = scores.get(item2, 0) + similarity * rating
            totalSim[item2] = totalSim.get(item2, 0) + similarity
    # Divide each total score by its total weighting to get an average.
    # A zero weighting would raise ZeroDivisionError, so such candidates
    # are dropped.
    rankings = [(score / totalSim[item], item)
                for item, score in scores.items()
                if totalSim[item] != 0]
    # return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings