# Data clustering: a method for discovering groups of closely related things, people, or ideas, and for visualizing them.
##Supervised versus Unsupervised Learning
# Supervised learning: techniques that use example inputs and desired outputs to learn how to make predictions. Examples: neural networks, decision trees, support vector machines, and Bayesian filtering.
# Unsupervised learning: not trained with example data that carries correct answers. Its goal is to find structure within a set of data, where the data itself is not the answer being sought. Examples: clustering, non-negative matrix factorization, and self-organizing maps.
##Word Vectors
# A common way to prepare data for clustering is to define a shared set of numerical attributes that can be used to compare the items.
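# For instance, a minimal sketch of the kind of data this produces (the blog names
# and counts below are made up; the real matrix is written to blogdata.txt by
# generatefeedvector() further down):
example_wordcounts = {
    'Blog A': {'python': 12, 'cluster': 3, 'kernel': 0},
    'Blog B': {'python': 2, 'cluster': 0, 'kernel': 7},
}
example_wordlist = ['python', 'cluster', 'kernel']
# each blog becomes a numeric vector over the shared word list,
# so any two blogs can be compared attribute by attribute
example_vectors = [[example_wordcounts[blog].get(w, 0) for w in example_wordlist]
                   for blog in ('Blog A', 'Blog B')]
#print example_vectors   # [[12, 3, 0], [2, 0, 7]]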
###Pigeonholing the Bloggers
# Cluster blogs according to the frequencies of the words they use.
###Counting the Words in a Feed
# generatefeedvector.py
import feedparser
import re
# returns title and dictionary of word counts for an RSS feed
def getwordcounts(url):
# parse the feed
d = feedparser.parse(url)
wc = {}
# loop over all the entries
for e in d.entries:
summary = e.summary if 'summary' in e else\
e.description
# extract a list of words
words = getwords(e.title+' '+summary)
for word in words:
wc.setdefault(word, 0)
wc[word] += 1
return d.feed.title, wc
def getwords(html):
# remove all the HTML tags
txt = re.compile(r'<[^>]+>').sub('', html)
# split words by all non-alpha characters
words = re.compile(r'[^A-Z^a-z]+').split(txt)
# convert to lowercase
return [word.lower() for word in words if word != '']
def generatefeedvector():
    '''Loop over every feed in feedlist.txt and generate the dataset.'''
    # per-blog word counts, plus the number of blogs each word appears in
apcount = {}
wordcounts = {}
feedlist = [line for line in file('feedlist.txt')]
for feedurl in feedlist:
title, wc = getwordcounts(feedurl)
wordcounts[title] = wc
for word, count in wc.items():
apcount.setdefault(word, 0)
if count > 1:
apcount[word] += 1
    # Build the word list that will actually be used for the per-blog counts:
    # keep only words that appear in between 10% and 50% of the blogs
wordlist = []
for w, bc in apcount.items():
frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5: wordlist.append(w)
out = file('blogdata.txt', 'w')
out.write('Blog')
for word in wordlist: out.write('\t%s' % word)
out.write('\n')
for blog, wc in wordcounts.items():
out.write(blog)
for word in wordlist:
if word in wc: out.write('\t%d' % wc[word])
else: out.write('\t0')
out.write('\n')
##Hierarchical Clustering
# Builds a hierarchy of groups by repeatedly merging the two most similar groups; every group starts out as a single item.
# A dendrogram is one way to visualize the result of hierarchical clustering.
# clusters.py
def readfile(filename):
lines = [line for line in file(filename)]
# first line is column titles
colnames = lines[0].strip().split('\t')[1:]
rownames = []
data = []
for line in lines[1:]:
p = line.strip().split('\t')
# first column in each row is the rowname
rownames.append(p[0])
# the data for this row is the remainder of the row
data.append([float(x) for x in p[1:]])
return rownames, colnames, data
from math import sqrt
def pearson(v1, v2):
# simple sums
sum1 = sum(v1)
sum2 = sum(v2)
# sums of the squares
sum1Sq = sum([pow(v,2) for v in v1])
sum2Sq = sum([pow(v,2) for v in v2])
# sum of the products
pSum = sum([v1[i]*v2[i] for i in range(len(v1))])
# calculate r (Pearson score)
num = pSum - (sum1*sum2/len(v1))
den = sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
if den == 0: return 0
    return 1.0 - num/den  # 1 - r, so that smaller values mean more similar
class bicluster:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
self.left = left
self.right = right
self.vec = vec
self.id = id
self.distance = distance
def hcluster(rows, distance=pearson):
distances = {}
currentclustid = -1
# clusters are initially just the rows
clust = [bicluster(rows[i],id=i) for i in range(len(rows))]
while len(clust)>1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
# loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i+1, len(clust)):
# distances is the cache of distance calculations
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
closest = d
lowestpair = (i, j)
# calculate the average of the two clusters
mergevec = [
(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
# create the new cluster
newcluster = bicluster(mergevec,
left=clust[lowestpair[0]],
right=clust[lowestpair[1]],
distance=closest,
id=currentclustid)
# cluster ids that weren't in the original set are negative
currentclustid -= 1
del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]
#print 'Running hcluster'
#blognames, words, data = readfile('blogdata.txt')
#clust = hcluster(data)
def printclust(clust, labels=None, n=0):
# indent to make a hierarchy layout
print ' '*n,
if clust.id < 0:
        # a negative id means this is a branch
print '-'
else:
# positive id means that this is an endpoint
print clust.id if labels==None else labels[clust.id]
# now print the right and left branches
if clust.left != None: printclust(clust.left, labels=labels, n=n+1)
if clust.right!= None: printclust(clust.right, labels=labels, n=n+1)
#print 'Printing the cluster tree'
#printclust(clust, labels=blognames)
##Drawing the Dendrogram
from PIL import Image, ImageDraw
def getheight(clust):
# is this an endpoint? then the height is just 1
if clust.left == None and clust.right == None: return 1
    # otherwise the height is the sum of the heights of
    # each branch
return getheight(clust.left) + getheight(clust.right)
def getdepth(clust):
# the distance of an endpoint is 0.0
if clust.left == None and clust.right == None: return 0
# the distance of branch is the greater of its two sides
# plus its own distance
return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
def drawdendrogram(clust, labels, jpeg='clusters.jpg'):
# height and width
h = getheight(clust) * 20
w = 1200
depth = getdepth(clust)
# width is fixed, so scale distances accordingly
scaling = float(w-150)/depth
# create a new image with a white background
img = Image.new('RGB', (w,h), (255,255,255))
draw = ImageDraw.Draw(img)
draw.line((0, h/2, 10, h/2), fill=(255,0,0))
# draw the first node
drawnode(draw, clust, 10, (h/2), scaling, labels)
img.save(jpeg, 'JPEG')
def drawnode(draw, clust, x, y, scaling, labels):
if clust.id < 0:
h1 = getheight(clust.left)*20
h2 = getheight(clust.right)*20
top = y - (h1 + h2)/2
bottom = y + (h1 + h2)/2
# line length
ll = clust.distance * scaling
# vertical line from this cluster to children
draw.line((x, top+h1/2, x, bottom-h2/2), fill=(255,0,0))
# horizontal line to left item
draw.line((x, top+h1/2, x+ll, top+h1/2), fill=(255,0,0))
# horizontal line to right item
draw.line((x, bottom-h2/2, x+ll, bottom-h2/2), fill=(255,0,0))
# call the function to draw the left and right nodes
drawnode(draw, clust.left, x+ll, top+h1/2, scaling, labels)
drawnode(draw, clust.right, x+ll, bottom-h2/2, scaling, labels)
else:
# if this is an endpoint, draw the item label
draw.text((x+5,y-7), labels[clust.id], (0,0,0))
#print 'Generating the dendrogram image'
#drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
##Column Clustering
def rotatematrix(data):
    '''Rotate the matrix so that each row gives the counts of one word across every blog.'''
newdata = []
for i in range(len(data[0])):
newrow = [data[j][i] for j in range(len(data))]
newdata.append(newrow)
return newdata
#print 'Column clustering -- drawing the dendrogram'
#rdata = rotatematrix(data)
#wordclust = hcluster(rdata)
#drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
# An important point about clustering: when there are many more items than variables, the likelihood of producing meaningless clusters increases.
# Word clustering reveals groups of words that bloggers tend to use together when discussing online services or Internet-related topics.
# It can also surface clusters that reflect usage patterns (e.g. fact, us, say, very, and think), which suggest an opinionated writing style.
# Drawbacks of hierarchical clustering:
# 1. Without additional work, the tree view does not actually break the data into distinct groups.
# 2. The algorithm is computationally expensive: comparing every pair of clusters is already O(n**2) per merge step, and the merging repeats roughly n times (the distance cache only avoids recomputing scores).
##K-Means Clustering
# K-means clustering: you tell the algorithm in advance how many clusters you want, and it determines the size of each cluster from the structure of the data.
# The algorithm first places k centroids at random (points in space that represent the centers of the clusters),
# then assigns every item to the nearest centroid.
# After the assignment, each centroid is moved to the average position of all the items assigned to it,
# and the whole assignment process starts over, repeating until the assignments stop changing.
import random
def kcluster(rows, distance=pearson, k=4):
    # determine the minimum and maximum values for each column
ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows]))
for i in range(len(rows[0]))]
# create k randomly placed centroids
    clusters = [[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
                 for i in range(len(rows[0]))] for j in range(k)]
lastmatches = None
for t in range(100):
print 'Iteration %d' % t
bestmatches = [[] for i in range(k)]
        # find which centroid is closest to each row
for j in range(len(rows)):
row = rows[j]
bestmatch = 0
for i in range(k):
d = distance(clusters[i], row)
if d < distance(clusters[bestmatch], row):
bestmatch = i
bestmatches[bestmatch].append(j)
# if the results are the same as last time, this is complete
if bestmatches == lastmatches: break
lastmatches = bestmatches
        # move the centroids to the average position of their members
for i in range(k):
avgs = [0.0] * len(rows[0])
if len(bestmatches[i]) > 0:
for rowid in bestmatches[i]:
for m in range(len(rows[rowid])):
avgs[m] += rows[rowid][m]
for j in range(len(avgs)):
avgs[j] /= len(bestmatches[i])
clusters[i] = avgs
return bestmatches
#print 'k-means'
#kclust = kcluster(data, k=10)
#print [blognames[r] for r in kclust[0]]
##Clusters of Preferences
###Getting and Preparing the Data
###Beautiful Soup
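# Beautiful Soup parses (even messy) HTML into a tree that can be searched by tag
# and attribute. A minimal, hypothetical sketch (the helper name soupexample and the
# HTML string are made up; it only assumes the bs4 package is installed):
def soupexample():
    from bs4 import BeautifulSoup
    sample_html = '<td class="bgverdanasmall"><a href="#">guitar</a> <a href="#">camera</a></td>'
    soup = BeautifulSoup(sample_html, 'html.parser')
    # calling the soup object like soup('td') finds every <td> tag
    for td in soup('td'):
        # td.get('class') is a list of class names; a.contents[0] is the link text
        print td.get('class'), [a.contents[0] for a in td('a')]
#soupexample()   # prints the cell's class list and the two link texts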
###Scraping the Zebo Results
# downloadzebodata.py
def downloadzebodata():
from bs4 import BeautifulSoup
import urllib2
import re
chare = re.compile(r'[!-\.&]')
itemowners = {}
# words to remove
dropwords = ['a','new','some','more','my','own','the','many','other','another']
currentuser = 0
for i in range(1,51):
# url for the want search page
c = urllib2.urlopen(
'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d'%(i))
        soup = BeautifulSoup(c.read(), 'html.parser')
for td in soup('td'):
            # find the table cells with the bgverdanasmall class
            # (bs4 returns the class attribute as a list of class names)
            if 'bgverdanasmall' in td.get('class', []):
items = [re.sub(chare, '', a.contents[0].lower()).strip() for a in td('a')]
for item in items:
# remove extra words
txt = ' '.join([t for t in item.split(' ') if t not in dropwords])
if len(txt) < 2: continue
itemowners.setdefault(txt, {})
itemowners[txt][currentuser] = 1
currentuser += 1
out = file('zebo.txt', 'w')
out.write('Item')
for user in range(0, currentuser): out.write('\tU%d' % user)
out.write('\n')
for item, owners in itemowners.items():
if len(owners) > 10:
out.write(item)
            for user in range(0, currentuser):
if user in owners: out.write('\t1')
else: out.write('\t0')
out.write('\n')
###Defining a Distance Metric
# Tanimoto coefficient: the ratio of the size of the intersection to the size of the union of two sets.
def tanimoto(v1, v2):
c1, c2, shr = 0, 0, 0
for i in range(len(v1)):
if v1[i] != 0: c1 += 1 # in v1
if v2[i] != 0: c2 += 1 # in v2
if v1[i] != 0 and v2[i] != 0: shr += 1 # in both
return 1.0 - (float(shr)/(c1+c2-shr))
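# A quick worked example with made-up vectors: identical vectors give a distance
# of 0.0 and completely disjoint vectors give 1.0.
#print tanimoto([1, 1, 0, 1], [1, 0, 0, 1])   # c1=3, c2=2, shr=2 -> 1 - 2/(3+2-2) = 0.333...
#print tanimoto([1, 0, 1], [1, 0, 1])         # shr == c1 == c2 -> 0.0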
###Clustering Results
#wants, people, data = readfile('zebo.txt')
#clust = hcluster(data, distance=tanimoto)
#drawdendrogram(clust, wants)
##Viewing Data in Two Dimensions
# Multidimensional scaling (MDS): finds a two-dimensional layout of the dataset that tries to preserve the pairwise distances between the items.
def scaledown(data, distance=pearson, rate=0.01):
n = len(data)
    # the real distance between every pair of items
realdist = [[distance(data[i], data[j]) for j in range(n)] for i in range(0, n)]
outersum = 0.0
    # randomly initialize the starting location of each item in 2D space
loc = [[random.random(), random.random()] for i in range(n)]
fakedist = [[0.0 for j in range(n)] for i in range(n)]
lasterror = None
for m in range(0, 1000):
        # find the projected (current 2D) distances
for i in range(n):
for j in range(n):
fakedist[i][j] = sqrt(sum([pow(loc[i][x]-loc[j][x], 2) for x in range(len(loc[i]))]))
        # move the points
grad = [[0.0,0.0] for i in range(n)]
totalerror = 0
for k in range(n):
for j in range(n):
if j == k: continue
                # the error is the percentage difference between the current distance and the target distance
errorterm = (fakedist[j][k]-realdist[j][k])/realdist[j][k]
                # each point is moved toward or away from the other point in proportion to the error
grad[k][0] += ((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
grad[k][1] += ((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
                # keep track of the total error
totalerror += abs(errorterm)
print totalerror
        # stop if moving the points made things worse
if lasterror and lasterror < totalerror: break
lasterror = totalerror
        # move each point by rate times its gradient
for k in range(n):
loc[k][0] -= rate*grad[k][0]
loc[k][1] -= rate*grad[k][1]
return loc
def draw2d(data, labels, jpeg='mds2d.jpg'):
img = Image.new('RGB', (2000,2000), (255,255,255))
draw = ImageDraw.Draw(img)
for i in range(len(data)):
x = (data[i][0] + 0.5) * 1000
y = (data[i][1] + 0.5) * 1000
draw.text((x,y), labels[i], (0,0,0))
img.save(jpeg, 'JPEG')
#print 'Viewing the data in two dimensions'
#blognames, words, data = readfile('blogdata.txt')
#coords = scaledown(data)
#draw2d(coords, blognames, jpeg='blogs2d.jpg')