# Data clustering: a method for discovering groups of closely related things, people, or ideas, and for visualizing them.
##Supervised versus Unsupervised Learning
# Supervised learning: techniques that use example inputs and desired outputs to learn how to make predictions. Examples: neural networks, decision trees, support vector machines, and Bayesian filtering.
# Unsupervised learning: not trained with example data that carries correct answers. Its goal is to find structure within a set of data, where the data itself is not the answer being sought. Examples: clustering, non-negative matrix factorization, and self-organizing maps.
##Word Vectors
# A common way to prepare data for clustering is to define a shared set of numerical attributes that can be used to compare the items.
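# For instance, a minimal sketch of the kind of data this produces (the blog names
# and counts below are made up; the real matrix is written to blogdata.txt by
# generatefeedvector() further down):
example_wordcounts = {
    'Blog A': {'python': 12, 'cluster': 3, 'kernel': 0},
    'Blog B': {'python': 2, 'cluster': 0, 'kernel': 7},
}
example_wordlist = ['python', 'cluster', 'kernel']
# each blog becomes a numeric vector over the shared word list,
# so any two blogs can be compared attribute by attribute
example_vectors = [[example_wordcounts[blog].get(w, 0) for w in example_wordlist]
                   for blog in ('Blog A', 'Blog B')]
#print example_vectors   # [[12, 3, 0], [2, 0, 7]]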
###Pigeonholing the Bloggers
# Cluster blogs according to the frequencies of the words they use.
###Counting the Words in a Feed
# generatefeedvector.py
import feedparser
import re
# returns title and dictionary of word counts for an RSS feed
def getwordcounts(url):
# parse the feed
d = feedparser.parse(url)
wc = {}
# loop over all the entries
for e in d.entries:
summary = e.summary if 'summary' in e else\
e.description
# extract a list of words
words = getwords(e.title+' '+summary)
for word in words:
wc.setdefault(word, 0)
wc[word] += 1
return d.feed.title, wc
def getwords(html):
# remove all the HTML tags
txt = re.compile(r'<[^>]+>').sub('', html)
# split words by all non-alpha characters
words = re.compile(r'[^A-Z^a-z]+').split(txt)
# convert to lowercase
return [word.lower() for word in words if word != '']
def generatefeedvector():
    '''Loop over every feed in feedlist.txt and generate the dataset.'''
    # per-blog word counts, plus the number of blogs each word appears in
apcount = {}
wordcounts = {}
feedlist = [line for line in file('feedlist.txt')]
for feedurl in feedlist:
title, wc = getwordcounts(feedurl)
wordcounts[title] = wc
for word, count in wc.items():
apcount.setdefault(word, 0)
if count > 1:
apcount[word] += 1
    # Build the word list that will actually be used for the per-blog counts:
    # keep only words that appear in between 10% and 50% of the blogs
wordlist = []
for w, bc in apcount.items():
frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5: wordlist.append(w)
out = file('blogdata.txt', 'w')
out.write('Blog')
for word in wordlist: out.write('\t%s' % word)
out.write('\n')
for blog, wc in wordcounts.items():
out.write(blog)
for word in wordlist:
if word in wc: out.write('\t%d' % wc[word])
else: out.write('\t0')
out.write('\n')
##Hierarchical Clustering
# Builds a hierarchy of groups by repeatedly merging the two most similar groups; every group starts out as a single item.
# A dendrogram is one way to visualize the result of hierarchical clustering.
# clusters.py
def readfile(filename):
lines = [line for line in file(filename)]
# first line is column titles
colnames = lines[0].strip().split('\t')[1:]
rownames = []
data = []
for line in lines[1:]:
p = line.strip().split('\t')
# first column in each row is the rowname
rownames.append(p[0])
# the data for this row is the remainder of the row
data.append([float(x) for x in p[1:]])
return rownames, colnames, data
from math import sqrt
def pearson(v1, v2):
# simple sums
sum1 = sum(v1)
sum2 = sum(v2)
# sums of the squares
sum1Sq = sum([pow(v,2) for v in v1])
sum2Sq = sum([pow(v,2) for v in v2])
# sum of the products
pSum = sum([v1[i]*v2[i] for i in range(len(v1))])
# calculate r (Pearson score)
num = pSum - (sum1*sum2/len(v1))
den = sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
if den == 0: return 0
    return 1.0 - num/den  # 1 - r, so that smaller values mean more similar
class bicluster:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
self.left = left
self.right = right
self.vec = vec
self.id = id
self.distance = distance
def hcluster(rows, distance=pearson):
distances = {}
currentclustid = -1
# clusters are initially just the rows
clust = [bicluster(rows[i],id=i) for i in range(len(rows))]
while len(clust)>1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
# loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i+1, len(clust)):
# distances is the cache of distance calculations
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
closest = d
lowestpair = (i, j)
# calculate the average of the two clusters
mergevec = [
(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
# create the new cluster
newcluster = bicluster(mergevec,
left=clust[lowestpair[0]],
right=clust[lowestpair[1]],
distance=closest,
id=currentclustid)
# cluster ids that weren't in the original set are negative
currentclustid -= 1
del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]
#print 'Running hcluster'
#blognames, words, data = readfile('blogdata.txt')
#clust = hcluster(data)
def printclust(clust, labels=None, n=0):
# indent to make a hierarchy layout
print ' '*n,
if clust.id < 0:
        # a negative id means this is a branch
print '-'
else:
# positive id means that this is an endpoint
print clust.id if labels==None else labels[clust.id]
# now print the right and left branches
if clust.left != None: printclust(clust.left, labels=labels, n=n+1)
if clust.right!= None: printclust(clust.right, labels=labels, n=n+1)
#print 'Printing the cluster tree'
#printclust(clust, labels=blognames)
##Drawing the Dendrogram
from PIL import Image, ImageDraw
def getheight(clust):
# is this an endpoint? then the height is just 1
if clust.left == None and clust.right == None: return 1
    # otherwise the height is the sum of the heights of
    # each branch
return getheight(clust.left) + getheight(clust.right)
def getdepth(clust):
# the distance of an endpoint is 0.0
if clust.left == None and clust.right == None: return 0
# the distance of branch is the greater of its two sides
# plus its own distance
return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
def drawdendrogram(clust, labels, jpeg='clusters.jpg'):
# height and width
h = getheight(clust) * 20
w = 1200
depth = getdepth(clust)
# width is fixed, so scale distances accordingly
scaling = float(w-150)/depth
# create a new image with a white background
img = Image.new('RGB', (w,h), (255,255,255))
draw = ImageDraw.Draw(img)
draw.line((0, h/2, 10, h/2), fill=(255,0,0))
# draw the first node
drawnode(draw, clust, 10, (h/2), scaling, labels)
img.save(jpeg, 'JPEG')
def drawnode(draw, clust, x, y, scaling, labels):
if clust.id < 0:
h1 = getheight(clust.left)*20
h2 = getheight(clust.right)*20
top = y - (h1 + h2)/2
bottom = y + (h1 + h2)/2
# line length
ll = clust.distance * scaling
# vertical line from this cluster to children
draw.line((x, top+h1/2, x, bottom-h2/2), fill=(255,0,0))
# horizontal line to left item
draw.line((x, top+h1/2, x+ll, top+h1/2), fill=(255,0,0))
# horizontal line to right item
draw.line((x, bottom-h2/2, x+ll, bottom-h2/2), fill=(255,0,0))
# call the function to draw the left and right nodes
drawnode(draw, clust.left, x+ll, top+h1/2, scaling, labels)
drawnode(draw, clust.right, x+ll, bottom-h2/2, scaling, labels)
else:
# if this is an endpoint, draw the item label
draw.text((x+5,y-7), labels[clust.id], (0,0,0))
#print 'Generating the dendrogram image'
#drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
##Column Clustering
def rotatematrix(data):
    '''Rotate the matrix so that each row gives the counts of one word across every blog.'''
newdata = []
for i in range(len(data[0])):
newrow = [data[j][i] for j in range(len(data))]
newdata.append(newrow)
return newdata
#print 'Column clustering -- drawing the dendrogram'
#rdata = rotatematrix(data)
#wordclust = hcluster(rdata)
#drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
# An important point about clustering: when there are many more items than variables, the likelihood of producing meaningless clusters increases.
# Word clustering reveals groups of words that bloggers tend to use together when discussing online services or Internet-related topics.
# It can also surface clusters that reflect usage patterns (e.g. fact, us, say, very, and think), which suggest an opinionated writing style.
# Drawbacks of hierarchical clustering:
# 1. Without additional work, the tree view does not actually break the data into distinct groups.
# 2. The algorithm is computationally expensive: comparing every pair of clusters is already O(n**2) per merge step, and the merging repeats roughly n times (the distance cache only avoids recomputing scores).
##K-Means Clustering
# K-means clustering: you tell the algorithm in advance how many clusters you want, and it determines the size of each cluster from the structure of the data.
# The algorithm first places k centroids at random (points in space that represent the centers of the clusters),
# then assigns every item to the nearest centroid.
# After the assignment, each centroid is moved to the average position of all the items assigned to it,
# and the whole assignment process starts over, repeating until the assignments stop changing.
import random
def kcluster(rows, distance=pearson, k=4):
    # determine the minimum and maximum values for each column
ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows]))
for i in range(len(rows[0]))]
# create k randomly placed centroids
    clusters = [[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
                 for i in range(len(rows[0]))] for j in range(k)]
lastmatches = None
for t in range(100):
print 'Iteration %d' % t
bestmatches = [[] for i in range(k)]
        # find which centroid is closest to each row
for j in range(len(rows)):
row = rows[j]
bestmatch = 0
for i in range(k):
d = distance(clusters[i], row)
if d < distance(clusters[bestmatch], row):
bestmatch = i
bestmatches[bestmatch].append(j)
# if the results are the same as last time, this is complete
if bestmatches == lastmatches: break
lastmatches = bestmatches
        # move the centroids to the average position of their members
for i in range(k):
avgs = [0.0] * len(rows[0])
if len(bestmatches[i]) > 0:
for rowid in bestmatches[i]:
for m in range(len(rows[rowid])):
avgs[m] += rows[rowid][m]
for j in range(len(avgs)):
avgs[j] /= len(bestmatches[i])
clusters[i] = avgs
return bestmatches
#print 'k-means'
#kclust = kcluster(data, k=10)
#print [blognames[r] for r in kclust[0]]
##Clusters of Preferences
###Getting and Preparing the Data
###Beautiful Soup
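# Beautiful Soup parses (even messy) HTML into a tree that can be searched by tag
# and attribute. A minimal, hypothetical sketch (the helper name soupexample and the
# HTML string are made up; it only assumes the bs4 package is installed):
def soupexample():
    from bs4 import BeautifulSoup
    sample_html = '<td class="bgverdanasmall"><a href="#">guitar</a> <a href="#">camera</a></td>'
    soup = BeautifulSoup(sample_html, 'html.parser')
    # calling the soup object like soup('td') finds every <td> tag
    for td in soup('td'):
        # td.get('class') is a list of class names; a.contents[0] is the link text
        print td.get('class'), [a.contents[0] for a in td('a')]
#soupexample()   # prints the cell's class list and the two link texts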
###Scraping the Zebo Results
# downloadzebodata.py
def downloadzebodata():
from bs4 import BeautifulSoup
import urllib2
import re
chare = re.compile(r'[!-\.&]')
itemowners = {}
# words to remove
dropwords = ['a','new','some','more','my','own','the','many','other','another']
currentuser = 0
for i in range(1,51):
# url for the want search page
c = urllib2.urlopen(
'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d'%(i))
        soup = BeautifulSoup(c.read(), 'html.parser')
for td in soup('td'):
            # find the table cells with the bgverdanasmall class
            # (bs4 returns the class attribute as a list of class names)
            if 'bgverdanasmall' in td.get('class', []):
items = [re.sub(chare, '', a.contents[0].lower()).strip() for a in td('a')]
for item in items:
# remove extra words
txt = ' '.join([t for t in item.split(' ') if t not in dropwords])
if len(txt) < 2: continue
itemowners.setdefault(txt, {})
itemowners[txt][currentuser] = 1
currentuser += 1
out = file('zebo.txt', 'w')
out.write('Item')
for user in range(0, currentuser): out.write('\tU%d' % user)
out.write('\n')
for item, owners in itemowners.items():
if len(owners) > 10:
out.write(item)
            for user in range(0, currentuser):
if user in owners: out.write('\t1')
else: out.write('\t0')
out.write('\n')
###Defining a Distance Metric
# Tanimoto coefficient: the ratio of the size of the intersection to the size of the union of two sets.
def tanimoto(v1, v2):
c1, c2, shr = 0, 0, 0
for i in range(len(v1)):
if v1[i] != 0: c1 += 1 # in v1
if v2[i] != 0: c2 += 1 # in v2
if v1[i] != 0 and v2[i] != 0: shr += 1 # in both
return 1.0 - (float(shr)/(c1+c2-shr))
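# A quick worked example with made-up vectors: identical vectors give a distance
# of 0.0 and completely disjoint vectors give 1.0.
#print tanimoto([1, 1, 0, 1], [1, 0, 0, 1])   # c1=3, c2=2, shr=2 -> 1 - 2/(3+2-2) = 0.333...
#print tanimoto([1, 0, 1], [1, 0, 1])         # shr == c1 == c2 -> 0.0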
###Clustering Results
#wants, people, data = readfile('zebo.txt')
#clust = hcluster(data, distance=tanimoto)
#drawdendrogram(clust, wants)
##Viewing Data in Two Dimensions
# Multidimensional scaling (MDS): finds a two-dimensional layout of the dataset that tries to preserve the pairwise distances between the items.
def scaledown(data, distance=pearson, rate=0.01):
n = len(data)
    # the real distance between every pair of items
realdist = [[distance(data[i], data[j]) for j in range(n)] for i in range(0, n)]
outersum = 0.0
    # randomly initialize the starting location of each item in 2D space
loc = [[random.random(), random.random()] for i in range(n)]
fakedist = [[0.0 for j in range(n)] for i in range(n)]
lasterror = None
for m in range(0, 1000):
        # find the projected (current 2D) distances
for i in range(n):
for j in range(n):
fakedist[i][j] = sqrt(sum([pow(loc[i][x]-loc[j][x], 2) for x in range(len(loc[i]))]))
        # move the points
grad = [[0.0,0.0] for i in range(n)]
totalerror = 0
for k in range(n):
for j in range(n):
if j == k: continue
                # the error is the percentage difference between the current distance and the target distance
errorterm = (fakedist[j][k]-realdist[j][k])/realdist[j][k]
                # each point is moved toward or away from the other point in proportion to the error
grad[k][0] += ((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
grad[k][1] += ((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
                # keep track of the total error
totalerror += abs(errorterm)
print totalerror
        # stop if moving the points made things worse
if lasterror and lasterror < totalerror: break
lasterror = totalerror
        # move each point by rate times its gradient
for k in range(n):
loc[k][0] -= rate*grad[k][0]
loc[k][1] -= rate*grad[k][1]
return loc
def draw2d(data, labels, jpeg='mds2d.jpg'):
img = Image.new('RGB', (2000,2000), (255,255,255))
draw = ImageDraw.Draw(img)
for i in range(len(data)):
x = (data[i][0] + 0.5) * 1000
y = (data[i][1] + 0.5) * 1000
draw.text((x,y), labels[i], (0,0,0))
img.save(jpeg, 'JPEG')
#print 'Viewing the data in two dimensions'
#blognames, words, data = readfile('blogdata.txt')
#coords = scaledown(data)
#draw2d(coords, blognames, jpeg='blogs2d.jpg')