def createC1(dataSet): C1 = [] #C1是大小为1的所有候选项集的集合 fortransactionin dataSet: foritemintransaction: ifnot [item] in C1: C1.append([item]) C1.sort() return map(frozenset, C1)#use frozen set so we can use it as a key in a dict
根据构建出来的频繁项集,选出满足我们需要的大于我们给定的支持度的项集
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#D表示数据集,CK表示候选项集,minSupport表示最小的支持度,自己设定 def scanD(D, Ck, minSupport): ssCnt = {} for tid inD: for can inCk: if can.issubset(tid): if not ssCnt.has_key(can): ssCnt[can]=1 else: ssCnt[can] += 1 numItems = float(len(D)) retList = [] #存储满足最小支持度要求的项集 supportData = {} #每个项集的支持度字典 for key in ssCnt: #计算所有项集的支持度 support = ssCnt[key]/numItems if support >= minSupport: retList.insert(0,key) supportData[key] = support return retList, supportData
3.1 频繁项集
关于频繁项集的产生,我们单独的抽取出来 首先需要一个生成合并项集的函数,将两个子集合并的函数
1 2 3 4 5 6 7 8 9 10 11
#LK是频繁项集列表,K表示接下来合并的项集中的单个想的个数{1,2,3}表示k=3 defaprioriGen(Lk, k): #creates Ck retList = [] lenLk = len(Lk) for i in range(lenLk): for j in range(i+1, lenLk): L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] #前k-2个项相同时,将两个集合合并 L1.sort(); L2.sort() if L1==L2:#if first k-2 elements are equal retList.append(Lk[i] | Lk[j]) #set union return retList
接着定义生成频繁项集的函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#只需要输入数据集和支持度即可 def apriori(dataSet, minSupport = 0.5): C1 = createC1(dataSet) D = map(set, dataSet) L1, supportData = scanD(D, C1, minSupport) L = [L1] k = 2 while (len(L[k-2]) > 0): Ck = aprioriGen(L[k-2], k) Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk supportData.update(supK) L.append(Lk) k += 1 return L, supportData#返回频繁项集和每个项集的支持度值
#输入的参数分别为:频繁项集、支持度数据字典、自定义的最小支持度,返回的是可信度规则列表 def generateRules(L, supportData, minConf=0.7): #支持度是通过scanD得到的字典 bigRuleList = [] for i in range(1, len(L)):#只去频繁项集中元素个数大于2的子集,如{1,2}{1,2,3},不取{2}{3},etc... for freqSet in L[i]: H1 = [frozenset([item]) for item in freqSet] if (i > 1): rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) else: calcConf(freqSet, H1, supportData, bigRuleList, minConf) return bigRuleList