一、请知晓

 本文是基于:

  Event Recommendation Engine Challenge分步解析第一步

  Event Recommendation Engine Challenge分步解析第二步

 需要读者先阅读前两篇文章解析

二、用户社交关系信息处理

 这一步需要user_friends.csv.gz文件,我们先来看看文件内容:

import pandas as pd
# Read the gzip-compressed friends table, then preview its first rows.
friends_path = 'user_friends.csv.gz'
df_user_friends = pd.read_csv(friends_path, compression='gzip')
df_user_friends.head()

 代码示例结果(可以发现该文件记录了每个用户的所有朋友信息):

 1)变量解释

  nusers:train.csv和test.csv文件涉及的所有用户数目,3391

  self.numFriends:一维向量,每个元素记录了(3391个)每个用户的朋友数目,然后除以总的朋友数(sumNumFriends),得到每个用户的朋友占比

import scipy.io as sio
import pandas as pd
# Load the Matrix Market file 'UF_numFriends' and display it as a table.
numFriends = sio.mmread('UF_numFriends')
pd.DataFrame(data=numFriends)

  代码示例结果:

  self.userFriends:稀疏矩阵,shape为(3391,3391),记录每个用户与其朋友的score矩阵,最后归一化

import scipy.io as sio
import pandas as pd
# Load the sparse matrix stored in 'UF_userFriends', densify it, and
# display it as a table.
userFriendsSparse = sio.mmread('UF_userFriends')
userFriends = userFriendsSparse.todense()
pd.DataFrame(data=userFriends)

  代码示例结果:

  userEventScores:第一步得到的矩阵,记录每个用户对每个event的兴趣分(1,0,or -1);代码中用临时变量eventsForUser取出某个用户的某个朋友对应的那一行

  sumNumFriends:每个用户的朋友数相加之和

 2)记录对user_friends.csv.gz文件操作

  逐行读取user_friends.csv.gz文件

    如果用户在第一步userIndex中,获取该用户的朋友数目,并保存在self.numFriends中

      对于该用户每一个朋友,只操作存在于第一步userIndex中的朋友

        获得该朋友的Index,利用该index去操作第一步中的userEventScores,这个userEventScores记录了每个用户对每个event的兴趣分(1,0,or -1),这样我们就获得了该用户的该朋友对每个event的兴趣分,

        并求得该用户的该朋友的平均兴趣分(对每个event的兴趣分之和除以总的event数13418)

 3)有了上面对user_friends.csv.gz文件操作说明,我们来看看完整代码

from collections import defaultdict
import datetime
import gzip
import itertools
import locale
# import cPickle
# From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle

import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd
from sklearn.preprocessing import normalize

# Handle the user/event association data
class ProgramEntities:
    """Collect the users and events that appear in train.csv / test.csv.

    Only users and events present in those two files are processed. The
    original author's notes report 3391 users and 13418 events in total.

    Attributes set by ``__init__``:
        userIndex: dict mapping user id (str) -> row index.
        eventIndex: dict mapping event id (str) -> column index.
        userEventScores: DOK sparse matrix (users x events) holding
            ``int(cols[4]) - int(cols[5])`` for every train.csv row
            (described upstream as an interest score of 1, 0 or -1).
        uniqueUserPairs: set of user-id pairs that share an event.
        uniqueEventPairs: set of event-id pairs that share a user.

    Side effects: writes ``PE_userEventScores`` (Matrix Market format),
    ``PE_userIndex.pkl`` and ``PE_eventIndex.pkl`` to the working directory.
    """

    def __init__(self):
        uniqueUsers = set()               # every user id seen in train/test
        uniqueEvents = set()              # every event id seen in train/test
        eventsForUser = defaultdict(set)  # user id -> event ids on its rows
        usersForEvent = defaultdict(set)  # event id -> user ids on its rows
        for filename in ['train.csv', 'test.csv']:
            with open(filename) as f:
                f.readline()  # skip the header row
                for line in f:
                    cols = line.strip().split(',')
                    uniqueUsers.add(cols[0])
                    uniqueEvents.add(cols[1])
                    eventsForUser[cols[0]].add(cols[1])
                    usersForEvent[cols[1]].add(cols[0])

        self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
        # Index from *sorted* ids: iterating a raw set of strings yields a
        # different order on every interpreter run (hash randomisation), which
        # would silently misalign matrices saved by separate runs.
        self.userIndex = {u: i for i, u in enumerate(sorted(uniqueUsers))}
        self.eventIndex = {e: i for i, e in enumerate(sorted(uniqueEvents))}

        with open('train.csv') as ftrain:
            ftrain.readline()  # skip the header row
            for line in ftrain:
                cols = line.strip().split(',')
                i = self.userIndex[cols[0]]
                j = self.eventIndex[cols[1]]
                self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
        sio.mmwrite('PE_userEventScores', self.userEventScores)

        # To avoid unnecessary work later, collect only the "associated"
        # pairs: user pairs that acted on the same event, and event pairs
        # that the same user acted on.
        # NOTE(review): the `> 2` threshold skips groups of exactly two, even
        # though two members already form one pair. Kept as in the original;
        # confirm whether `>= 2` was intended.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update(itertools.combinations(users, 2))
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update(itertools.combinations(events, 2))

        # Persist the id -> index maps; `with` guarantees the files are
        # flushed and closed.
        with open('PE_userIndex.pkl', 'wb') as fh:
            cPickle.dump(self.userIndex, fh)
        with open('PE_eventIndex.pkl', 'wb') as fh:
            cPickle.dump(self.eventIndex, fh)


# Data-cleaning class
class DataCleaner:
    """Convert raw string fields of users.csv into numeric ids.

    Every lookup map is a ``defaultdict(int)``, and every getter returns 0
    for a value it does not recognise.
    """

    def __init__(self):
        # Locale name -> 1-based id, taken from the stdlib locale alias table.
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1

        # Country name (lower-cased) -> 1-based id.
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # Match on the ISO alpha-2 code: the previous name test
            # (`== 'usa'`) never matched a pycountry name, so US states were
            # never registered.
            if c.alpha_2 in ('US', 'CA'):
                ctryIdx[c.alpha_2] = i
        # US/CA subdivisions (states, provinces) resolve to their country id.
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1

        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Return the id of locale *locstr* (case-insensitive), or 0 if unknown."""
        # .get() avoids inserting every unknown key into the map.
        return self.localeIdMap.get(locstr.lower(), 0)

    def getBirthYearInt(self, birthYear):
        """Return *birthYear* as an int, or 0 for 'None' / unparsable input."""
        if birthYear == 'None':
            return 0
        try:
            return int(birthYear)
        except (TypeError, ValueError):
            return 0

    def getGenderId(self, genderStr):
        """Return 1 for 'male', 2 for 'female', 0 otherwise."""
        return self.genderIdMap.get(genderStr, 0)

    def getJoinedYearMonth(self, dateString):
        """Return the join date as a 'YYYYMM' string.

        *dateString* must match ``%Y-%m-%dT%H:%M:%S.%fZ``; otherwise
        ``datetime.strptime`` raises ``ValueError``. The month is zero-padded
        so that every value has six digits (previously January 2012 became
        '20121' while October 2012 became '201210').
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return dttm.strftime("%Y%m")

    def getCountryId(self, location):
        """Return the country id for the trailing part of *location*, or 0.

        The text after the last separator is looked up in ``countryIdMap``.
        """
        # Separator width and skip offset must agree: the previous code
        # searched for one character but skipped two, dropping the first
        # letter of the country.
        # NOTE(review): assumes users.csv separates city and country with two
        # spaces ("City  Country") -- confirm against the data file.
        sep = '  '
        if isinstance(location, str) and location.strip():
            _, found, country = location.rpartition(sep)
            if found:
                return self.countryIdMap.get(country.strip().lower(), 0)
        return 0

    def getTimezoneInt(self, timezone):
        """Return *timezone* as an int, or 0 when it cannot be parsed."""
        try:
            return int(timezone)
        except (TypeError, ValueError):
            return 0


# User/user similarity matrix
class Users:
    """Build the user feature matrix and the user/user matrix.

    Args:
        programEntities: object exposing ``userIndex`` (user id -> index) and
            ``uniqueUserPairs`` (iterable of user-id pairs).
        sim: callable taking two 1-D feature vectors and returning a number.
            Defaults to ``scipy.spatial.distance.correlation``.

    Attributes:
        userMatrix: (nusers x feature-count) matrix, L1-normalised per column.
        userSimMatrix: (nusers x nusers) DOK matrix of ``sim`` values.

    Side effects: reads ``users.csv``; writes ``US_userMatrix`` and
    ``US_userSimMatrix`` (Matrix Market format).

    NOTE(review): scipy documents ``correlation`` as a *distance*
    (1 - Pearson correlation), yet the diagonal below is seeded with 1.0 as
    if the matrix held similarities. Confirm which is intended.
    """

    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex)
        with open('users.csv') as fin:
            colnames = fin.readline().strip().split(',')
            # One column per header field except the leading user id.
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
            for line in fin:
                cols = line.strip().split(',')
                # userIndex covers users from both train.csv and test.csv
                # (an upstream comment said "train only"; the code says
                # otherwise).
                if cols[0] in programEntities.userIndex:
                    i = programEntities.userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])         # locale
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])     # birthyear, 0 if missing
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])         # gender
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])  # joinedAt
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])        # location
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])      # timezone

        # Normalise every feature column to unit L1 norm.
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)

        # User/user matrix, used by later steps.
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # Pass flat 1-D vectors: scipy's distance functions reject
                # 2-D (1 x n) input on current releases.
                row_i = self.userMatrix.getrow(i).toarray().ravel()
                row_j = self.userMatrix.getrow(j).toarray().ravel()
                usim = sim(row_i, row_j)
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)


# User social-relationship mining
class UserFriends:
    """Derive friend-based features for every known user.

    The idea is simple:
      1) a user with more friends may be more outgoing and therefore more
         likely to attend events;
      2) if a friend attends an event, the user may tag along.

    Args:
        programEntities: object exposing ``userIndex`` (user id -> index) and
            ``userEventScores`` (users x events sparse score matrix).

    Attributes:
        numFriends: 1-D array; each user's friend count divided by the sum of
            all counts.
        userFriends: (nusers x nusers) matrix of friend scores, L1-normalised
            per column.

    Side effects: reads ``user_friends.csv.gz``; writes ``UF_numFriends`` and
    ``UF_userFriends`` (Matrix Market format); prints progress.
    """

    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex)
        self.numFriends = np.zeros(nusers)  # friend count per user
        self.userFriends = ss.dok_matrix((nusers, nusers))

        # A friend's score is its mean entry over all events in
        # userEventScores. Compute it once per user instead of densifying a
        # row every time that friend shows up in somebody's list.
        scores = programEntities.userEventScores
        meanScores = np.asarray(scores.tocsr().sum(axis=1)).ravel() / scores.shape[1]

        with gzip.open('user_friends.csv.gz') as fin:
            print('Header In User_friends.csv.gz:', fin.readline())
            # For each row whose user is in userIndex: record the friend
            # count, then add the mean score of every friend that is also in
            # userIndex to both userFriends[i, j] and userFriends[j, i].
            for ln, line in enumerate(fin):
                if ln % 200 == 0:
                    print('Loading line:', ln)
                cols = line.decode().strip().split(',')
                user = cols[0]
                if user not in programEntities.userIndex:
                    continue
                # split() without an argument yields [] for an empty field;
                # split(' ') returned [''] and counted one phantom friend.
                friends = cols[1].split() if len(cols) > 1 else []
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        # The score is meant to capture how strongly, and in
                        # which direction, this friend may influence the user.
                        score = meanScores[j]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score

        # Turn friend counts into shares of the overall total.
        sumNumFriends = self.numFriends.sum(axis=0)
        print(sumNumFriends)
        if sumNumFriends > 0:  # avoid 0/0 -> NaN when no user matched
            self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', self.numFriends.reshape(1, -1))
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)


# Driver script: step-1 banner (it shared a source line with the end of the class).
print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics.
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2: user similarity. The call is disabled here, so only the banners
# are printed.
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
#Users(pe)
print('第2步完成...\n')

# Step 3: user social-relationship features.
print('第3步:计算用户社交关系信息,并存储...')
UserFriends(pe)
print('第3步完成...\n')

 至此,第三步完成,哪里有不明白的请留言

 我们继续看Event Recommendation Engine Challenge分步解析第四步

最新文章

  1. 如何获取安卓系统自带应用的package和activity
  2. 探索性思维——How to Solve It
  3. 灰色预测原理及JAVA实现
  4. 很少再用left join
  5. Atitit 图像处理之仿油画效果 Oilpaint油画滤镜 水彩画 漫画滤镜 v2
  6. Linux perf tools
  7. Silverlight通过Wcf Data Service访问数据库之ADO.NET Entity Framework篇
  8. C#计算程序执行速度
  9. 在 Mac OS X 中建立加密的 Zip 压缩 -- 让机密资料加上密码
  10. Lost Cows(BIT poj2182)
  11. 剑指offier第10题
  12. cocos2D(八)---- CCMenu && CCMenuItem
  13. 详细,Qt Creator快捷键大全,附快捷键配置方法
  14. linux server 常见参数修改
  15. 基于CAS在.NET中实现SSO单点登录
  16. 1064 Financial Management
  17. 洛谷 P1032 子串变换
  18. 007_ip统计及攻击ip分析
  19. spring依赖注入之手工装配
  20. 复制web项目时注意修改web项目名

热门文章

  1. SQL中使用循环结构
  2. thinkphp 5内置验证规则-基本版
  3. 【NOI2002】
  4. Docker 私有仓库 Harbor registry 安全认证搭建 [Https]
  5. 分数规划模板(洛谷P4377 [USACO18OPEN]Talent Show)(分数规划,二分答案,背包)
  6. Codeforces | CF1029F 【Multicolored Markers】
  7. iptables(3)
  8. 【转】WEB服务器与应用服务器的区别
  9. Bomb HDU - 5934 (Tarjan)
  10. centos7/centos6修改系统默认语言