Visualizing Email Data (Week 6 & 7)

code segment

gword.py

"""Build gword.js, a word-cloud data file, from message subjects in index.sqlite.

The script counts the words that appear in the subject of every message,
keeps the 100 most frequent ones and writes them to ``gword.js`` together
with a font size that grows with the word's frequency.
"""

import sqlite3
import time
import zlib
import string

# One translation table that deletes ASCII punctuation and digits in one pass.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + '1234567890')


def count_words(texts, min_length=4):
    """Count the words found in an iterable of strings.

    Each text has punctuation and digits removed, is stripped and
    lower-cased, and is then split on whitespace. Words shorter than
    ``min_length`` characters are ignored.

    Returns a dict mapping word -> number of occurrences.
    """
    counts = dict()
    for text in texts:
        words = text.translate(_STRIP_TABLE).strip().lower().split()
        for word in words:
            if len(word) < min_length:
                continue
            counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts, limit=100):
    """Return up to ``limit`` words from ``counts``, most frequent first."""
    return sorted(counts, key=counts.get, reverse=True)[:limit]


def count_range(counts, words):
    """Return ``(highest, lowest)`` count among ``words``.

    Both values are ``None`` when ``words`` is empty.
    """
    highest = max((counts[word] for word in words), default=None)
    lowest = min((counts[word] for word in words), default=None)
    return highest, lowest


def scale_size(count, lowest, highest, bigsize=80, smallsize=20):
    """Map ``count`` linearly onto ``smallsize .. smallsize + bigsize``.

    When every word has the same count (``highest == lowest``) there is no
    spread to scale over; the original formula divided by zero in that
    case, so the smallest size is returned instead.
    """
    spread = highest - lowest
    if spread == 0:
        return int(smallsize)
    fraction = (count - lowest) / float(spread)
    return int((fraction * bigsize) + smallsize)


def format_gword_js(counts, words, bigsize=80, smallsize=20):
    """Return the JavaScript source assigning the word list to ``gword``."""
    highest, lowest = count_range(counts, words)
    entries = []
    for word in words:
        size = scale_size(counts[word], lowest, highest, bigsize, smallsize)
        entries.append("{text: '" + word + "', size: " + str(size) + "}")
    return "gword = [" + ",\n".join(entries) + "\n];\n"


def main():
    """Read subjects from index.sqlite and write the word cloud to gword.js."""
    conn = sqlite3.connect('index.sqlite')
    try:
        cur = conn.cursor()

        cur.execute('SELECT id, subject FROM Subjects')
        subjects = dict()
        for message_row in cur:
            subjects[message_row[0]] = message_row[1]

        # One row per message, so a subject is counted once for every
        # message that carries it.
        cur.execute('SELECT subject_id FROM Messages')
        counts = count_words(subjects[message_row[0]] for message_row in cur)
    finally:
        # Release the database even if a query or a lookup fails.
        conn.close()

    words = top_words(counts)
    highest, lowest = count_range(counts, words)
    print('Range of counts:', highest, lowest)

    # Spread the font sizes across 20-100 based on the count
    bigsize = 80
    smallsize = 20

    with open('gword.js', 'w') as fhand:
        fhand.write(format_gword_js(counts, words, bigsize, smallsize))

    print("Output written to gword.js")
    print("Open gword.htm in a browser to see the vizualization")


if __name__ == "__main__":
    main()

gline.py

"""Build gline.js, a per-month message count table, from index.sqlite.

The script finds the ten sender domains with the most messages and writes
``gline.js``: a header row naming those domains followed by one row per
month holding the number of messages each domain sent in that month.
"""

import sqlite3
import time
import zlib


def sender_domain(address):
    """Return the part of ``address`` after the ``@``.

    Returns ``None`` unless the address contains exactly one ``@``.
    """
    pieces = address.split("@")
    if len(pieces) != 2:
        return None
    return pieces[1]


def top_organizations(domains, limit=10):
    """Return up to ``limit`` distinct domains, most frequent first.

    ``domains`` is an iterable with one entry per message.
    """
    sendorgs = dict()
    for dns in domains:
        sendorgs[dns] = sendorgs.get(dns, 0) + 1
    return sorted(sendorgs, key=sendorgs.get, reverse=True)[:limit]


def count_by_month(records, orgs):
    """Count messages per ``(month, domain)`` for the domains in ``orgs``.

    ``records`` is an iterable of ``(sent_at, domain)`` pairs; the month is
    the first seven characters of ``sent_at``.

    Returns ``(counts, months)`` where ``counts`` maps ``(month, domain)``
    to a message count and ``months`` is the sorted list of months seen.
    """
    wanted = set(orgs)
    counts = dict()
    for sent_at, dns in records:
        if dns not in wanted:
            continue
        key = (sent_at[:7], dns)
        counts[key] = counts.get(key, 0) + 1
    months = sorted({month for month, _ in counts})
    return counts, months


def _js_string(text):
    """Return ``text`` as a single-quoted JavaScript string literal.

    Backslashes and single quotes are escaped so that text taken from
    e-mail data cannot terminate the literal early.
    """
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


def format_gline_js(orgs, months, counts):
    """Return the JavaScript source assigning the data table to ``gline``."""
    header = "gline = [ ['Month'"
    header += "".join("," + _js_string(org) for org in orgs)
    header += "]"

    rows = []
    for month in months:
        # A (month, org) pair with no messages is reported as 0.
        cells = "".join("," + str(counts.get((month, org), 0)) for org in orgs)
        rows.append(",\n[" + _js_string(month) + cells + "]")

    return header + "".join(rows) + "\n];\n"


def main():
    """Read senders and messages from index.sqlite and write gline.js."""
    conn = sqlite3.connect('index.sqlite')
    try:
        cur = conn.cursor()

        cur.execute('SELECT id, sender FROM Senders')
        senders = dict()
        for message_row in cur:
            senders[message_row[0]] = message_row[1]

        cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
        messages = dict()
        for message_row in cur:
            messages[message_row[0]] = (message_row[1], message_row[2],
                                        message_row[3], message_row[4])
    finally:
        # Release the database even if a query fails.
        conn.close()

    print("Loaded messages=", len(messages), "senders=", len(senders))

    # Resolve each message's sender domain once; both the ranking and the
    # monthly counts work from this list.
    records = []
    for message in messages.values():
        dns = sender_domain(senders[message[1]])
        if dns is None:
            continue
        records.append((message[3], dns))

    # pick the top schools
    orgs = top_organizations(dns for _, dns in records)
    print("Top 10 Organizations")
    print(orgs)

    counts, months = count_by_month(records, orgs)

    with open('gline.js', 'w') as fhand:
        fhand.write(format_gline_js(orgs, months, counts))

    print("Output written to gline.js")
    print("Open gline.htm to visualize the data")


if __name__ == "__main__":
    main()

最新文章

  1. BFC和haslayout
  2. AngularJS进阶学习
  3. html-----013----实体字符/HTML URL 编码
  4. fastjson反序列化
  5. php的多线程使用
  6. html onclick时间传字符串参数
  7. spring4笔记----依赖注入的两种形式
  8. Windows 查看端口占用情况
  9. Substring方法(C#,JS,Java,SQL)的区别
  10. maven 基础
  11. 33.scrapy采集网站表单数据
  12. WebDriver高级应用实例(2)
  13. WHAT I READ FOR DEEP-LEARNING
  14. ehcache入门基础示例
  15. dvwa 源码分析(三) --- config.inc.php分析
  16. 【刷水-二分答案】BZOJ1650 & BZOJ1639
  17. 20145331 《Java程序设计》第10周学习总结
  18. Shiro配置cookie以及共享Session和Session失效问题
  19. P4编程环境搭建
  20. 导入导出SQL数据库

热门文章

  1. Java核心技术--接口与内部类
  2. LeetCode466. Count The Repetitions
  3. 5、flink常见函数使用及自定义转换函数
  4. JS - Promise使用详解
  5. python学习07列表
  6. 【Linux常见命令】find命令
  7. 《JavaScript和jQuery实战手册(原书第2版)》——2.1节语句
  8. 第七周CorelDRAW课总结
  9. 原生JS设计轮播图
  10. 一张图告诉你UML图怎么画❀