自己做站点的时候,都要看看收录和关键词排名什么的,所以打造了这个批量关键词查询工具。

#encoding:utf-8
from __future__ import print_function

import io
import random
import re
import socket
import sys
import time
import urllib

try:
    import StringIO  # Python 2 only module
except ImportError:
    StringIO = None
try:
    from urllib import quote  # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

try:
    import pycurl
except ImportError:
    # Fetching pages will fail with a clear error instead of at import time.
    pycurl = None

from bs4 import BeautifulSoup
# Points credited to a host for appearing at each organic result position;
# keys are the result positions 1-10, values are the points.
score = {
    1: 28.56,
    2: 19.23,
    3: 10.20,
    4: 8.14,
    5: 7.50,
    6: 5.72,
    7: 4.01,
    8: 4.41,
    9: 5.53,
    10: 6.70,
}
# Next: root-domain extraction (Baidu products keep their sub-domain).
# Second-level suffixes under which the root domain consists of three labels.
_THREE_LABEL_SUFFIXES = ('.com.cn', '.org.cn', '.net.cn', '.gov.cn')

# Browser identities; one is picked at random for every curl() call.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 5.1; rv:37.0) Gecko/20100101 Firefox/37.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
)

# Substring whose presence in a result page triggers the wait-and-retry loop.
_VERIFY_MARKER = b'http://verify.baidu.com'

# First run of host-name characters inside the displayed result URL.
_HOST_RE = re.compile(r'([a-zA-Z0-9\.\-]+)')


def root_domain(url):
    """Return the root domain of *url*.

    Anything containing ``baidu.com`` is returned unchanged so that every
    Baidu product keeps its own sub-domain.  For other inputs the scheme,
    credentials, port, path, query and fragment are dropped and the last two
    labels are returned (three for ``.com.cn``/``.org.cn``/``.net.cn``/
    ``.gov.cn`` hosts).  ``'-'`` is returned when no domain can be extracted.
    """
    if 'baidu.com' in url:
        return url

    host = url.strip()
    if '://' in host:
        host = host.split('://', 1)[1]
    # Cut at the first path, query or fragment delimiter.
    for separator in '/?#':
        host = host.split(separator, 1)[0]
    host = host.rsplit('@', 1)[-1]   # drop "user:password@"
    host = host.split(':', 1)[0]     # drop ":port"
    host = host.strip('.')

    labels = host.split('.')
    wanted = 3 if host.lower().endswith(_THREE_LABEL_SUFFIXES) else 2
    if len(labels) < wanted or not all(labels):
        return '-'
    return '.'.join(labels[-wanted:])


def curl(url, debug=False, **kwargs):
    """Fetch *url* with pycurl and return the raw response body.

    Args:
        url: address to download; it is also sent as the referer.
        debug: when true, the first transfer error is re-raised instead of
            being retried.
        **kwargs: extra pycurl options keyed by the option's constant name
            (for example ``TIMEOUT=30``); they override the defaults below.

    Returns:
        The response body exactly as written by pycurl.

    Raises:
        ImportError: pycurl is not installed.
        AttributeError: a keyword argument is not a pycurl option name.
        pycurl.error: the transfer failed and *debug* is true.
    """
    if pycurl is None:
        raise ImportError('pycurl is required to fetch pages')

    user_agent = random.choice(_USER_AGENTS)
    while True:
        body = io.BytesIO()
        handle = pycurl.Curl()
        try:
            handle.setopt(pycurl.URL, url)
            handle.setopt(pycurl.REFERER, url)
            handle.setopt(pycurl.FOLLOWLOCATION, True)
            handle.setopt(pycurl.TIMEOUT, 60)
            handle.setopt(pycurl.ENCODING, 'gzip')
            handle.setopt(pycurl.USERAGENT, user_agent)
            handle.setopt(pycurl.NOSIGNAL, True)
            handle.setopt(pycurl.WRITEFUNCTION, body.write)
            for name, value in kwargs.items():
                handle.setopt(getattr(pycurl, name), value)
            handle.perform()
            return body.getvalue()
        except pycurl.error:
            # Only transfer errors are retried; programming errors propagate
            # instead of spinning forever.
            if debug:
                raise
            time.sleep(1)  # brief pause so a dead host is not hammered
        finally:
            handle.close()


def get_baidudata(keyword, rn):
    """Return the organic result ``div`` elements Baidu shows for *keyword*.

    *rn* is sent as the ``rn`` query parameter.  While the downloaded page
    contains the verification URL, a notice is printed and the query is
    retried after a ten minute pause.
    """
    search_url = 'http://www.baidu.com/s?wd=%s&rn=%d' % (quote(keyword), rn)
    pagetext = curl(search_url)
    while _VERIFY_MARKER in pagetext:
        print(u"查询过程出现验证码,休息10分钟", keyword)
        time.sleep(600)
        pagetext = curl(search_url)
    soup = BeautifulSoup(pagetext)
    return soup.find_all("div", attrs={'class': 'result c-container '})


def get_rank_data(keyword, rn):
    """Map each root domain ranking for *keyword* to its list of points.

    Results without a displayed URL, without a numeric ``id`` or with a
    position that has no entry in ``score`` are skipped.
    """
    items = {}
    for result in get_baidudata(keyword, rn):
        links = result.find_all("a", attrs={'class': 'c-showurl'})
        if not links:
            continue
        match = _HOST_RE.search(links[0].text)
        if match is None:
            continue
        try:
            rank = int(result['id'])  # the element id carries the position
        except (KeyError, ValueError):
            continue
        points = score.get(rank)
        if points is None:
            # Position outside the score table (e.g. rn > 10): not scored.
            continue
        host = root_domain(match.group(1))
        items.setdefault(host, []).append(points)
    return items


def get_keywords(filename):
    """Read *filename* and return its non-blank lines, stripped, as a list."""
    with open(filename, 'r') as kwfile:
        stripped = (line.strip() for line in kwfile)
        return [kw for kw in stripped if kw]


def get_all_data(filename, rn):
    """Merge the per-keyword results of every keyword listed in *filename*.

    Prints a running counter with each keyword and returns a dict mapping
    root domain to the list of points collected across all keywords.
    """
    items = {}
    for i, kw in enumerate(get_keywords(filename), 1):
        print(i, kw)
        for host, points in get_rank_data(kw, rn).items():
            items.setdefault(host, []).extend(points)
    return items


def get_score(filename, rn):
    """Append one row per host to ``score.csv`` for the keywords in *filename*.

    Each row holds the host, the number of scored appearances, the average
    points and the total points.  The header row is written only when the
    file is still empty.
    """
    data = get_all_data(filename, rn)
    with io.open('score.csv', 'a', encoding='utf-8') as fh:
        fh.seek(0, 2)  # 2 == SEEK_END; makes tell() report the file size
        if fh.tell() == 0:
            # Header text (including its trailing comma) is kept as-is so new
            # files match files produced by earlier runs.
            fh.write(u'host,kws,average_score,host_score,\n')
        for host, rank in data.items():
            if not rank:
                continue
            if host is None:
                host = u'error page'
            elif isinstance(host, bytes):
                host = host.decode('utf-8')
            kws = len(rank)              # number of scored appearances
            host_score = sum(rank)       # total points
            average_score = host_score / kws
            fh.write(u'{0},{1},{2},{3}\n'.format(
                host, kws, average_score, host_score))
    return


if __name__ == "__main__":
    try:
        _prompt = raw_input  # Python 2
    except NameError:
        _prompt = input      # Python 3
    # NOTE(review): the visible source ends right after this prompt; whatever
    # followed (presumably a get_score call) is not shown and is not guessed.
    file = _prompt("请输入包含关键词的文件名:")

  

最新文章

  1. iOS-开发者相关的几种证书
  2. 自然语言14_Stemming words with NLTK
  3. js原生选项卡(包含移动端无缝选项卡)三
  4. linux 查看是否安装perl模块
  5. leetcode 107
  6. MyBatis之八:需要说明的几个java api的生命周期以及封装
  7. 配置hibernate出现的错误一
  8. sql中的触发器、视图、事务
  9. TSharding:用于蘑菇街交易平台的分库分表组件
  10. CSS远程加载字体
  11. 复习一下SpringMVC的工作原理
  12. windows Tomcat+Nginx 集群 迷你版
  13. 1.Java关键字和保留字
  14. windows server 2012 浏览器IE10无法下载。
  15. .NET Core开发日志——依赖注入
  16. 单例设计模式 --c#
  17. 经典DFS问题实践
  18. 以太网,IP,TCP,UDP数据包分析【转】
  19. Tuxedo安装、配置、以及演示样例程序 (学习网址)
  20. ARC 101 C - Candles

热门文章

  1. Java程序内存分析
  2. Appium移动自动化测试-----(五) java-client安装与测试
  3. SUPPA 可变剪切分析
  4. (一)构建基于ubuntu docker MySQL 5.6 镜像并推送到Docker Hub
  5. SQL——INSERT INTO(增)
  6. WUSTOJ 1336: Lucky Boy(Java)博弈
  7. springboot基础、注解等
  8. 怎样查看python的所有关键字
  9. 怎样安装ipython
  10. 无法解析的外部符号 ___argc nafxcw.lib(appcore.obj)