#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''Thread3 多线程测试采集'''
import threading,time,queue,Mongo_utils,mysqlUtils,requests,json,os
from lxml import html
etree = html.etree
# Stop flag polled by the worker loop in spider(); the main script sets it to 1.
exitFlag = 0
# presumably a MongoDB database handle (Mongo_utils is a project module) — TODO confirm
db = Mongo_utils.mongodb_15_27017task()
# Collection that receives the scraped agent records (insert_many in the main script).
table = db["xx_anjuke_agent1"]
# Collection the work items are read from.
table_urls = db["xx_spider_urls1"]
# NOTE(review): list_pro is not referenced anywhere else in the visible code.
list_pro = mysqlUtils.select_pro()
# At most 2000 work items; the main script puts each one on the work queue.
list_urls = table_urls.find().limit(2000)
# Records appended by the workers and bulk-inserted by the main script.
insert_list = []
# Only ever cleared in the visible code; the append to it is commented out.
del_list = []
class myThread(threading.Thread):
    """Worker thread that runs ``spider`` against a shared work queue.

    Args:
        threadId: Identifier stored on the instance as ``threadId``.
        name: Thread name. ``threading.Thread`` stores it as a string, so an
            integer name can still be concatenated in ``run``.
        q: Queue object handed to ``spider`` when the thread runs.
    """

    def __init__(self, threadId, name, q):
        super().__init__()
        self.threadId = threadId
        self.name = name
        self.q = q

    def run(self):
        """Run ``spider`` for this thread, printing a start and an exit marker."""
        print("开始线程" + self.name)
        spider(self.name, self.q)
        print("退出线程" + self.name)
def head():
    """Return the HTTP request headers used for the page fetches.

    A new dict is built on every call, so callers may modify the result
    without affecting later calls.

    Returns:
        dict: Header name to header value.
    """
    # NOTE(review): "upgrade-insecure-requests" is sent with an empty value and
    # "br" is advertised in Accept-Encoding; both are kept exactly as they were.
    return {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "",
        "Connection": "keep-alive",
        "Content-Type": "text/html; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }
def _abort_if_redirected(response, url):
    """Terminate the whole process when ``response`` is a 302 redirect."""
    if response.status_code == 302:
        print("验证")
        print(url)
        # Hard exit, as in the original: nothing buffered in memory is flushed.
        os._exit(0)


def _scrape_agent(item, headers):
    """Fetch and parse one work item; return its record dict, or None to skip it.

    Args:
        item: Work item providing the "_id", "city", "zone", "street" and
            "urls" keys.
        headers: Request headers sent with both HTTP requests.

    Returns:
        dict | None: The record to store, or None when the responses do not
        contain the expected marker strings.
    """
    _id = item["_id"]
    urls = item["urls"]

    # NOTE(review): this URL literal is a redacted placeholder without a "%s"
    # slot, so the formatting raises for a non-mapping ``_id``. Restore the
    # real contact endpoint before running.
    url = "https://。。。。。。。。。。。" % _id
    # requests.get() closes the session it creates; requests.session().get()
    # left one unclosed session behind per call.
    response_contact = requests.get(url=url, allow_redirects=False, headers=headers,
                                    timeout=1)
    print(response_contact.status_code)
    _abort_if_redirected(response_contact, url)
    contact = json.loads(response_contact.text)["data"]

    response_dl = requests.get(url=urls, allow_redirects=False, headers=headers,
                               timeout=1)
    _abort_if_redirected(response_dl, urls)

    if "获取成功" not in response_contact.text or "房屋编码" not in response_dl.text:
        # The original printed "pass" and then parsed the page anyway; skip it.
        print("pass")
        return None

    page = etree.HTML(response_dl.content)
    company_link = "//div[@class='broker-company']/p[1]/a"
    return {
        "_id": _id,
        "city": item["city"],
        "zone": item["zone"],
        "street": item["street"],
        "name": page.xpath("//div[@class='brokercard-name']/text()")[0].strip(),
        "company": page.xpath(company_link + "/text()")[0],
        "company_url": page.xpath(company_link + "/@href")[0],
        "store": page.xpath("//div[@class='broker-company']/p[2]/span/text()")[0],
        "site": "anjuke",
        "store_url": "",
        "staffNo": "https://anjuxingye1.anjuke.com/gongsi-jjr-%s/" % _id,
        "tag": "",
        "all_comm": "",
        "contact": contact,
    }


def spider(name, q):
    """Worker loop: take items from ``q`` and collect one record per item.

    Runs until the module-level ``exitFlag`` becomes truthy. Each item is
    taken from ``q`` while holding ``queueLock``, scraped, and the resulting
    record is appended to the module-level ``insert_list``. A failure on one
    item is printed and the loop moves on to the next item.

    Args:
        name: Worker name used in the progress output.
        q: Queue of work items.
    """
    while not exitFlag:
        # ``with`` releases the lock even if the queue access raises.
        with queueLock:
            if q.empty():
                continue
            item = q.get()

        try:
            record = _scrape_agent(item, head())
        except Exception as exc:
            # Best effort per item, but report the failure instead of hiding it.
            print("%s error: %r" % (name, exc))
        else:
            if record is not None:
                insert_list.append(record)
                print("size: %s" % len(insert_list))
        print("%s processing %s" % (name, item))
def _flush_pending():
    """Bulk-insert the records collected so far and drop them from the buffer.

    The batch is copied and then removed from the front of ``insert_list``,
    so records appended by workers during the insert stay in the buffer for
    the next flush. An insert error is printed and the batch is discarded,
    as before.
    """
    batch = insert_list[:]
    del insert_list[:len(batch)]
    del_list.clear()
    if not batch:
        # insert_many() rejects an empty document list.
        return
    try:
        table.insert_many(batch, ordered=False)
        # table_urls.remove({"urls": {"$in": del_list}})
        print("插入1000")
    except Exception as e:
        print(e)


threadList = range(0, 5)
queueLock = threading.Lock()
workQueue = queue.Queue(50000)
threads = []
threadID = 1
for tName in threadList:
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue while holding the lock so the workers wait until it is loaded.
with queueLock:
    for word in list_urls:
        workQueue.put(word)

# Wait for the queue to drain, flushing collected records along the way.
while not workQueue.empty():
    if len(insert_list) > 10:
        _flush_pending()
    # Short sleep instead of a hot spin that competes with the workers.
    time.sleep(0.1)

# Tell the workers it is time to exit.
exitFlag = 1

# Wait for every worker to finish its current item before the last flush,
# so that records still being scraped are not lost.
for t in threads:
    t.join()
_flush_pending()
print("退出主线程")

最新文章

  1. 湘潭邀请赛 Hamiltonian Path
  2. 【转】在C#用HttpWebRequest中发送GET/HTTP/HTTPS请求
  3. poj2546Circular Area(两圆相交面积)
  4. kerberos+ldap+hadoop-ha 安全认证部署配置
  5. Silverlight通过Wcf Data Service访问数据库之ADO.NET Entity Framework篇
  6. java 常见异常(二)
  7. 基于二叉树和数组实现限制长度的最优Huffman编码
  8. C# typeof() 和 GetType()区别
  9. ios @property
  10. leetcode python 033 旋转数组查找
  11. pygame-KidsCanCode系列jumpy-part14-背景音乐及音效
  12. python测试开发django-11.模型models详解
  13. poj 3046 Ant Counting(多重集组合数)
  14. Intellij-怎么避免import.*包,以及import包顺序问题
  15. Red Hat 6.5 Samba服务器的搭建(登录访问)
  16. Servlet 学习总结-1
  17. [TJOI2013]最长上升子序列 平衡树
  18. FreeRTOS - 任务使用注意
  19. mysql:查询以逗号相隔的字符串
  20. 如何把win10系统迁移到SSD固态硬盘

热门文章

  1. ArcGIS超级工具SPTOOLS-制图篇
  2. 使用hibernate利用实体类生成表和利用表生成实体类
  3. 批量停止、删除docker容器
  4. linux的最简socket编程
  5. linux下怎么用ssh连接另一台linux服务器
  6. C++ STL partial_sort_copy iterator
  7. mac下的夜神模拟器链接vscode
  8. 阶段5 3.微服务项目【学成在线】_day04 页面静态化_16-页面静态化-模板管理-模板制作
  9. 使用rsync备份数据
  10. react中 如何异步展示后台接口的提示消息