爬虫多线程模板,xpath,etree
2024-09-01 17:33:02
import logging
import threading
import time
from queue import Queue

logger = logging.getLogger(__name__)


class QuiShi:
    """Multi-threaded joke-page scraper built as a three-stage queue pipeline.

    Stages (each connected to the next by a ``Queue``):

    1. ``url_query``     -- page URLs waiting to be downloaded.
    2. ``html_query``    -- decoded HTML documents waiting to be parsed.
    3. ``content_query`` -- lists of extracted items waiting to be saved.

    Worker methods loop forever; ``run`` starts them as daemon threads and
    returns once every queue has been fully processed.
    """

    # Seconds to wait on the server before a download is abandoned; without
    # a timeout a stalled connection would block a worker indefinitely.
    REQUEST_TIMEOUT = 10
    # Number of concurrent download threads (the original started three).
    DOWNLOAD_WORKERS = 3

    def __init__(self):
        """Set up the URL template, request headers and the three queues."""
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
        # Queue of page URLs.
        self.url_query = Queue()
        # Queue of downloaded HTML strings.
        self.html_query = Queue()
        # Queue of extracted content lists.
        self.content_query = Queue()

    def get_url_list(self):
        """Enqueue the URLs of pages 1 through 4 onto ``url_query``."""
        for page in range(1, 5):
            self.url_query.put(self.temp_url.format(page))

    def parse_url(self):
        """Worker loop: download each queued URL and enqueue its HTML.

        Pages are decoded as GBK. A failed download or decode is logged and
        skipped so that one bad page cannot stall the whole pipeline.
        """
        # Third-party dependency, imported locally so the module itself can
        # be loaded without it.
        import requests

        while True:
            url = self.url_query.get()
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                self.html_query.put(response.content.decode("gbk"))
            except (requests.RequestException, UnicodeDecodeError):
                logger.exception("Failed to download %s", url)
            finally:
                # Always acknowledge the item, otherwise url_query.join()
                # would wait forever after an error.
                self.url_query.task_done()

    def get_content_list(self):
        """Worker loop: parse each queued HTML page into a list of items.

        Each item is a dict with the keys ``title``, ``title_href`` and
        ``content`` (all lists of strings, as returned by XPath). One list
        per page is put on ``content_query``.
        """
        # Third-party dependency, imported locally (see parse_url).
        from lxml import etree

        while True:
            html_str = self.html_query.get()
            try:
                # The original called .strip(""), which strips nothing;
                # .strip() removes the surrounding whitespace as intended.
                html_str = html_str.replace("<br />", "").strip()
                # etree.HTML turns the markup into an element tree.
                html = etree.HTML(html_str)
                content_list = []
                for h3 in html.xpath('//div[@id="footzoon"]/h3'):
                    # NOTE(review): following-sibling::div matches EVERY
                    # later sibling div, not only the one right after this
                    # h3 -- "div[1]" may be what is wanted; confirm against
                    # the page markup before changing.
                    texts = h3.xpath('./following-sibling::div/text()')
                    content_list.append({
                        "title": h3.xpath("./a/text()"),
                        "title_href": h3.xpath("./a/@href"),
                        # Drop full-width (ideographic) spaces.
                        "content": [t.replace("\u3000", "") for t in texts],
                    })
                self.content_query.put(content_list)
            except Exception:
                # Worker-loop boundary: this is the only parser thread, so
                # it must survive a malformed page or the pipeline stalls.
                logger.exception("Failed to parse a downloaded page")
            finally:
                self.html_query.task_done()

    def save_content_list(self):
        """Worker loop: print each extracted content list."""
        while True:
            cons = self.content_query.get()
            try:
                print(cons)
            finally:
                self.content_query.task_done()

    def run(self):
        """Run the whole pipeline and return when all pages are processed."""
        # Import the third-party packages up front so a missing dependency
        # fails here, instead of silently killing a worker thread and
        # leaving join() blocked.
        import requests  # noqa: F401
        from lxml import etree  # noqa: F401

        # 1. Build the URL list synchronously: it is cheap, and filling the
        #    queue before joining removes the race where url_query.join()
        #    returned before anything had been enqueued.
        self.get_url_list()

        # 2. Start the workers as daemon threads; their loops never end, so
        #    non-daemon threads would keep the process alive forever.
        workers = [
            threading.Thread(target=self.parse_url, daemon=True)
            for _ in range(self.DOWNLOAD_WORKERS)
        ]
        workers.append(threading.Thread(target=self.get_content_list, daemon=True))
        workers.append(threading.Thread(target=self.save_content_list, daemon=True))
        for worker in workers:
            worker.start()

        # 3. Wait stage by stage: an item is only acknowledged after its
        #    result was put on the next queue, so joining in pipeline order
        #    guarantees every page has been fully handled.
        for stage in (self.url_query, self.html_query, self.content_query):
            stage.join()


if __name__ == '__main__':
    started = time.time()
    quishi = QuiShi()
    quishi.run()
    print(time.time() - started)
最新文章
- Java 自动装箱、拆箱机制及部分源码分析
- 鼠标选择文字事件js代码,增加层问题
- 启动tomcat时 一闪而过解决方法
- Java Day 02
- C++ Primer 学习笔记_76_模板与泛型编程 --模板定义[续]
- java mysql模板
- Shell学习之Shift的用法
- C++中出现的计算机术语1
- Java调用摄像头截图
- python中字符串中一些函数的用法
- Java Enum用法详解
- Servlet(自己实现的Servlet)细节
- pytorch怎么抽取中间的特征或者梯度
- scss初学小结(转阮一峰老师SASS用法指南http://www.ruanyifeng.com/blog/2012/06/sass.html)
- 安装swoole
- Spring Boot笔记六:Thymeleaf介绍
- Linux Zebra
- flask异步
- 常用的Markdown语法
- Nginx的https配置记录以及http强制跳转到https的方法梳理