详情点我跳转

关注公众号“轻松学编程”了解更多。

一、多线程抓取网页

流程:a.设置种子url b.获取区域列表 c.循环区域列表 d.创建线程获取页面数据

e.启动线程

import csv
import json
import os
import threading
import time

import lxml
import requests
from lxml import etree

# Re-entrant lock that serialises CSV appends across the worker threads.
rlock = threading.RLock()

# Request headers sent with every HTTP request (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

# Seconds to wait for a response; without a timeout a stalled connection
# would block its worker thread forever.
REQUEST_TIMEOUT = 10

# Directory that receives one CSV file per area.
DATA_DIR = './data/'


def _fetchTree(url):
    """Download *url* and return the body parsed as an lxml HTML tree."""
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    return etree.HTML(html)


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _parseHouse(house):
    """Extract one CSV row from a listing ``<li>`` element.

    :param house: lxml element for a single listing.
    :return: list ``[title, url, houseInfo, positionInfo, totalPrice, unitPrice]``;
        fields whose node is missing are empty strings.
    """
    houseTitle = _first(house.xpath('.//div[@class="title"]/a/text()'))
    houseUrl = _first(house.xpath('.//div[@class="title"]/a/@href'))
    # These two blocks hold several text nodes; concatenate them all.
    houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
    positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
    totalPrice = _first(house.xpath('.//div[@class="totalPrice"]/span/text()'))
    # Append the unit suffix only when a price was actually found.
    priceInfo = totalPrice + '万' if totalPrice else ''
    unitPrice = _first(house.xpath('.//div[@class="unitPrice"]/span/text()'))
    return [houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice]


def getAreaList(url):
    """Collect the area links shown on a city listing page.

    :param url: URL of the city's listing index page.
    :return: dict mapping each area name to its absolute listing URL.
    """
    tree = _fetchTree(url)
    areaDict = {}
    for link in tree.xpath('//div[@data-role="ershoufang"]/div/a'):
        names = link.xpath('./text()')
        hrefs = link.xpath('./@href')
        if not names or not hrefs:
            # Skip anchors lacking text or href instead of raising IndexError.
            continue
        areaDict[names[0]] = "https://gz.lianjia.com" + hrefs[0]
    return areaDict


def getPageTotal(url):
    """Return the number of result pages for an area.

    :param url: URL of the area's first listing page.
    :return: int taken from the ``totalPage`` field of the pager's
        ``page-data`` attribute; 1 when the page has no pager so that the
        first page is still attempted.
    """
    tree = _fetchTree(url)
    pageData = tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        return 1
    return int(json.loads(pageData[0])["totalPage"])


def getHouseInfo(area, url):
    """Scrape every result page of one area and append the rows to its CSV.

    :param area: area name; also used as the CSV file name under ``./data/``.
    :param url: URL of the area's first listing page.
    :return: None
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)
    csvPath = DATA_DIR + area + '.csv'

    pageTotal = getPageTotal(url)
    for page in range(1, pageTotal + 1):
        newurl = url + "pg%d/" % page
        tree = _fetchTree(newurl)
        houseList = tree.xpath('//li[@class="clear"]')
        print(houseList)

        rows = []
        for house in houseList:
            row = _parseHouse(house)
            print(*row)
            rows.append(row)
        if not rows:
            continue

        # Open the file once per page rather than once per row.
        # newline='' stops the csv module from emitting blank lines on Windows.
        with rlock:
            with open(csvPath, 'a+', encoding='utf-8', errors='ignore', newline='') as f:
                csv.writer(f).writerows(rows)


if __name__ == '__main__':
    # Seed URL: the city-wide listing index.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    areaDict = getAreaList(cityUrl)

    threadList = []
    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed time.
    started = time.perf_counter()
    for areaName, areaUrl in areaDict.items():
        # One worker thread per area.
        t = threading.Thread(target=getHouseInfo, args=(areaName, areaUrl))
        t.start()
        threadList.append(t)

    # Wait for every worker before reporting the elapsed time.
    for t in threadList:
        t.join()
    print(time.perf_counter() - started)

二、协程抓取网页

# Monkey-patch first so that the modules imported below (threading, and the
# socket/ssl layers used by requests) pick up gevent's cooperative versions.
from gevent import monkey
monkey.patch_all()

import csv
import json
import os
import threading
import time

import gevent
import lxml
import requests
from lxml import etree

# Re-entrant lock guarding CSV appends.
rlock = threading.RLock()

# Request headers sent with every HTTP request (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

# Seconds to wait for a response; without a timeout a stalled connection
# would block its greenlet forever.
REQUEST_TIMEOUT = 10

# Directory that receives one CSV file per area.
DATA_DIR = './data/'


def _fetchTree(url):
    """Download *url* and return the body parsed as an lxml HTML tree."""
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    return etree.HTML(html)


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _parseHouse(house):
    """Extract one CSV row from a listing ``<li>`` element.

    :param house: lxml element for a single listing.
    :return: list ``[title, url, houseInfo, positionInfo, totalPrice, unitPrice]``;
        fields whose node is missing are empty strings.
    """
    houseTitle = _first(house.xpath('.//div[@class="title"]/a/text()'))
    houseUrl = _first(house.xpath('.//div[@class="title"]/a/@href'))
    # These two blocks hold several text nodes; concatenate them all.
    houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
    positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
    totalPrice = _first(house.xpath('.//div[@class="totalPrice"]/span/text()'))
    # Append the unit suffix only when a price was actually found.
    priceInfo = totalPrice + '万' if totalPrice else ''
    unitPrice = _first(house.xpath('.//div[@class="unitPrice"]/span/text()'))
    return [houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice]


def getAreaList(url):
    """Collect the area links shown on a city listing page.

    :param url: URL of the city's listing index page.
    :return: dict mapping each area name to its absolute listing URL.
    """
    tree = _fetchTree(url)
    areaDict = {}
    for link in tree.xpath('//div[@data-role="ershoufang"]/div/a'):
        names = link.xpath('./text()')
        hrefs = link.xpath('./@href')
        if not names or not hrefs:
            # Skip anchors lacking text or href instead of raising IndexError.
            continue
        areaDict[names[0]] = "https://gz.lianjia.com" + hrefs[0]
    return areaDict


def getPageTotal(url):
    """Return the number of result pages for an area.

    :param url: URL of the area's first listing page.
    :return: int taken from the ``totalPage`` field of the pager's
        ``page-data`` attribute; 1 when the page has no pager so that the
        first page is still attempted.
    """
    tree = _fetchTree(url)
    pageData = tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        return 1
    return int(json.loads(pageData[0])["totalPage"])


def getHouseInfo(area, url):
    """Scrape every result page of one area and append the rows to its CSV.

    :param area: area name; also used as the CSV file name under ``./data/``.
    :param url: URL of the area's first listing page.
    :return: None
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)
    csvPath = DATA_DIR + area + '.csv'

    pageTotal = getPageTotal(url)
    for page in range(1, pageTotal + 1):
        newurl = url + "pg%d/" % page
        tree = _fetchTree(newurl)
        houseList = tree.xpath('//li[@class="clear"]')
        print(houseList)

        rows = []
        for house in houseList:
            row = _parseHouse(house)
            print(*row)
            rows.append(row)
        if not rows:
            continue

        # Open the file once per page rather than once per row.
        # newline='' stops the csv module from emitting blank lines on Windows.
        with rlock:
            with open(csvPath, 'a+', encoding='utf-8', errors='ignore', newline='') as f:
                csv.writer(f).writerows(rows)


if __name__ == '__main__':
    # Seed URL: the city-wide listing index.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    areaDict = getAreaList(cityUrl)

    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed time.
    started = time.perf_counter()
    # One greenlet per area.
    geventList = [
        gevent.spawn(getHouseInfo, areaName, areaUrl)
        for areaName, areaUrl in areaDict.items()
    ]
    # Wait for every greenlet before reporting the elapsed time.
    gevent.joinall(geventList)
    print(time.perf_counter() - started)

三、协程与进程结合抓取网页

# Monkey-patch first so that the modules imported below (threading, and the
# socket/ssl layers used by requests) pick up gevent's cooperative versions.
from gevent import monkey
monkey.patch_all()

import csv
import json
import multiprocessing
import os
import threading
import time

import gevent
import lxml
import requests
from lxml import etree

# Re-entrant lock guarding CSV appends. It is created per process, so it only
# coordinates writers inside the same process.
rlock = threading.RLock()

# Request headers sent with every HTTP request (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

# Seconds to wait for a response; without a timeout a stalled connection
# would block its worker forever.
REQUEST_TIMEOUT = 10

# Directory that receives one CSV file per area.
DATA_DIR = './data/'


def _fetchTree(url):
    """Download *url* and return the body parsed as an lxml HTML tree."""
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    return etree.HTML(html)


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _parseHouse(house):
    """Extract one CSV row from a listing ``<li>`` element.

    :param house: lxml element for a single listing.
    :return: list ``[title, url, houseInfo, positionInfo, totalPrice, unitPrice]``;
        fields whose node is missing are empty strings.
    """
    houseTitle = _first(house.xpath('.//div[@class="title"]/a/text()'))
    houseUrl = _first(house.xpath('.//div[@class="title"]/a/@href'))
    # These two blocks hold several text nodes; concatenate them all.
    houseAddr = ''.join(house.xpath('.//div[@class="houseInfo"]//text()'))
    positionInfo = ''.join(house.xpath('.//div[@class="positionInfo"]//text()'))
    totalPrice = _first(house.xpath('.//div[@class="totalPrice"]/span/text()'))
    # Append the unit suffix only when a price was actually found.
    priceInfo = totalPrice + '万' if totalPrice else ''
    unitPrice = _first(house.xpath('.//div[@class="unitPrice"]/span/text()'))
    return [houseTitle, houseUrl, houseAddr, positionInfo, priceInfo, unitPrice]


def _scrapePages(area, url, pageTotal):
    """Fetch pages 1..pageTotal of an area and append their rows to its CSV.

    :param area: area name; also used as the CSV file name under ``./data/``.
    :param url: URL of the area's first listing page.
    :param pageTotal: number of result pages to walk.
    :return: None
    """
    csvPath = DATA_DIR + area + '.csv'
    for page in range(1, pageTotal + 1):
        newurl = url + "pg%d/" % page
        tree = _fetchTree(newurl)
        houseList = tree.xpath('//li[@class="clear"]')
        print(houseList)

        rows = []
        for house in houseList:
            row = _parseHouse(house)
            print(*row)
            rows.append(row)
        if not rows:
            continue

        # Open the file once per page rather than once per row.
        # newline='' stops the csv module from emitting blank lines on Windows.
        with rlock:
            with open(csvPath, 'a+', encoding='utf-8', errors='ignore', newline='') as f:
                csv.writer(f).writerows(rows)


def getAreaList(url):
    """Collect the area links shown on a city listing page.

    :param url: URL of the city's listing index page.
    :return: dict mapping each area name to its absolute listing URL.
    """
    tree = _fetchTree(url)
    areaDict = {}
    for link in tree.xpath('//div[@data-role="ershoufang"]/div/a'):
        names = link.xpath('./text()')
        hrefs = link.xpath('./@href')
        if not names or not hrefs:
            # Skip anchors lacking text or href instead of raising IndexError.
            continue
        areaDict[names[0]] = "https://gz.lianjia.com" + hrefs[0]
    return areaDict


def getPageTotal(url):
    """Return the number of result pages for an area.

    :param url: URL of the area's first listing page.
    :return: int taken from the ``totalPage`` field of the pager's
        ``page-data`` attribute; 1 when the page has no pager so that the
        first page is still attempted.
    """
    tree = _fetchTree(url)
    pageData = tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')
    if not pageData:
        return 1
    return int(json.loads(pageData[0])["totalPage"])


def getHouseInfo(area, url):
    """Scrape one area inside the current process using a greenlet.

    :param area: area name; also used as the CSV file name under ``./data/``.
    :param url: URL of the area's first listing page.
    :return: None
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)

    pageTotal = getPageTotal(url)
    # NOTE(review): only one greenlet is spawned per process, so the pages of
    # an area are still fetched one after another.
    g = gevent.spawn(_scrapePages, area, url, pageTotal)
    gevent.joinall([g])


if __name__ == '__main__':
    # Seed URL: the city-wide listing index.
    cityUrl = "https://gz.lianjia.com/ershoufang/"
    areaDict = getAreaList(cityUrl)

    processList = []
    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed time.
    started = time.perf_counter()
    for areaName, areaUrl in areaDict.items():
        # One worker process per area.
        p = multiprocessing.Process(target=getHouseInfo, args=(areaName, areaUrl))
        p.start()
        processList.append(p)

    # Wait for every process before reporting the elapsed time.
    for p in processList:
        p.join()
    print(time.perf_counter() - started)

后记

【后记】为了让大家能够轻松学编程,我创建了一个公众号【轻松学编程】,里面有让你快速学会编程的文章,当然也有一些干货提高你的编程水平,也有一些编程项目适合做一些课程设计等课题。

也可加我微信【1257309054】,拉你进群,大家一起交流学习。
如果文章对您有帮助,请我喝杯咖啡吧!

公众号

关注我,我们一起成长~~

最新文章

  1. Scrapy shell调试网页的信息
  2. maven构建过程
  3. Python基础2:流程控制语句 while / for循环
  4. C语言 百炼成钢13
  5. centos+php+coreseek+sphinx+mysql之二sphinx配置篇
  6. Maven 安装Jar包到本地仓库
  7. JDBC用ResultSet访问大量数据时会遇到的问题
  8. 1014: [JSOI2008]火星人prefix - BZOJ
  9. Tomcat内存溢出
  10. Less的内置函数
  11. Jenkins-在windows上安装及其部署
  12. Monkey 生成报告方法
  13. oracle FLASHBACK TABLE
  14. inotifywait实现目录监控--http://man.linuxde.net/inotifywait
  15. CentOS6启动流程
  16. 【原创】QString 函数 replace()indexOf()、 lastindexOf()
  17. 九度OJ1036-空缺数字计算-暴力破解
  18. WEKA从sqlite数据库文件导入数据
  19. Windows下sbt安装配置
  20. centos6.2 安装Mysql5.6

热门文章

  1. Python-判断变量类型和继承链-type isinstance
  2. GAN生成的评价指标 Evaluation of GAN
  3. Python练习题 035:Project Euler 007:第10001个素数
  4. 0923 lca练习
  5. 前端gitlab-ci.yml 入门
  6. Code Test(2)
  7. Python日志采集(详细)
  8. MeteoInfoLab脚本示例:线性拟合
  9. JAVA Schedule的Cron表达式
  10. js实现自定义弹窗