In this chapter we crawl all the novels from a website and store them locally.

Target site: www.cuiweijuxs.com

Analyzing the site shows the crawl takes four steps: from the home page, enter a category section and open its paginated listing; open every link on each listing page; open each work's page; and open each chapter's content.

So the implementation goes as follows:

1. Enter the category page, www.cuiweijuxs.com/jingpinxiaoshuo/

Find the maximum page number:

<a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>

Then loop over and open every page using the pattern

href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html" 

2. Find all the work links on the current page and open each one in a loop; the elements used to locate them are shown below:

div id="newscontent"
div class="l"
  <span class="s2">
  <a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">标题</a>

3. Open a work link and find its chapter list; the elements used to locate it are shown below:

<div id="list">
<dd>
<a href="/4_4508/528170.html">第一章</a>
</dd>
</div>

4. Open each chapter link and read its content; a short selector sketch covering all four locators follows this list.

<div id="content">

内容
<div>
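
Taken together, the four locators map onto BeautifulSoup lookups roughly as sketched below. The fetch helper and the CSS-selector style are my own illustration, not code from this post; the real implementation uses find()/find_all() and is shown step by step later on.

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Hypothetical helper for this sketch: fetch a URL and return the parsed page.
def fetch(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return BeautifulSoup(urlopen(req).read(), 'html5lib')

listing = fetch('http://www.cuiweijuxs.com/jingpinxiaoshuo/')
print(listing.select_one('a.last'))                      # step 1: link holding the max page number
print(listing.select('#newscontent .l span.s2 a')[:3])   # step 2: links to individual works
# Steps 3 and 4 apply the same pattern to a work page and a chapter page:
#   soup.select('#list dd a')              -> chapter links
#   soup.select_one('#content').get_text() -> chapter text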

 

Step 1: create the class, initialize the parameters, and factor out a method that fetches a page and returns it parsed by BeautifulSoup.

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape the site with BeautifulSoup
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # fetch a URL and return the BeautifulSoup-parsed page
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup
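
As a quick sanity check (not part of the original post), getSoup can be exercised on its own once the class above is defined; it needs the html5lib parser to be installed, otherwise BeautifulSoup raises FeatureNotFound.

capture = Capture()
soup = capture.getSoup(capture.one_page_url)
# the category page should contain the "last page" link that step 2 relies on,
# e.g. <a class="last" href="...5_122.html">122</a>
print(soup.find('a', 'last'))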

  

Step 2: enter the category page, find the maximum page number, and open every listing page in a loop.

    # read the listing pages
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)
        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)
    # end readPageOne

getSoup fetches the parsed HTML page; find locates the a tag whose class is "last", and its text gives the maximum page number.

The page loop then starts from 1.
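
For reference, the second positional argument of find is matched against the CSS class, so find("a", "last") is shorthand for find("a", class_="last"). A tiny self-contained check (illustration only, using the anchor tag shown in step 1):

from bs4 import BeautifulSoup

# find("a", "last") matches on the class attribute, i.e. class_="last"
html = '<a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>'
soup = BeautifulSoup(html, 'html.parser')
last = soup.find("a", "last")
print(int(last.string))   # -> 122, the maximum page number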

Step 3: read the work links on a single listing page.

    # read all work links on one listing page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
        # end for
    # end readPageTwo

Find the div whose id is newscontent, then the div with class "l" under it, then every span with class "s2", and finally the a tags inside those spans; loop over the a tags and open each one.

The link text (a_href.get_text()) is used as the folder name.
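
To make the chain easier to follow, here is a self-contained illustration run against a trimmed-down stand-in for the listing HTML (the fragment below is invented for the example, not copied from the site):

from bs4 import BeautifulSoup

html = '''
<div id="newscontent">
  <div class="l">
    <span class="s2"><a href="http://www.cuiweijuxs.com/4_4521/">标题</a></span>
  </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
for a_href in con_div.find_all('span', {'class': 's2'})[0].find_all('a'):
    print(a_href.get('href'), a_href.get_text())   # the link text becomes the folder name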

Step 4: open a work's page, loop over its chapter links, and build the file name for each chapter.

    # open a work page and loop over its chapter links
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, '已存在')
            else:
                self.readPageFour(href, txt_name)

  

Step 5: open each chapter link, read everything under the div with id=content, and write it to a file.

    # read one chapter and write it to a file
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
        self.writeTxt(path, content)
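
One caveat worth noting (my observation, not from the original post): by the time replace runs, get_text() has already stripped every tag, so replacing '<br/>' on the extracted string does nothing and the chapter loses its line breaks; likewise the &nbsp; entities have already been decoded to the character '\xa0'. A sketch of one way to keep the breaks, using the same con_div as above:

# Turn the <br> tags into newlines *before* extracting the text,
# then normalize the non-breaking spaces the site uses for indentation.
for br in con_div.find_all('br'):
    br.replace_with('\n')
content = con_div.get_text().replace('\xa0', ' ')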

The complete implementation:

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape the site with BeautifulSoup
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # fetch a URL and return the BeautifulSoup-parsed page
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup

    # read the listing pages
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)
        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)
    # end readPageOne

    # read all work links on one listing page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
        # end for
    # end readPageTwo

    # open a work page and loop over its chapter links
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, '已存在')
            else:
                self.readPageFour(href, txt_name)

    # read one chapter and write it to a file
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
        self.writeTxt(path, content)

    # (unused) variant that only extracts the content
    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

    # create the folder if it does not exist yet
    def createFolder(self, path):
        path = path.strip()
        # strip a trailing backslash
        path = path.rstrip("\\")
        isExists = os.path.exists(path)
        # create it only when missing
        if not isExists:
            os.makedirs(path)
            print(path + ' create')
        else:
            print(path + ' 目录已存在')
    # end createFolder

    # write the chapter text to a file, skipping files that already exist
    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)


Capture().run()
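
To try the crawler without walking all 122 listing pages, a single listing page can be fed straight to readPageTwo; the snippet below is a hypothetical smoke test that would replace the Capture().run() call at the bottom of the script:

if __name__ == '__main__':
    c = Capture()
    # crawl only the first listing page instead of the whole catalogue
    c.readPageTwo(c.two_page_url.replace('?', '1'))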
