python爬取哦漫画

 import requests

 from lxml import etree

 from bs4 import BeautifulSoup

 import os

 from selenium import webdriver

 #解析每个漫画分页并下载漫画

 def manhua(url):
     """Download the single comic image shown on one reader page.

     The page is loaded through the module-level Selenium ``browser`` and the
     rendered source is parsed with lxml. The image is saved as
     ``./漫画/<alt>/<title>/<image file name>``, where ``alt`` is the text of
     the ``h1/a`` element and ``title`` the text of the ``h2`` element
     (presumably the series name and the chapter name -- confirm against the
     site).

     Pages that lack the expected image or heading elements are reported and
     skipped instead of aborting the crawl with an IndexError.

     Args:
         url: Address of one reader page.

     Raises:
         requests.RequestException: If the image request fails, times out or
             returns an HTTP error status.
     """
     browser.get(url)
     # Parse the source as rendered by the browser, not the raw HTTP response.
     page = etree.HTML(browser.page_source)

     img_urls = page.xpath('//img[@id="mangaFile"]/@src')
     alts = page.xpath('/html/body/div[2]/div[2]/h1/a/text()')
     titles = page.xpath('/html/body/div[2]/div[2]/h2/text()')
     if not (img_urls and alts and titles):
         print('skip (unexpected page layout):', url)
         return
     img_url, alt, title = img_urls[0], alts[0], titles[0]
     print(img_url,alt,title)

     path='./漫画/'+alt+'/'+title+'/'
     # exist_ok avoids the check-then-create race of exists() + makedirs().
     os.makedirs(path, exist_ok=True)
     # The file name is the last path segment of the image address.
     fname=img_url.split('/')[-1]
     target = os.path.join(path,fname)
     print(target)

     # Fail loudly on HTTP errors so an error page is never saved as an image.
     response = requests.get(img_url, timeout=30)
     response.raise_for_status()
     with open(target,'wb') as f:
         f.write(response.content)

 #解析获取漫画分页链接

 def manhua_url(url):
     """Walk every page of one chapter and download each page's image.

     The chapter page is fetched with ``requests``. The page count is read
     from the second text node matched by the ``span`` XPath, with its first
     and last characters dropped (presumably a wrapper such as brackets
     around the number -- confirm against the site). Each page address is
     ``<url>/index.html?p=<n>`` for ``n`` from 1 to the page count, and each
     is handed to ``manhua``.

     Args:
         url: Address of the chapter, without a trailing slash.

     Raises:
         requests.RequestException: If the chapter request fails, times out
             or returns an HTTP error status.
         ValueError: If the extracted page count is not an integer.
     """
     response = requests.get(url, timeout=30)
     response.raise_for_status()
     response.encoding = response.apparent_encoding
     page = etree.HTML(response.text)

     spans = page.xpath('/html/body/div[2]/div[2]/span/text()')
     if len(spans) < 2:
         print('skip (page count not found):', url)
         return
     page_count = int(spans[1][1:-1])

     for n in range(1, page_count + 1):
         # Format a fixed template rather than the URL itself, so braces in
         # the chapter address cannot be mistaken for format fields.
         fullurl = '{}/index.html?p={}'.format(url, n)
         print(fullurl)
         manhua(fullurl)

 #解析列表页

 def list(lb_url):
     """Crawl every chapter linked from one comic's chapter-list page.

     Each ``li`` under ``div.subBookList ul`` is inspected. The second-to-last
     ``/``-separated segment of its first link's ``href`` is appended to
     ``lb_url`` to form the chapter address, which is passed to
     ``manhua_url``. Items without a usable link are skipped.

     NOTE: the name shadows the builtin ``list``; it is kept because other
     code in this file calls the function by this name.

     Args:
         lb_url: Address of the comic's chapter-list page.

     Raises:
         requests.RequestException: If the request fails, times out or
             returns an HTTP error status.
     """
     response = requests.get(lb_url, timeout=30)
     response.raise_for_status()
     response.encoding = response.apparent_encoding
     soup = BeautifulSoup(response.text,'lxml')

     # URLs always use '/', so join by hand: os.path.join would insert '\\'
     # on Windows whenever lb_url lacks a trailing slash.
     base = lb_url.rstrip('/')
     for item in soup.select('div.subBookList ul li'):
         links = item.select('a')
         href = links[0].get('href') if links else None
         if not href:
             continue
         parts = href.split('/')
         if len(parts) < 2 or not parts[-2]:
             continue
         fullurl = base + '/' + parts[-2]
         print(fullurl)
         manhua_url(fullurl)

 #解析首页

 def shouye(base_url='http://www.omanhua.com/'):
     """Crawl every comic linked from the home page's featured list.

     The home page is fetched and each ``li`` under
     ``ul#cartoon_image_show1`` (the "hottest comics" list, per the original
     author's note) is inspected. The first link's ``href`` is resolved
     against ``base_url`` and the resulting address is passed to ``list``.
     Items without a usable link are skipped.

     Args:
         base_url: Address of the site's home page. Defaults to the site this
             script was written for.

     Raises:
         requests.RequestException: If the request fails, times out or
             returns an HTTP error status.
     """
     # Local import keeps this block self-contained.
     from urllib.parse import urljoin

     response = requests.get(base_url, timeout=30)
     response.raise_for_status()
     response.encoding = response.apparent_encoding
     soup = BeautifulSoup(response.text,'lxml')

     for item in soup.select('ul#cartoon_image_show1 li'):
         links = item.select('a')
         href = links[0].get('href') if links else None
         if not href:
             continue
         # urljoin resolves root-relative, relative and absolute hrefs alike,
         # and unlike os.path.join never emits a backslash on Windows.
         fullurl = urljoin(base_url, href)
         print(fullurl)
         list(fullurl)

 if __name__ == '__main__':
     # Pages are loaded through Selenium-driven Chrome; per the original
     # author's note, the image link cannot be obtained otherwise.
     # The path below points at the local chromedriver executable.
     # NOTE(review): newer Selenium releases take the driver path through a
     # Service object instead of ``executable_path`` -- confirm against the
     # installed version.
     browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
     try:
         shouye()
     finally:
         # Always shut down the browser and driver, even if the crawl fails.
         browser.quit()

刚开始自学爬虫不久，代码可能写得有点繁琐，希望和大家一起学习、一起进步。
巴特西

python爬取哦漫画

最新文章

热门文章