大规模爬取网页（以新浪为例）之 downloader、parser 的封装（涉及编码等细节）

import requests

import cchardet

import traceback

from lxml import etree

def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Fetch ``url`` with an HTTP GET and return ``(status, html, redirected_url)``.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout``.
        headers: Optional request headers. When omitted or empty, a default
            header set carrying an MSIE 9 ``User-Agent`` is sent instead.
        debug: When true, print the traceback and a failure message if the
            download fails.
        binary: When true, return the raw response body as ``bytes``;
            otherwise decode it to ``str`` using the encoding reported by
            ``cchardet`` (falling back to UTF-8 when none is reported).

    Returns:
        A ``(status, html, redirected_url)`` tuple. On success ``status`` is
        the response status code and ``redirected_url`` is ``res.url``. On
        any failure ``status`` is ``0``, ``html`` is empty (``b""`` in binary
        mode, ``""`` otherwise) and ``redirected_url`` is the ``url`` that
        was passed in.
    """
    default_headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)')
    }
    # Caller-supplied headers win; the defaults are used only when none
    # were given.
    request_headers = headers if headers else default_headers

    # Failure defaults, so every return path yields a complete tuple
    # regardless of the ``binary`` flag.
    status = 0
    html = b"" if binary else ""
    redirected_url = url

    try:
        # Headers must be passed by keyword: the second positional parameter
        # of requests.get is ``params`` (the query string), not ``headers``.
        res = requests.get(url, headers=request_headers, timeout=timeout)
        if binary:
            body = res.content
        else:
            # Detection may report no encoding; fall back to UTF-8 then.
            encoding = cchardet.detect(res.content)["encoding"] or "utf-8"
            body = res.content.decode(encoding)
        # Assign only after everything succeeded so a decode error cannot
        # leave a half-populated result.
        status, html, redirected_url = res.status_code, body, res.url
    except Exception:
        # Best-effort download: any failure yields the empty result above.
        # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit
        # working.
        if debug:
            traceback.print_exc()
            msg = "failed download:{}".format(url)
            print(msg)

    return status, html, redirected_url

def parser(html):
    """Print and return the ``(href, title)`` pairs found in ``html``.

    Links are taken from ``<a>`` elements inside ``<ul>`` elements whose
    class contains ``list-a``, under ``div.main > div`` containers whose
    class contains ``clearfix``. Each match is printed as
    ``<running count> (href, title)``.

    Args:
        html: Page source as ``str`` or ``bytes``. Empty or whitespace-only
            input is accepted and produces no links.

    Returns:
        A list of ``(href, title)`` tuples in the order they were printed.
        Anchors lacking an ``href`` attribute or a direct text node are
        skipped.
    """
    links = []

    # Empty content has nothing to parse; return before handing it to lxml.
    if not html or not html.strip():
        return links

    tree = etree.HTML(html)
    if tree is None:
        return links

    containers = tree.xpath(
        ".//div[@class = 'main']/div[contains(@class,'clearfix')]")
    for container in containers:
        for anchor in container.xpath(".//ul[contains(@class,'list-a')]//a"):
            hrefs = anchor.xpath("./@href")
            texts = anchor.xpath("./text()")
            if not hrefs or not texts:
                continue
            # strip() trims the ends; the replace calls drop newline and tab
            # characters left inside the value.
            href = hrefs[0].strip().replace("\n", '').replace('\t', '')
            title = texts[0].strip().replace("\n", '').replace('\t', '')
            links.append((href, title))
            print(len(links), (href, title))

    return links

if __name__ == '__main__':
    # Demo run: download the Sina home page and print the links found on it.
    url = r"https://www.sina.com.cn/"
    status, html, redirected_url = downloader(url)
    if html:
        parser(html)
    else:
        # A failed download comes back with empty content; there is nothing
        # to parse, so report it instead of handing "" to the parser.
        print("download failed, status: {}".format(status))

巴特西

大规模爬取（新浪为例子）网页之downloader、parser的封装（涉及编码等细节）

最新文章

热门文章