第一篇 - bsp抓取python中文开发者社区中的所有高级教程

工具：python3.6 pycharm

库：bs4 + urllib

第一步：读取html源码

import os
import urllib.request#导入urllib库

from bs4 import BeautifulSoup

# Listing page of the tutorial category that will be scraped.
# The host name is written without spaces so that urlopen() can resolve it.
url = 'https://www.pythontab.com/html/pythonhexinbiancheng/index.html'

# Download the raw page bytes. The context manager closes the HTTP response
# even if read() raises.
with urllib.request.urlopen(url) as request:
    html = request.read()

第二步：获取标题和链接

# Parse the downloaded listing page.
soup = BeautifulSoup(html, 'html.parser')

# Each entry is an <a> directly inside an <li> directly under #catlist.
title_links = soup.select('#catlist > li > a')

# One dict per entry, in page order: its link text and its href attribute.
source_list = [
    {'title': anchor.get_text(), 'link': anchor.get('href')}
    for anchor in title_links
]

第三步：在当前目录下新建一个名为lesson的文件夹，将文件存储在此文件夹下

# Titles are used as file names, so characters that act as path separators or
# are invalid in file names are swapped for harmless stand-ins.
unsafe_chars = str.maketrans({'/': '_', '*': '@', '"': 'o', '?': 'w', ':': 'm'})

# open() does not create missing directories, so make sure the target exists.
os.makedirs('lesson', exist_ok=True)

for dic in source_list:  # each dict holds one article's 'title' and 'link'
    # Fetch the article page; the context manager closes the HTTP response.
    with urllib.request.urlopen(dic["link"]) as request:
        html = request.read()

    soup = BeautifulSoup(html, 'html.parser')

    # The <p> elements directly under '#Article > div.content'.
    text_p = soup.select('#Article > div.content > p')

    # Text of every paragraph, encoded for the binary write below.
    text = [a.get_text().encode('utf-8') for a in text_p]

    name = dic["title"].translate(unsafe_chars)

    # Paragraphs are written back to back, one file per article.
    with open('lesson/%s.txt' % name, 'wb') as f:
        f.writelines(text)

数据爬取完毕。

注：以上完成了一个页面的抓取，若想抓取多个页面，可用以下代码：

from bs4 import BeautifulSoup

import urllib.request#导入urllib库

import os  # used below to create the output folder

# The first listing page, followed by the numbered pages 2.html .. 19.html.
# Host names are written without spaces so that urlopen() can resolve them.
url_list = ['https://www.pythontab.com/html/pythonhexinbiancheng/index.html']
url_list.extend(
    'https://www.pythontab.com/html/pythonhexinbiancheng/%s.html' % i
    for i in range(2, 20)
)

# Titles are used as file names, so characters that act as path separators or
# are invalid in file names are swapped for harmless stand-ins.
unsafe_chars = str.maketrans({'/': '_', '*': '@', '"': 'o', '?': 'w', ':': 'm'})

# open() does not create missing directories, so make sure the target exists.
os.makedirs('lesson', exist_ok=True)

for url in url_list:
    # Download one listing page; the context manager closes the HTTP response.
    with urllib.request.urlopen(url) as request:
        html = request.read()

    soup = BeautifulSoup(html, 'html.parser')

    # Each entry is an <a> directly inside an <li> directly under #catlist.
    title_links = soup.select('#catlist > li > a')

    # One dict per entry: its link text and its href attribute.
    source_list = [
        {'title': title_link.get_text(), 'link': title_link.get('href')}
        for title_link in title_links
    ]

    for dic in source_list:
        # Fetch the article page itself.
        with urllib.request.urlopen(dic["link"]) as request:
            html = request.read()

        soup = BeautifulSoup(html, 'html.parser')

        # The <p> elements directly under '#Article > div.content'.
        text_p = soup.select('#Article > div.content > p')

        # Text of every paragraph, encoded for the binary write below.
        text = [a.get_text().encode('utf-8') for a in text_p]

        file_name = ('%s.txt' % dic["title"]).translate(unsafe_chars)

        # Paragraphs are written back to back, one file per article.
        with open('lesson/' + file_name, 'wb') as f:
            f.writelines(text)

巴特西

第一篇 - bsp抓取python中文开发者社区中的所有高级教程

最新文章

热门文章