用 Selenium 自动爬取某本小说的章节及其内容，并存入数据库中

 from selenium import webdriver

 import pymysql

 from selenium.webdriver.support.ui import WebDriverWait     # 等待

 from selenium.webdriver.support import expected_conditions as ec # 等待条件

 from selenium.webdriver.common.by import By

 import html

 import _thread

 from selenium.webdriver.chrome.options import Options

 def ceil(x, y):
     """Return the ceiling of ``x / y`` as an ``int``.

     The result is computed with floor division on the negated numerator
     (``-(-x // y)``) instead of float division, so it stays exact for
     arbitrarily large integers and rounds toward positive infinity for
     negative quotients as well.

     Raises:
         ZeroDivisionError: if ``y`` is zero.
     """
     return int(-(-x // y))

 # Step 1: collect every chapter link from the index page and store one row
 # per chapter (name, href, empty content) in the `novel` table.

 # Create a headless Chrome browser. `chrome_options` is also reused by the
 # worker function `line` further down, so it stays a module-level name.
 chrome_options = Options()
 chrome_options.add_argument('--headless')
 # `options=` is the supported keyword; `chrome_options=` is deprecated and
 # rejected by recent Selenium releases.
 dr = webdriver.Chrome(options=chrome_options)

 try:
     # Open the chapter index page
     dr.get('https://doupocangqiong1.com/1/list_piaotian/')

     # Collect all chapter <a> tags
     a = dr.find_elements(By.CSS_SELECTOR, '.dirlist > li > a')

     # Connect to the database (keyword arguments: newer PyMySQL releases
     # no longer accept positional connection parameters)
     db = pymysql.connect(host="localhost", user="root", password="root",
                          database="selenium", charset='utf8')

     # Cursor reused later by the launcher code to count the chapters
     cursor = db.cursor()

     # Placeholders let the driver escape the values, so a quote inside a
     # scraped chapter title can no longer break (or inject into) the SQL.
     sql = "INSERT INTO novel (name,href,content) VALUES (%s,%s,%s)"
     for i in a:
         name = i.text
         href = i.get_attribute('href')
         cursor.execute(sql, (name, href, ''))

     # One commit for the whole chapter list: either all rows land or none
     db.commit()
 finally:
     # quit() ends the whole browser session, not just the window, and runs
     # even if scraping or the database step fails.
     dr.quit()

 def line(lineName, start, count):
     """Worker: download and store the text of one slice of chapters.

     Opens its own headless browser and its own database connection, reads
     ``count`` rows of ``novel`` starting at offset ``start``, loads each
     chapter URL, waits for the transcoding placeholder to disappear, and
     writes the HTML-escaped chapter text back into ``novel.content``.

     Args:
         lineName: label used only in the progress messages.
         start: zero-based row offset of the slice.
         count: maximum number of rows in the slice.

     Raises:
         Whatever Selenium or PyMySQL raise (for example a wait timeout);
         the browser and the connection are released before it propagates.
     """
     # `options=` is the supported keyword; `chrome_options=` is deprecated.
     dr = webdriver.Chrome(options=chrome_options)
     try:
         # Each worker uses a private connection; keyword arguments because
         # newer PyMySQL releases reject positional connection parameters.
         db = pymysql.connect(host="localhost", user="root", password="root",
                              database="selenium", charset='utf8')
         try:
             cursor = db.cursor()

             # ORDER BY makes the LIMIT windows of the different workers
             # disjoint and complete; without it the row order is undefined.
             cursor.execute(
                 "SELECT id,href FROM novel ORDER BY id LIMIT %s, %s",
                 (int(start), int(count)))
             data = cursor.fetchall()

             for i in data:
                 dr.get(i[1])

                 # Wait up to 5 seconds, polling every 0.1 seconds, until the
                 # "transcoding" placeholder text is gone from the content node.
                 WebDriverWait(dr, 5, 0.1).until_not(
                     ec.text_to_be_present_in_element(
                         (By.CSS_SELECTOR, '#chaptercontent'),
                         U'正在转码，请稍后......'))

                 # escape() stores the text HTML-escaped; html.unescape()
                 # restores the original characters when reading it back.
                 content = html.escape(
                     dr.find_element(By.CSS_SELECTOR, '#chaptercontent').text)

                 # Parameterized so backslashes/quotes in the chapter text
                 # cannot corrupt the statement.
                 cursor.execute(
                     "UPDATE novel SET content = %s WHERE id = %s",
                     (content, i[0]))
                 # Commit per chapter so finished chapters survive a later failure
                 db.commit()

                 print(lineName, '完成了', i[0], '的采集')

             cursor.close()
         finally:
             db.close()
     finally:
         # quit() closes every window and ends the browser session
         dr.quit()

     print(lineName, '完成了采集')

 def productLine(func, total, lineCount):
     """Split the chapters into ``lineCount`` slices and start one thread each.

     Args:
         func: worker callable invoked as ``func(name, start, count)``.
         total: row-count query result; the count is read from ``total[0][0]``.
         lineCount: number of worker threads to start.
     """
     chapter_total = total[0][0]
     # Rows per worker, rounded up so the last slice covers the remainder
     every = ceil(chapter_total, lineCount)
     print('every', every)

     for idx in range(lineCount):
         print('-------------', idx)
         worker_name = 'line-' + str(idx)
         offset = idx * every
         # start_new_thread returns the new thread's identifier, which is printed
         thread_id = _thread.start_new_thread(func, (worker_name, offset, every))
         print(thread_id)

 # Step 2: count the stored chapters and fan the work out to 5 worker threads.
 import time

 try:
     sql = 'SELECT COUNT(*) FROM novel'
     cursor.execute(sql)
     total = cursor.fetchall()   # ((count,),)
     print(total)
     productLine(line, total, 5)
 except Exception as exc:
     # Catch only ordinary errors (not KeyboardInterrupt/SystemExit) and show
     # the cause instead of hiding it.
     print("Error: unable to start thread", exc)
 finally:
     # The workers open their own connections; the main one is no longer needed.
     cursor.close()
     db.close()

 # Threads started through _thread cannot be joined and are killed when the
 # main thread exits, so keep the main thread alive. Sleeping instead of
 # spinning on `pass` avoids pinning a CPU core. Stop the script with Ctrl+C.
 while True:
     time.sleep(1)
巴特西

用selenium 自动爬取某一本小说章节及其内容，并存入数据库中

最新文章

热门文章