展开阅读全文 js 爬虫操作

from selenium import webdriver

import time

import random

from bs4 import *

# Local import: `By` names the locator strategy for `find_elements`, which
# replaces the `find_elements_by_link_text` helper removed in Selenium 4.3.
from selenium.webdriver.common.by import By

# Open the poem page in a fresh Chrome session.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Number of "expand full text" links present right after the page loads,
# and how many successful clicks have been made so far.
ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨'))
ck_l_ori_ok = 0

try:
    # At most 100 passes; each pass scrolls 100px further down the page and
    # then tries to click every expand link that is currently found.
    for isc in range(100):
        # `>=` rather than `==`: the counter grows on every successful
        # click, including repeat clicks on a link that is still found on a
        # later pass, so it can step past the initial total. With `==` the
        # loop would then run all 100 one-second passes.
        if ck_l_ori_ok >= ck_l_ori_len:
            break
        time.sleep(1)
        js = 'window.scrollTo(0,100*{})'.format(isc)
        browser.execute_script(js)
        ck_l = browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨')
        for i in ck_l:
            try:
                i.click()
                ck_l_ori_ok += 1
            except Exception as e:
                # Best effort: report the failed click and move on; the link
                # is looked up again on the next pass.
                # NOTE(review): presumably the click fails while the link is
                # not yet scrolled into view - confirm.
                print(e)
except Exception as e:
    print('window.scrollTo-->', e)

# ck_l=browser.find_elements_by_link_text('展开阅读全文 ∨')

# for i in ck_l:

#     try:

#         i.click()

#     except Exception as e:

#         print(e)

# XPath of a single link on the page; not referenced again in the visible code.
xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]

# Dump the current DOM to a file whose name starts with a random number
# in the range 123..999.
file_tag = random.randint(123, 999)
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(file_tag)
with open(myhtml, mode='w', encoding='utf-8') as html_out:
    html_out.write(browser.page_source)

# Prefix of an INSERT statement; not referenced again in the visible code.
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '

# Read the saved file back and parse it with BeautifulSoup.
with open(myhtml, mode='r', encoding='utf-8') as html_in:
    bs = BeautifulSoup(html_in, 'html.parser')
    # NOTE(review): looks like a placeholder statement (e.g. a breakpoint
    # anchor) - confirm before removing.
    dd = 9

// Click every anchor whose text is "展开阅读全文 ∨".
// Array.from takes a snapshot of the live HTMLCollection, so anchors added or
// removed by a click handler cannot shift the elements still to be visited.
// Block-scoped `const` avoids creating implicit globals (an error in strict mode).
for (const link of Array.from(document.getElementsByTagName('a'))) {
    if (link.text === '展开阅读全文 ∨') {
        link.click();
    }
}

// Click every anchor whose text is "展开阅读全文 ∨"; a click that throws is
// logged and the remaining anchors are still processed.
// Array.from takes a snapshot of the live HTMLCollection, so anchors added or
// removed by a click handler cannot shift the elements still to be visited.
// Block-scoped `const` avoids creating implicit globals (an error in strict mode).
for (const link of Array.from(document.getElementsByTagName('a'))) {
    if (link.text === '展开阅读全文 ∨') {
        try {
            link.click();
        } catch (err) {
            console.log(err);
        }
    }
}

from selenium import webdriver

import time

import random

from bs4 import *

# Local import: `By` names the locator strategy for `find_elements`, which
# replaces the `find_elements_by_link_text` helper removed in Selenium 4.3.
from selenium.webdriver.common.by import By

# Open the poem page in a fresh Chrome session.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Primary strategy: one in-page script that clicks every anchor whose text is
# "展开阅读全文 ∨"; the script itself catches and logs per-click errors.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"

try:
    browser.execute_script(js)
except Exception as script_err:
    # Fallback, used only when running the script raised: scroll down in
    # 100px steps and click the links through WebDriver instead.
    print(script_err)
    ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨'))
    ck_l_ori_ok = 0
    try:
        for isc in range(100):
            # `>=` rather than `==`: the counter grows on every successful
            # click, including repeat clicks on a link that is still found
            # on a later pass, so it can step past the initial total. With
            # `==` the loop would then run all 100 one-second passes.
            if ck_l_ori_ok >= ck_l_ori_len:
                break
            time.sleep(1)
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨')
            for i in ck_l:
                try:
                    i.click()
                    ck_l_ori_ok += 1
                except Exception as click_err:
                    # Best effort: report the failed click and move on; the
                    # link is looked up again on the next pass.
                    print(click_err)
    except Exception as scroll_err:
        print('window.scrollTo-->', scroll_err)

from selenium import webdriver

import time

import random

from bs4 import *

from pyquery import PyQuery as pq

# Local import: `By` names the locator strategy for `find_elements`, which
# replaces the `find_elements_by_link_text` helper removed in Selenium 4.3.
from selenium.webdriver.common.by import By

# Open the poem page in a fresh Chrome session.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Primary strategy: one in-page script that clicks every anchor whose text is
# "展开阅读全文 ∨"; the script itself catches and logs per-click errors.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"

try:
    browser.execute_script(js)
except Exception as script_err:
    # Fallback, used only when running the script raised: scroll down in
    # 100px steps and click the links through WebDriver instead.
    print(script_err)
    ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨'))
    ck_l_ori_ok = 0
    try:
        for isc in range(100):
            # `>=` rather than `==`: the counter grows on every successful
            # click, including repeat clicks on a link that is still found
            # on a later pass, so it can step past the initial total. With
            # `==` the loop would then run all 100 one-second passes.
            if ck_l_ori_ok >= ck_l_ori_len:
                break
            time.sleep(1)
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨')
            for i in ck_l:
                try:
                    i.click()
                    ck_l_ori_ok += 1
                except Exception as click_err:
                    # Best effort: report the failed click and move on; the
                    # link is looked up again on the next pass.
                    print(click_err)
    except Exception as scroll_err:
        print('window.scrollTo-->', scroll_err)

# Parse the rendered page (after the expand links were clicked) with PyQuery.
doc = pq(browser.page_source)

# XHTML namespace attribute text and its replacement (the empty string).
# `pq_r_d` holds the same pair as a dict; it is not used in the visible code.
pq_r_d = {'xmlns="http://www.w3.org/1999/xhtml"': ''}
r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''


def _strip_xmlns(markup):
    """Return *markup* with every occurrence of ``r_k`` replaced by ``r_v``."""
    return markup.replace(r_k, r_v)


# Poem body, title and author, taken from the second child of `.left`.
article_ = _strip_xmlns(doc('.left>:nth-child(2).sons>.cont>.contson').html())
title_d = {'h1': _strip_xmlns(doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html())}
author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}

# Translation, explanation and references, taken from the fourth child of `.left`.
translation_ = _strip_xmlns(doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html())
explanation_ = _strip_xmlns(doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html())
refer_ = _strip_xmlns(doc('.left>:nth-child(4)>.cankao').html())

# Author image URL: the text between the last `src="` and the next `"` in the
# markup of the first child of `.divimg`.
author_img_markup = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html()
author_img_url = author_img_markup.rpartition('src="')[2].partition('"')[0]

# NOTE(review): looks like a placeholder statement (e.g. a breakpoint anchor)
# - confirm before removing.
d = 4

巴特西

展开阅读全文 js 爬虫操作

最新文章

热门文章