Spider: mobile-to-mobile crawler (Sogou mobile search -> Baidu Map result pages)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

# Delimiters JMTool appends to a place name; everything after the first
# delimiter found is stripped to recover the base name.
tag_jmtool_list = ['(', '(', '-']

# Pool of mobile user-agent strings, one per line in mobile_ua.txt.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint


def extract_name(name_):
    """Return the base place name with any JMTool suffix stripped.

    Splits on each delimiter in tag_jmtool_list and keeps the left part.
    """
    for i in tag_jmtool_list:
        name_ = name_.split(i)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']  # deliberately narrowed to residential estates only

# target_dic[city][district][type_][base_name] ->
#     {'name_reduction_list': [raw names], 'history_list': [raw CSV rows]}
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: the original tested membership on the district level
            # (`not in target_dic[city][district]`), which was always true,
            # so both lists were re-created for every row and history lost.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the browser's current page to <dir_><input_><timestamp>.html.

    The source URL is embedded as a leading HTML comment.  Reads the
    module-global input_ (set by the driver loop below).
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original read the fo.closed attribute instead of calling
    # fo.close(), leaking the file handle; a context manager closes reliably.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


def gen_random_letter():
    """Return one random lowercase ASCII letter (a-z)."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return one random decimal digit (0-9)."""
    # BUG FIX: randint is inclusive on both ends, so randint(0, 10) could
    # yield 10 and make the generated pid longer than 16 characters.
    return random.randint(0, 9)


def gen_sougo_pid():
    """Build a 16-character Sogou pid: letters at positions 1, 3, 4 and 15,
    digits everywhere else."""
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JavaScript alert if one is open; no-op otherwise."""
    try:
        sleep(2)
        # BUG FIX: switch_to.alert is a property, not a method; the original
        # `browser.switch_to.alert()` raised TypeError, so the alert was
        # never actually accepted/dismissed.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception as e:
        # BUG FIX: print the caught exception instance, not the Exception class.
        print(sys._getframe().f_lineno, e, 'no-alert')


# input_ = '深圳市南山区荟芳园'
def mobile_mobile_pages_html(input_):
    """Search Sogou mobile for input_, open the Baidu Map results and save
    every result page to disk via write_res_html.

    NOTE(review): write_res_html reads the module-global input_, which the
    driver loop sets to the same value as this parameter — confirm before
    calling this function in any other way.
    """
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Type the query into the search box via JS, then click the first result.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    close_alert(browser)
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        browser.quit()  # BUG FIX: the original returned without quitting, leaking the browser
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        browser.quit()  # BUG FIX: same leak on this early return
        return
    # The element text looks like "...全部N条..."; N is the result count.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        # Single page of results: save it and stop.
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        write_res_html(browser)
        browser.quit()
        return
    # Walk the remaining result pages: dismiss banner, scroll, click "next".
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
    sleep(2)
    browser.quit()


# Driver: one crawl per (city, district, base name) found in the task CSV.
for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)

  

# Standalone/manual variant of the crawler above: one hard-coded query,
# fixed UA and pid, no retries or alert handling.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)

input_ = '深圳市南山区荟芳园'
# Type the query via JS, then click through to the Baidu Map result.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
# Element text looks like "...全部N条..."; N is the result count.
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the current page to <dir_><input_><timestamp>.html, prefixed
    with its URL in an HTML comment.  Reads the module-global input_."""
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original read fo.closed instead of calling fo.close(),
    # leaking the handle; the context manager closes it deterministically.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
# Page through the remaining results, saving each page.
# NOTE(review): unlike the function-based variant, there is no try/except
# here — any missing element aborts the script; confirm that is acceptable.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)

  

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Minimal smoke test: open the Sogou mobile SERP for "百度地图" in an
# emulated Nexus 5 and click the first (Baidu Map) result for a fixed query.
url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
# Fill the search box via JS (value set directly, no keystrokes).
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()

  

User-agent samples (contents of the UA list file):

Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Global convention, to ease later log analysis:
# os._exit(INT) exit codes used by this script: 4001 4002 4003 4004

os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')

# Bootstrap logging must not depend on the logging module being configured:
# write the startup record to the log file by hand.
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)

try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % (
        'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
        e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)

try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)

# Load the desktop UA pool used for the PhantomJS fallback check.
try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep,
                           'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    """Worker thread that simply calls func(args) in run()."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    """Terminate the script once total runtime exceeds max_script_time.

    exit_type selects the termination mechanism: '' -> exit(), 'sys' ->
    sys.exit(), 'os' -> os._exit(4004) (immediate, skips cleanup).
    """
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % (
            '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
            threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            os._exit(4004)


url_counter = 0  # global progress counter shared by all worker threads


def main():
    """Check every live order URL for our tracking code; on any startup DB
    failure, restart this script indefinitely."""
    try:
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l
        # A live (non-expired) URL has exactly one row in test_order.
        # Earlier variant filtered by create_time within 48h; current rule
        # is simply "not expired and not already recorded in test_error".
        mysql_obj = MysqlHelper()
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            # NOTE(review): logs "nothing to check, exiting" but does not
            # actually return; tn == 0 below means no threads start anyway.
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # Script runs roughly hourly; per-URL policy: if the first request is as
    # expected stop, otherwise retry up to repeat_times with repeat_sleep_times
    # seconds between attempts.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
        'g3user.com', '51g3.com.cn'], 4, 10

    # Candidate for refactoring into a base class (generic WHERE-list select);
    # the f_l parameter exists for the current field requirements.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        """Fetch one row of f_l columns for url from tab; -1 on failure.

        NOTE(review): url is interpolated straight into SQL — injection risk
        if urls are attacker-controlled; MysqlHelper's API for parameterized
        queries should be used if available.
        """
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        """Check whether url serves our tracking code.

        Returns a dict: ok (1 found, 0 missing/error, -1 undetermined),
        status_code (HTTP status or -1), info (diagnostic text).
        First tries a plain requests.get; if the code is not in the raw HTML,
        falls back to rendering with PhantomJS.
        """
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # DB status semantics: 0 = unreachable, 1 = opened but no ad, 2 = handled.
        # BUG FIX: initialize ret['info'] — the original only set it on some
        # paths, and the unconditional read in tf() could raise KeyError and
        # kill the worker thread.
        ret['info'] = ''
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # Currently only a 200 from the target site is treated as "reachable".
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # Code not in the raw HTML — render the page in PhantomJS in
                # case it is injected by JavaScript.
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        elif ret['status_code'] == 403:
            # e.g. www.hsdcw.com/fenlei/41668214.html — 403 is tolerated.
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        """Worker body: check tuple_l[ts:ts+tstep] and record failures."""
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            if chk_id in pass_id_l:
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
            # Rule for Sina iask: never check.
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # Multithreading: the DB layer limits connections, so a fresh
                # MysqlHelper is instantiated per use and deleted afterwards.
                try:
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                # Concurrency across processes/threads; a queue would be a
                # cleaner design — left for later optimization.
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # NOTE(review): status encoding duplicates info already in
                # ret — the schema deserves a redesign.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    # One thread per tstep-sized slice of the work list.
    for i in range(0, tn, tstep):
        thread_instance = MyThread(tf, i, tf.__name__)
        tl.append(thread_instance)
    for t in tl:
        # BUG FIX: the original wrote `t.setDaemon = False`, which replaced
        # the setDaemon method with a bool instead of setting the flag; the
        # daemon attribute is the supported way to mark threads non-daemon.
        t.daemon = False
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()

  

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup

# Collect Baidu mobile "related search" suggestion links for a seed keyword.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for line in uafile:
        if line.find('Mozilla') > -1:
            ua_list.append(line.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index into ua_list


def close_alert(browser, attitude='accept'):
    """Placeholder: alert handling is intentionally disabled in this variant."""
    return


# Kept for parity with the other scripts; the emulated-UA setup that used
# this index is disabled here and a plain desktop Chrome is launched.
ua_list_index = random.randint(0, ua_list_len_)
browser = webdriver.Chrome()
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'
url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)
# Parse the rendered SERP and pull every related-search anchor (.rw-item).
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3

最新文章

  1. Images.xcassets不能获取图片路径
  2. Linux下apache+phppgadmin安装配置
  3. Entity Framework 实体框架的形成之旅--界面操作的几个典型的处理(8)
  4. jquery绑定事件失效的情况(转)
  5. Server Tomcat v7.0 Server at localhost was unable
  6. StringBuilder、StringBuffer和String三者的联系和区别(转)
  7. ESFramework 4.0 进阶(01)-- 消息
  8. iOS NSTimer
  9. MongoDB增删改查操作详解
  10. 用UltraISO制作CentOS U盘安装盘
  11. 三大家族,offset,scroll,client
  12. OpenCV-Python-图像梯度
  13. 14: InfluxDB+Grafana打造大数据监控利器
  14. Testlink1.9.17使用方法(第五章 测试用例管理)
  15. Django中model层详解
  16. Paint the Tree
  17. 12.4 hdfs总结
  18. Java注解的原理
  19. html导出pdf的四种方式
  20. 【Unity】4.3 地形编辑器

热门文章

  1. 联想T470设置U盘启动
  2. sql取随机结果集
  3. Win7 无法将快捷方式从任务栏移除怎么办
  4. iOS技巧
  5. MySQL几个性能指标
  6. Windows重装系统
  7. DbVisualizer出现下列错误:Could not read XML file
  8. Extjs4 Combobox 联动始终出现loading错误的解决的方法
  9. GDBus
  10. kernel BUG