python 爬取头条视频
2024-10-01 20:39:44
知识点总结
1. 利用webdriver 模拟浏览器访问
from selenium import webdriver
2.import requests
3. from bs4 import BeautifulSoup
简单小例
import requests
from selenium import webdriver
from urllib.parse import urlencode
from bs4 import BeautifulSoup def get_one_page():
# headers = {
#
# 'Referer': 'https://www.365yg.com/',
# 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
# }
da = {
'min_behot_time': '0',
'category': 'video_new',
'utm_source': 'toutiao',
'widen': '1',
'tadrequire': 'true',
'as': 'A1654C1827C2B37',
'cp': '5C87724B93A76E1',
'_signature': ' o54nnxAd.ygc6NZ537gIfKOeJ4'
}
url = 'https://www.365yg.com/?'+urlencode(da) opetions = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
# cookie={
#
# 'name': 'tt_webid',
# 'value': '6671039337541174792;',
# }
# browser.add_cookie(cookie) #browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
browser.get(url)
data=browser.page_source
dass=BeautifulSoup(data,"lxml")
lists=dass.select('ul[infinite-scroll-distance="80"]')
#所有的A标签的list dict={}
for i in lists:
for j in i.find_all('div',class_="title-box"):
href=j.find_all("a", class_="link")
for v in href: dict.update({v.text:v['href']})
#href.update(dic)
#href.append(v['href'])
#print(dict)
req_url(dict) #a.append(href)
#print(a)
# for i in a:
# print(i)
#for i in li: def req_url(dict):
vido={}
# dict={
# '江苏爆炸救治伤员640人 负责人被抓': '/group/6671032572195111437/',
# '儿媳没工作,却每天大鱼大肉,婆婆疑惑跟踪过去,结局让人感动': '/group/6660699394188247559/',
# '赵文卓演的《中南海保镖》, 感觉和李连杰那版有的一拼, 都没看过': '/group/6671090090191618573/',
# '新年就快到来,是时候换个发型了': '/group/6651463804318122508/',
# '150万买226斤新疆和田玉,老汉害怕推来鉴宝,专家见后脸色大变': '/group/6669675946359915016/',
# '声音的抉择:钱正昊改编《遇见》开口跪!这个00后小伙不简单': '/group/6669760801097646600/',
# '金灿荣:美国曾整趴五个老二国家,而中国有一个他们都没有的优点': '/group/6670396349554360846/'
# }
url="http://www.365yg.com"
headers = {
'Referer': 'https://www.365yg.com/',
'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
}
#tt_video_c94c3
for i in dict: urls=url + dict[i]
resfsdaf=requests.get(urls,headers=headers) if resfsdaf.status_code == 200:
opetions = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
browser.get(urls)
data = browser.page_source dass = BeautifulSoup(data, "lxml")
dafdasfa=dass.select('video[mediatype="video"]')
browser.quit()
for src_i in dafdasfa:
vido.update({i:src_i['src']})
# 解析视频 # print(vido)
url_ursl(vido) # parame={
# '江苏爆炸救治伤员640人 负责人被抓': '//v11-default.ixigua.com/c339099c91d2a0c39b3a9200debe69a7/5c94bd76/video/m/220f7561952529b4afb9cef2b40af5dd0c51161a67c40000b9c386f1dc51/?rc=amZocHZoM2Q3bDMzZTczM0ApQHRAbzQ6Njg8MzQzMzc3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDZqMi9oZGI2Xl8tLWMtMHNzLW8jbyMvMzQuMS4tLi80NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '儿媳没工作,却每天大鱼大肉,婆婆疑惑跟踪过去,结局让人感动': '//v9-default.ixigua.com/7e61514e57d41846ed863d168a0361ce/5c94bd5b/video/m/2207ca36d5a9cf848e1aa4a7d4dd075ea711161aa4050000665b3ffb9a63/?rc=M3JwOTh0eDhyazMzZTczM0ApQHRAbzw0Njw1MzQzMzg3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDFvL2pfNnNra18tLTQtL3NzLW8jbyMzLzEtMS4tLjI0NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '赵文卓演的《中南海保镖》, 感觉和李连杰那版有的一拼, 都没看过': '//v9-default.ixigua.com/b127db8ae33afc9b4b014ffb95d232e3/5c94be87/video/m/220ab9e47974bca495591b5b225f7f8fc5b1161a641b0000793976a0813a/?rc=amk6ZXI6eDQ6bDMzZjczM0ApQHRAbzY6Njw8MzUzMzM3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QHAtaGhpam1mXl8tLS8tL3NzLW8jbyM1My0tMDAtLjI1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '新年就快到来,是时候换个发型了': '//v6-default.ixigua.com/cda31fdec095cdabe314ecabf54cfac8/5c94bd90/video/m/2203341eb294d084664887ba8ae7610a72d11615851000001009ee6f2ad5/?rc=M3VodHg8anFlazMzMzczM0ApQHRAbzM5NjU3MzUzMzQ3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QG9wMjJjYS9wNF8tLTYtL3NzLW8jbyMxMzYtNC0tLjU1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '150万买226斤新疆和田玉,老汉害怕推来鉴宝,专家见后脸色大变': '//v1-default.ixigua.com/f907f4793a7ac1798df4abb05350e1c7/5c94be73/video/m/2206ae1f1a9023d4f2dae60429d94e3363e1161a4c0c0000123747a41c8b/?rc=M3VqeHlqO3VxbDMzNzczM0ApQHRAbzk4NTQ8MzQzMzU3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QC1ecWhiaGkwNF8tLV8tMHNzLW8jbyMxLzYvMjYtLjY1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '声音的抉择:钱正昊改编《遇见》开口跪!这个00后小伙不简单': '//v11-default.ixigua.com/905afd5062cb03ee969d80a7b14d7c78/5c94be61/video/m/220846a029f3c7b44ccbe5d499db86f723111619ad0e0000b41407fdc7ed/?rc=M3J4dHVuOmRwbDMzNDczM0ApQHRAbzw0NjY2MzQzMzc3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QGIvZHNjLWdjNF8tLTMtMHNzLW8jbyMyNDMuLy0tLi41NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '金灿荣:美国曾整趴五个老二国家,而中国有一个他们都没有的优点': '//v6-default.ixigua.com/09c71fd5a249e775b9c74b1dbc8a652e/5c94c322/video/m/2206eb1feaeca2e4bcea36dab3c5b603e071161a502b00007cfe73398e5b/?rc=MzU7dmk1ODQzbDMzNzczM0ApQHRAbzczNTY6MzQzMzM3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDIwa25iY2kxNl8tLV8tMHNzLW8jbyMwMC0tLzEtLjU2NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer'
# } def url_ursl(parame):
for i in parame:
with open(str(i)+".mp4", "wb") as f:
f.write(requests.get("http:"+parame[i]).content) if __name__ == '__main__':
get_one_page() # #req_url()
# #pageOne = get_one_page()
# # print(pageOne)
# url_ursl(parame) 有时间在优化优化
最新文章
- DOM(文档对象模型)
- 常用的一些webshell木马官方后门
- ThinkPHP 模板截取字符串 【转载】
- Magento中调用JS文件的几种方法
- 转:C#写的WEB服务器
- 12306火车票订票网站的一个Bug
- unity tips
- cocos2dx 在windows上实现键盘输入
- VMTools安装
- 那些日常琐事(iPhone上的细小提示,大数据分析)
- 移动端https抓包那些事--进阶篇
- Day062--django--模板,母版和继承
- 操作docker容器
- 一、Dev单元格
- 1-OSI七层模型详解
- 金蝶核算项目余额表卡号余额与天财商龙CRM卡号余额对比
- 训练题(代码未检验)(序列前k大和问题)
- 【Unity】Protobuf的使用与常见问题
- Vue.js 2.0生命周期
- YOLOv3-darknet 内容解析