python- www.thisamericanlife.org转pdf

环境安装

pip install  requests

pip install  beautifulsoup4

pip install  pdfkit

$ sudo apt-get install wkhtmltopdf  # ubuntu

$ sudo yum install wkhtmltopdf      # centos

脚本

#!/usr/bin/env python3.5

# -*- coding: utf-8 -*-

# @Time    : 2019/11/18 下午10:48

# @Author  : yon

# @Email   : xxx@qq.com

# @File    : day1.py.py

import os

import re

import time

import logging

import pdfkit

from bs4 import BeautifulSoup

import requests

# Script: download one This American Life transcript page, keep only its
# <article> element, and render that HTML fragment to a PDF with pdfkit.

# HTTP headers sent with the transcript request. Only User-Agent and Referer
# are active; the other candidate headers are left disabled.
headers = {
    # 'Accept': 'application/json, text/javascript, */*; q=0.01',
    # 'Accept': '*/*',
    # 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
    # 'Cache-Control': 'no-cache',
    # 'accept-encoding': 'gzip, deflate, br',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Referer': 'https://www.google.com/',
}

# wkhtmltopdf options handed to pdfkit for the conversion below.
options = {
    'page-size': 'Letter',
    'encoding': "UTF-8",
    'custom-header': [
        ('Accept-Encoding', 'gzip'),
    ],
}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being converted.
resp = requests.get(
    'https://www.thisamericanlife.org/687/transcript',
    headers=headers,
    timeout=30,
)
resp.raise_for_status()

soup = BeautifulSoup(resp.content, "html.parser")
body = soup.find("article")
if body is None:
    # find() returns None when nothing matches; str(None) would otherwise be
    # rendered as a PDF containing just the text "None".
    raise RuntimeError("no <article> element found in the transcript page")

all1 = str(body)

# Pass the options dict explicitly; without it the settings above are unused.
pdfkit.from_string(all1, "/home/yon/Desktop/tt.pdf", options=options)

另外一种写法

import os

import re

import time

import logging

import requests

import urllib.request

import os

import stat

import pdfkit

from bs4 import BeautifulSoup

# headers = {

#     # 'Accept': 'application/json, text/javascript, */*; q=0.01',

#     'Accept': '*/*',

#     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',

#     'Cache-Control': 'no-cache',

#     'accept-encoding': 'gzip, deflate, br',

#     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',

#     'Referer': 'https://www.google.com/'

# }

#

#

# resp = requests.get('https://www.thisamericanlife.org/687/transcript', headers=headers)

#

# html = resp.content

# with open("thisaericanlife.html", 'wb') as f:

#     f.write(html)

# Script: read a previously saved transcript page from disk, concatenate the
# children of its <article> element into one HTML string, and render it to PDF.

# Open in binary mode so BeautifulSoup receives raw bytes and works out the
# document encoding itself instead of depending on the platform's default
# text encoding. The `with` block guarantees the file handle is closed.
with open("thisaericanlife.html", "rb") as page:
    soup = BeautifulSoup(page, "html.parser")

article = soup.article
if article is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError("no <article> element found in thisaericanlife.html")

# Debug output: the raw child nodes of the article.
print(article.contents)

print("类型")

# Serialize every child node and join once instead of growing a string with +=.
html = "".join(str(node) for node in article.contents)

print(html)

pdfkit.from_string(html, "/home/baixiaoxu/desk/tt.pdf")

巴特西

python- www.thisamericanlife.org转pdf

环境安装

脚本

另外一种写法

最新文章

热门文章