Crawl blog website: www.apress.com

# -*- coding: utf-8 -*-
"""
Created on Wed May 10 18:01:41 2017
@author: Raghav Bali
"""

"""
This script crawls apress.com's blog page to:
+ extract list of recent blog post titles and their URLs
+ extract content related to each blog post in plain text
using requests and BeautifulSoup packages

``Execute``
    $ python crawl_bs.py
"""

import requests
from time import sleep
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Listing page that enumerates the recent blog posts.
CRAWL_URL = "http://www.apress.com/in/blog/all-blog-posts"
# Base against which the (site-relative) post links are resolved.
POST_URL_PREFIX = "http://www.apress.com"
# Pause between two consecutive post requests, in seconds.
CRAWL_DELAY_SECS = 10
# Upper bound for a single HTTP request so a stalled server cannot hang the crawl.
REQUEST_TIMEOUT_SECS = 30


def get_post_mapping(content):
    """This function extracts blog post title and url from response object

    Every ``<h3>`` element that wraps an anchor contributes one entry.
    Headings without an anchor are skipped, since they carry neither a
    title link nor a url.

    Args:
        content (request.content): String content returned from requests.get

    Returns:
        list: a list of dictionaries with keys title and url; ``url`` is
            None when the anchor has no ``href`` attribute
    """
    post_soup = BeautifulSoup(content, "lxml")

    post_detail_list = []
    for h3 in post_soup.find_all("h3"):
        anchor = h3.a
        if anchor is None:
            # Plain heading (no link inside): nothing to crawl for it.
            continue
        post_detail_list.append(
            {'title': anchor.get_text(), 'url': anchor.attrs.get('href')}
        )

    return post_detail_list


def get_post_content(content):
    """This function extracts blog post content from response object

    Only the first ``<div class="cms-richtext">`` element is considered.

    Args:
        content (request.content): String content returned from requests.get

    Returns:
        str: blog's content in plain text, or an empty string when the page
            has no ``cms-richtext`` container
    """
    text_soup = BeautifulSoup(content, "lxml")
    container = text_soup.find("div", {'class': 'cms-richtext'})
    if container is None:
        return ""
    # get_text() on the container concatenates the text of everything
    # nested inside it, so no child-by-child accumulation is needed.
    return container.get_text()


def _fetch(url):
    """Download ``url`` and return the response body, or None on failure.

    A failure is either a requests-level error (connection problem,
    timeout, ...) or any HTTP status other than 200. The reason is printed
    so that a skipped page is visible in the crawl log.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECS)
    except requests.RequestException as err:
        print("Request failed for {}: {}".format(url, err))
        return None

    if response.status_code != 200:
        print("Unexpected status {} for {}".format(response.status_code, url))
        return None

    return response.content


def main():
    """Crawl the listing page, then every post it links to, and print them."""
    print("Crawling Apress.com for recent blog posts...\n\n")

    listing = _fetch(CRAWL_URL)
    blog_post_details = get_post_mapping(listing) if listing is not None else []
    if not blog_post_details:
        return

    print("Blog posts found:{}".format(len(blog_post_details)))

    last_index = len(blog_post_details) - 1
    for index, post in enumerate(blog_post_details):
        print("Crawling content for post titled:", post.get('title'))

        href = post.get('url')
        if href:
            # urljoin handles root-relative, relative and absolute hrefs alike.
            post_content = _fetch(urljoin(POST_URL_PREFIX, href))
            if post_content is not None:
                post['content'] = get_post_content(post_content)

        # Be polite to the server, but do not wait after the final post.
        if index < last_index:
            print("Waiting for {} secs before crawling next post...\n\n".format(
                CRAWL_DELAY_SECS))
            sleep(CRAWL_DELAY_SECS)

    print("Content crawled for all posts")

    # print/write content to file
    for post in blog_post_details:
        print(post)


if __name__ == '__main__':
    main()

最新文章

  1. 《Qt Quick 4小时入门》学习笔记4
  2. MySQL学习笔记六:基本DML操作
  3. MapReduce多重MR如何实现
  4. SUSE Linux Enterprise Server 设置防火墙开启ssh远程端口
  5. Class
  6. hibernate分页实现
  7. CentOS 6.4编译安装淘宝web服务器Tengine
  8. Django URL 命名空间
  9. DDD领域驱动设计仓储Repository
  10. C#伪彩色处理
  11. C++ stack
  12. Push to origin/master was rejected (Git提交错误)
  13. CTF---安全杂项入门第三题 这是捕获的黑客攻击数据包,Administrator用户的密码在此次攻击中泄露了,你能找到吗?
  14. vs不支持通过afxgetmainwnd()获取窗口句柄(转)
  15. Django学习笔记(3)--模板
  16. HTML目录:
  17. P1316 丢瓶盖--(二分答案)
  18. redisi应用--布隆过滤器
  19. flask 渲染jinja2模版和传参
  20. bzoj5048: 塌陷的牧场

热门文章

  1. js图片压缩+ajax上传
  2. 递归求兔子数列第n项的值
  3. ModbusTCP协议
  4. 用这个模型去理解CPU?
  5. ElasticSearch 中文分词搜索环境搭建
  6. java中创建线程的3种方法
  7. html5调用手机震动
  8. JAVA项目从运维部署到项目开发(四. Tomcat)
  9. Mysql 游标初识
  10. Prometheus学习笔记(5)Grafana可视化展示