


  BeautifulSoup的API文档:  https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html



import requests
# Placeholder address from the write-up; substitute the real page URL.
url = "http://www.~~~~~~~~~~~~~~~~~~~~~~~~~~"
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page later.
r.raise_for_status()


from bs4 import BeautifulSoup
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the same page can parse differently per machine.
soup = BeautifulSoup(r.text, "html.parser")

  (3)利用soup找到所有超链接的href,并把href保存到文件中,以便后续使用:

# Write the href of every anchor in `soup` to the file, one per CRLF-terminated line.
with open(r"E:\aa.txt", "wb") as code:
    # href=True skips anchors that have no href attribute, which would
    # otherwise be written out as the literal text "None".
    for link in soup.find_all('a', href=True):
        # The file is opened in binary mode, so encode explicitly; this also
        # avoids the implicit ASCII conversion that fails on non-ASCII links.
        code.write(link['href'].encode('utf-8') + b'\r\n')
print("Download Complete!")


# Read the saved link file into memory, one list entry per line.
# NOTE(review): the body of the original `for line in fd:` loop is missing from
# the source; collecting every line is assumed because `mylist` is iterated
# further down -- confirm against the original script.
# Line terminators are kept, since the later `[-7:-1]` slice relies on them.
with open(r"E:\juchao.txt", "r") as fd:
    mylist = fd.readlines()


# Browser-like HTTP header set for www.cninfo.com.cn.
# NOTE(review): no request visible in this script passes this dict, and the
# Cookie is a captured session id that has presumably expired -- confirm
# before reusing it.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Cookie': 'JSESSIONID=27AF575249A833C368677F9B5869A463',
    'Host': 'www.cninfo.com.cn',
    'Referer': 'http://www.~~~~~~~~~~~~~~~',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
    'Content-Length': '',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
}
urlpath = 'http://www.cninfo.com.cn/information/brief/szmb'
# Build one brief-page URL per saved entry: the last six characters of the
# entry (line terminator stripped) are appended to `urlpath` as the page id.
# The original loop computed each URL but never stored it, leaving `myUrls`
# empty; blank entries are skipped so they do not yield an id-less URL.
myUrls = [
    urlpath + entry.strip()[-6:] + '.html'
    for entry in mylist
    if entry.strip()
]


import json
# Fetch every URL in `myUrls`, pair the text of the ".zx_data" cells (keys)
# with the ".zx_data2" cells (values), and write one UTF-8 JSON object per
# CRLF-terminated line.
with open(r"E:\juchao_json.txt", "wb") as code:
    for page_url in myUrls:
        r1 = requests.get(page_url)
        # Decode using the encoding detected from the body itself.
        r1.encoding = r1.apparent_encoding
        soup = BeautifulSoup(r1.text, "html.parser")
        jsonMapKey = [cell.text for cell in soup.select(".zx_data")]
        # NOTE(review): the original slice was `i.text[:-]` with its bound lost
        # in extraction; trimming surrounding whitespace is assumed to be the
        # intent -- confirm against a real page.
        jsonMapValue = [cell.text.strip() for cell in soup.select(".zx_data2")]
        # zip() stops at the shorter list, so a page with unequal numbers of
        # key and value cells no longer raises IndexError.
        jsonMap = dict(zip(jsonMapKey, jsonMapValue))
        strJson = json.dumps(jsonMap, ensure_ascii=False)
        code.write(strJson.encode('utf-8') + b'\r\n')
print('Done!')


    1)安装: pip install beautifulsoup4

    2 )对象:Beautiful Soup将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象.

      所有对象可以归纳为4种: Tag , NavigableString ,BeautifulSoup , Comment

    3)遍历文档:.tag  .contents  .children  .descendants  .parent   .parents  .next_sibling  .previous_sibling  .next_element

    4)搜索文档:find()  find_all()  find_parents()  find_next_siblings()  select()



