Python crawler: using Ajax requests to crawl the content of Jack Ma (Ma Yun)'s Weibo

ajax crawling

Sometimes when we use Requests to fetch a page, the result differs from what we see in the browser: the browser shows the full page data, but Requests does not return it. The reason is that Requests retrieves only the original HTML document, while the page in the browser is the result produced after JavaScript has processed additional data. That data can come from several sources: it may be loaded via Ajax, embedded in the HTML document, or computed by JavaScript with some specific algorithm.

The project code is as follows

import requests
from fake_useragent import UserAgent
from pyquery import PyQuery
from urllib.parse import urlencode
from requests.packages import urllib3
from pymongo import MongoClient

# Silence the InsecureRequestWarning that requests would otherwise emit,
# because get_page() calls requests.get(..., verify=False) below.
urllib3.disable_warnings()

# Mobile-Weibo container API endpoint; per-page query parameters are appended.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Connect to the local MongoDB server.
# NOTE(review): 27001 is not MongoDB's default port (27017) — confirm this is intended.
client = MongoClient('localhost',27001)
# Database for crawled pages (created lazily by MongoDB on first write).
pages = client['pages']
# Collection holding the scraped weibo records.
ma_yun = pages['ma_yun']
20 # Save to mongoDB in
21 def save_to_mongo(result):
22     if ma_yun.insert_one(result):
23         print('saved to Mongo','Acquired{number}Bar data'.format(number=ma_yun.count()))
24 
# generate UA
def create_user_agent():
    """Return a randomized Chrome User-Agent string.

    use_cache_server=False forces fake_useragent to use its local data
    rather than contacting the remote cache server.
    """
    return UserAgent(use_cache_server=False).chrome
30 
# generate headers
def create_headers():
    """Build the HTTP request headers, carrying a randomized Chrome UA."""
    return {'User-Agent': create_user_agent()}
37 
# Get pages
def get_page(page):
    """Fetch one page of the Weibo container API and return it as JSON.

    :param page: 1-based page number of the user's weibo timeline.
    :returns: the decoded JSON dict on HTTP 200, otherwise None.
    """
    # Fixed query parameters identifying the target user/container,
    # plus the variable page number.
    params = {
        'sudaref':'germey.gitbooks.io',
        'display':'0',
        'retcode':'6102',
        'type':'uid',
        'value':'2145291155',
        'containerid':'1076032145291155',
        'page':page
    }
    url = base_url + urlencode(params)
    try:
        # BUG FIX: the headers dict was passed as the second POSITIONAL
        # argument of requests.get, which is `params` — so the User-Agent
        # header was never sent and the dict leaked into the query string.
        # It must be passed as the `headers` keyword argument.
        response = requests.get(url, headers=create_headers(), verify=False)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error',e.args)
    # Explicitly signal failure (non-200 status or connection error).
    return None
57 
# Parsing page
def parse_page(json):
    """Yield one cleaned weibo dict per post found in an API response page.

    :param json: decoded JSON dict from get_page(), or None on failure.
    :yields: dicts with keys id, text, attitudes_count, comments_count,
             datetime, reposts_count.
    """
    if not json:
        return
    # ROBUSTNESS FIX: the original chained json.get('data').get('cards')
    # raised AttributeError whenever 'data' was missing or None
    # (e.g. on an API error page).
    data = json.get('data') or {}
    cards = data.get('cards')
    if not cards:
        return
    for card in cards:
        mblog = card.get('mblog')
        # ROBUSTNESS FIX: some cards carry no 'mblog' entry; the original
        # then crashed on None.get(...). Skip such cards instead.
        if mblog is None:
            continue
        weibo = {}
        weibo['id'] = mblog.get('id')
        # Strip the HTML tags from the post body.
        weibo['text'] = PyQuery(mblog.get('text')).text()
        # Number of likes
        weibo['attitudes_count'] = mblog.get('attitudes_count')
        # Number of comments
        weibo['comments_count'] = mblog.get('comments_count')
        # Publication time
        weibo['datetime'] = mblog.get('created_at')
        # Number of reposts
        weibo['reposts_count'] = mblog.get('reposts_count')

        yield weibo
79 
# Set the main method to call other methods
def main():
    """Crawl timeline pages 1-29, parse each, and persist every weibo."""
    for page_number in range(1,30):
        page_json = get_page(page_number)
        for weibo in parse_page(page_json):
            save_to_mongo(weibo)

if __name__ == '__main__':
    main()

Running the project produces the crawled weibo records in the local MongoDB collection.

Tags: Python JSON Javascript MongoDB Database

Posted on Sat, 22 Feb 2020 12:37:06 -0500 by fifin04