Python 3 crawler: crawling Qiushibaike, the "Encyclopedia of Embarrassing Things"

The website crawled this time is Qiushibaike, the "Encyclopedia of Embarrassing Things". The URL is: http://www.qiushibaike.com/hot/page/1

The number '1' after 'page/' is the page number: the second page is '/page/2', and so on...

 

1, Analyze web pages

Webpage picture

 

Then identify the elements to crawl: author name, content, funny number, and number of comments

 

The information for each post is stored in the child divs of '<div id="content-left">'

Location of crawling elements

 

2, Crawling part

Tools:

    Python3

   requests

   xpath

 

1. Get every segment

# Return the list of post <div> nodes of the given page
def getHtmlDivList(self, pageIndex):
    """Fetch hot-list page *pageIndex* and return its post <div> elements.

    Returns a list of lxml element nodes, one per post on the page.
    """
    # Use https: it avoids a redirect and matches the scheme used for the
    # full-text links fetched in getHtmlItems().
    pageUrl = 'https://www.qiushibaike.com/hot/page/' + str(pageIndex)
    html = requests.get(url=pageUrl, headers=self.headers).text
    selector = etree.HTML(html)
    # Each post is a direct child div of <div id="content-left">.
    divList = selector.xpath('//div[@id="content-left"]/div')
    return divList

 

   

Every post sits in its own div; here xpath filters them out and returns a list whose elements are those divs.

  

2. Get the elements in each segment

 1     def getHtmlItems(self, divList):
 2 
 3         items = []
 4 
 5         for div in divList:
 6             item = []
 7             # Issued by
 8             name = div.xpath('.//h2/text()')[0].replace("\n", "")
 9             item.append(name)
10 
11             # content(read the whole passage)
12             contentForAll = div.xpath('.//div[@class="content"]/span[@class="contentForAll"]')
13             if contentForAll:
14                 contentForAllHref = div.xpath('.//a[@class="contentHerf"]/@href')[0]
15                 contentForAllHref = "https://www.qiushibaike.com" + contentForAllHref
16                 contentForAllHrefPage = requests.get(url=contentForAllHref).text
17                 selector2 = etree.HTML(contentForAllHrefPage)
18                 content = selector2.xpath('//div[@class="content"]/text()')
19                 content = "".join(content)
20                 content = content.replace("\n", "")
21             else:
22                 content = div.xpath('.//div[@class="content"]/span/text()')
23                 content = "".join(content)
24                 content = content.replace("\n", "")
25             item.append(content)
26 
27             # Likes
28             love = div.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')
29             love = love[0]
30             item.append(love)
31 
32             # Number of comments
33             num = div.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')
34             num = num[0]
35             item.append(num)
36 
37             items.append(item)
38 
39         return items

 

 

 

Note that xpath() always returns a list; after filtering you need [0] to extract the string element.

In the above code, some segments in the crawled content are as follows:

          

The content may contain <br> tags, so the text crawled out with xpath comes back as a list of fragments (one fragment per text node).

For example, the first fragment might be "Once I went back to my hometown to see grandma, and my aunt talked about my cousin's childhood ~" — so the list must be joined into a single string.

The rest is about the use of xpath syntax

 

3. Save into text

 1 # Save in text
 2     def saveItem(self, items):
 3         f = open('F:\\Pythontest1\\qiushi.txt', "a", encoding='UTF-8')
 4 
 5         for item in items:
 6             name = item[0]
 7             content = item[1]
 8             love = item[2]
 9             num = item[3]
10 
11             # Write text
12             f.write("Issued by:" + name + '\n')
13             f.write("Content:" + content + '\n')
14             f.write("Likes:" + love + '\t')
15             f.write("Number of comments:" + num)
16             f.write('\n\n')
17 
18         f.close()

  

4. All codes

  1 import os
  2 import re
  3 import requests
  4 from lxml import etree
  5 
  6 
  7 # Encyclopedia of embarrassing things
  8 class QSBK:
  9     # Initialization method, defining variables
 10     def __init__(self):
 11         self.pageIndex = 1
 12         self.headers = {
 13             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
 14         }
 15         self.enable = False
 16 
 17     # Back to the div_list
 18     def getHtmlDivList(self, pageIndex):
 19         pageUrl = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
 20         html = requests.get(url=pageUrl, headers=self.headers).text
 21         selector = etree.HTML(html)
 22         divList = selector.xpath('//div[@id="content-left"]/div')
 23         return divList
 24 
 25     # Get the elements in the text to be intercepted
 26     def getHtmlItems(self, divList):
 27 
 28         items = []
 29 
 30         for div in divList:
 31             item = []
 32             # Issued by
 33             name = div.xpath('.//h2/text()')[0].replace("\n", "")
 34             item.append(name)
 35 
 36             # content(read the whole passage)
 37             contentForAll = div.xpath('.//div[@class="content"]/span[@class="contentForAll"]')
 38             if contentForAll:
 39                 contentForAllHref = div.xpath('.//a[@class="contentHerf"]/@href')[0]
 40                 contentForAllHref = "https://www.qiushibaike.com" + contentForAllHref
 41                 contentForAllHrefPage = requests.get(url=contentForAllHref).text
 42                 selector2 = etree.HTML(contentForAllHrefPage)
 43                 content = selector2.xpath('//div[@class="content"]/text()')
 44                 content = "".join(content)
 45                 content = content.replace("\n", "")
 46             else:
 47                 content = div.xpath('.//div[@class="content"]/span/text()')
 48                 content = "".join(content)
 49                 content = content.replace("\n", "")
 50             item.append(content)
 51 
 52             # Likes
 53             love = div.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')
 54             love = love[0]
 55             item.append(love)
 56 
 57             # Number of comments
 58             num = div.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')
 59             num = num[0]
 60             item.append(num)
 61 
 62             items.append(item)
 63         
 64         return items
 65 
 66     # Save in text
 67     def saveItem(self, items):
 68         f = open('F:\\Pythontest1\\qiushi.txt', "a", encoding='UTF-8')
 69 
 70         for item in items:
 71             name = item[0]
 72             content = item[1]
 73             love = item[2]
 74             num = item[3]
 75 
 76             # Write text
 77             f.write("Issued by:" + name + '\n')
 78             f.write("Content:" + content + '\n')
 79             f.write("Likes:" + love + '\t')
 80             f.write("Number of comments:" + num)
 81             f.write('\n\n')
 82 
 83         f.close()
 84 
 85     # Determine whether the text has been created and add a path
 86     def judgePath(self):
 87         if os.path.exists('F:\\Pythontest1') == False:
 88             os.mkdir('F:\\Pythontest1')
 89         if os.path.exists("F:\\Pythontest1\\qiushi.txt") == True:
 90             os.remove("F:\\Pythontest1\\qiushi.txt")
 91 
 92     def start(self):
 93         self.judgePath()
 94         print("Reading the Encyclopedia of embarrassing things,Press enter to continue saving the next page, Q Sign out")
 95         self.enable = True
 96         while self.enable:
 97             divList = self.getHtmlDivList(self.pageIndex)
 98             data = self.getHtmlItems(divList)
 99             self.saveItem(data)
100             print('Saved the%d Content of page' % self.pageIndex)
101             pan = input('Do you want to continue saving')
102             if pan != 'Q':
103                 self.pageIndex += 1
104                 self.enable = True
105             else:
106                 print('End of program running!!')
107                 self.enable = False
108 
109 
# Only run the crawler when executed as a script, not when imported.
if __name__ == "__main__":
    spider = QSBK()
    spider.start()

Tags: Python encoding REST Windows

Posted on Mon, 04 May 2020 08:33:14 -0400 by smarthouseguy