The website crawled this time is Qiushibaike, an "encyclopedia of embarrassing stories". The URL is: http://www.qiushibaike.com/hot/page/1
The number '1' after 'page/' is the page number; the second page is '/page/2', and so on...
1, Analyze web pages
Webpage picture
Then identify the elements to crawl: author name, content, number of likes, and number of comments
The information for each joke is stored in a div nested under the div with id="content-left"
Location of crawling elements
2, Crawling part
Tools:
Python3
requests
xpath
1. Get every segment
# Return the list of joke <div> nodes on one listing page.
def getHtmlDivList(self, pageIndex):
    """Fetch listing page `pageIndex` and return its joke <div> nodes.

    :param pageIndex: 1-based page number of the "hot" listing.
    :return: list of lxml element nodes, one per joke on the page.
    """
    # Use https for consistency with the detail-page requests elsewhere
    # in this scraper (the original mixed http here with https there).
    pageUrl = 'https://www.qiushibaike.com/hot/page/' + str(pageIndex)
    html = requests.get(url=pageUrl, headers=self.headers).text
    selector = etree.HTML(html)
    # Each joke lives in a direct child <div> of div#content-left.
    divList = selector.xpath('//div[@id="content-left"]/div')
    return divList
Each joke sits in its own div. Here xpath filters them out and returns a list whose elements are those divs.
2. Get the elements in each segment
def getHtmlItems(self, divList):
    """Extract [author, content, likes, comment-count] from each joke div.

    :param divList: list of joke <div> nodes from getHtmlDivList.
    :return: list of 4-element lists of strings.
    """
    items = []

    for div in divList:
        item = []

        # Author name (first <h2> inside the div).
        name = div.xpath('.//h2/text()')[0].replace("\n", "")
        item.append(name)

        # Content: long jokes are truncated on the listing page and
        # flagged with span.contentForAll, so fetch the detail page
        # to get the full text.
        contentForAll = div.xpath('.//div[@class="content"]/span[@class="contentForAll"]')
        if contentForAll:
            contentForAllHref = div.xpath('.//a[@class="contentHerf"]/@href')[0]
            contentForAllHref = "https://www.qiushibaike.com" + contentForAllHref
            # Send the same headers as the listing request so the site
            # does not reject this request (was missing in the original).
            contentForAllHrefPage = requests.get(url=contentForAllHref, headers=self.headers).text
            selector2 = etree.HTML(contentForAllHrefPage)
            content = selector2.xpath('//div[@class="content"]/text()')
        else:
            content = div.xpath('.//div[@class="content"]/span/text()')
        # xpath returns a list of text fragments (split by <br>);
        # join them and strip newlines to get one flat string.
        content = "".join(content).replace("\n", "")
        item.append(content)

        # Number of likes.
        love = div.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')[0]
        item.append(love)

        # Number of comments.
        num = div.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')[0]
        item.append(num)

        items.append(item)

    return items
Note that xpath returns a list; after filtering you need to index with [0] to get the string itself
In the above code, some segments in the crawled content are as follows:
The content may contain a &lt;br&gt; tag, so after extracting it with xpath the content becomes a list of text fragments (the "div" referred to here is that list).
For example, div[0] is "Once I went back to my hometown to see grandma, and my aunt talked about my cousin's childhood ~", so the list needs to be joined into a single string.
The rest is about the use of xpath syntax
3. Save into text
# Save in text
def saveItem(self, items):
    """Append the extracted jokes to a UTF-8 text file.

    :param items: list of [name, content, likes, comment-count] lists,
                  all elements already strings.
    """
    # 'with' guarantees the file handle is closed even if a write
    # fails (the original leaked the handle on error).
    with open('F:\\Pythontest1\\qiushi.txt', "a", encoding='UTF-8') as f:
        for name, content, love, num in items:
            f.write("Issued by:" + name + '\n')
            f.write("Content:" + content + '\n')
            f.write("Likes:" + love + '\t')
            f.write("Number of comments:" + num)
            f.write('\n\n')
4. All codes
import os
import re
import requests
from lxml import etree


# Scraper for the Qiushibaike "hot" listing (encyclopedia of embarrassing things).
class QSBK:
    # Initialization: page counter, request headers, run flag.
    def __init__(self):
        self.pageIndex = 1
        # Browser User-Agent so the site does not reject the requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }
        self.enable = False

    # Return the div_list
    def getHtmlDivList(self, pageIndex):
        """Fetch listing page `pageIndex` and return its joke <div> nodes."""
        # https for consistency with the detail-page URL used below.
        pageUrl = 'https://www.qiushibaike.com/hot/page/' + str(pageIndex)
        html = requests.get(url=pageUrl, headers=self.headers).text
        selector = etree.HTML(html)
        divList = selector.xpath('//div[@id="content-left"]/div')
        return divList

    # Get the elements to be extracted from each joke
    def getHtmlItems(self, divList):
        """Extract [author, content, likes, comment-count] from each joke div."""
        items = []

        for div in divList:
            item = []

            # Author name.
            name = div.xpath('.//h2/text()')[0].replace("\n", "")
            item.append(name)

            # Content: long jokes are truncated on the listing page and
            # flagged with span.contentForAll -> fetch the detail page.
            contentForAll = div.xpath('.//div[@class="content"]/span[@class="contentForAll"]')
            if contentForAll:
                contentForAllHref = div.xpath('.//a[@class="contentHerf"]/@href')[0]
                contentForAllHref = "https://www.qiushibaike.com" + contentForAllHref
                # Send the same headers as the listing request (was missing).
                contentForAllHrefPage = requests.get(url=contentForAllHref, headers=self.headers).text
                selector2 = etree.HTML(contentForAllHrefPage)
                content = selector2.xpath('//div[@class="content"]/text()')
            else:
                content = div.xpath('.//div[@class="content"]/span/text()')
            # xpath returns text fragments (split by <br>); join + strip '\n'.
            content = "".join(content).replace("\n", "")
            item.append(content)

            # Number of likes.
            love = div.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')[0]
            item.append(love)

            # Number of comments.
            num = div.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')[0]
            item.append(num)

            items.append(item)

        return items

    # Save in text
    def saveItem(self, items):
        """Append the extracted jokes to the output text file."""
        # 'with' closes the file even if a write fails.
        with open('F:\\Pythontest1\\qiushi.txt', "a", encoding='UTF-8') as f:
            for name, content, love, num in items:
                f.write("Issued by:" + name + '\n')
                f.write("Content:" + content + '\n')
                f.write("Likes:" + love + '\t')
                f.write("Number of comments:" + num)
                f.write('\n\n')

    # Make sure the output directory exists and remove any stale output file
    def judgePath(self):
        """Prepare the output location: create the dir, delete old output."""
        if not os.path.exists('F:\\Pythontest1'):
            os.mkdir('F:\\Pythontest1')
        if os.path.exists("F:\\Pythontest1\\qiushi.txt"):
            os.remove("F:\\Pythontest1\\qiushi.txt")

    def start(self):
        """Interactive loop: save one page per iteration until the user enters Q."""
        self.judgePath()
        print("Reading the Encyclopedia of embarrassing things,Press enter to continue saving the next page, Q Sign out")
        self.enable = True
        while self.enable:
            divList = self.getHtmlDivList(self.pageIndex)
            data = self.getHtmlItems(divList)
            self.saveItem(data)
            print('Saved the%d Content of page' % self.pageIndex)
            pan = input('Do you want to continue saving')
            if pan != 'Q':
                self.pageIndex += 1
            else:
                print('End of program running!!')
                self.enable = False


# Guard the entry point so importing this module does not start crawling.
if __name__ == "__main__":
    spider = QSBK()
    spider.start()