Crawling Meituan merchant information with Python

How to climb:

Find the website you want to crawl, for example https://sz.meituan.com/meishi/ (target website of this crawl)
Analyze whether the web page is dynamic or static (the difference between dynamic and static pages is not explained here; readers can look it up if needed)
If the web page is static, you can directly request it back, and then use the corresponding parsing library to parse it to get the data you want; if the web page is dynamic, you can consider using the packet capturing method or Selenium simulation browser to grab the web page (big killer, but the performance will be slow, so be careful)
After obtaining the desired data, the structured storage operation is carried out

Clear crawling website( https://sz.meituan.com/meishi/ )

After opening it, you will see what the page looks like. Then right-click and choose "View source code",

See the desired data "man Niu Meng Shantou beef store". At this time, use the killer regular expression to match it

import requests
import re

# Target listing page, fetched with a desktop-browser User-Agent so the
# site serves the normal HTML instead of rejecting the request.
url = 'https://sz.meituan.com/meishi/'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}
response = requests.get(url, headers=headers)
# Shop names live in the page's inline JSON; grab the "title" value that
# follows each "frontImg" entry.
shopname = re.findall(r'"frontImg".*?title":(.*?),', response.text)

And so on. In the same way, get all the other data you need together. They are avgscore, allcommentnum, address, avgprice, corresponding urlid, which is for the second level crawling. Enter the store to get the phone and opentime

The code of the complete data acquisition part is as follows

import requests
import json
import re
 
def one_level(url, headers):
    """Scrape one restaurant listing page.

    The page embeds its data as inline JSON, so every field is pulled
    out with a regular expression rather than an HTML parser.  (The
    original body called ``etree.HTML`` even though ``etree`` is never
    imported, which raised ``NameError``; the result was unused, so the
    call is simply removed.)

    Returns a tuple ``(meishi_all_list, two_url)``:
    ``meishi_all_list`` holds rows of
    ``[name, avg_score, comment_count, address, avg_price, detail_url]``
    and ``two_url`` is the list of per-shop detail-page URLs.
    """
    response = requests.get(url, headers=headers)
    html = response.text
    avgScore = re.findall(r'"avgScore":(.*?),', html)
    shopname = re.findall(r'"frontImg".*?title":(.*?),', html)
    allCommentNum = re.findall(r'"allCommentNum":(\d+),', html)
    address = re.findall(r'"address":(.*?),', html)
    avgPrice = re.findall(r'"avgPrice":(\d+),', html)
    urlid = re.findall(r'"poiId":(\d+),', html)
    # The poiId alone determines each shop's detail-page URL.
    two_url = ['https://www.meituan.com/meishi/' + poi + '/' for poi in urlid]
    # zip() pairs the fields row by row and stops at the shortest list,
    # so a regex that matched fewer times can no longer raise IndexError
    # (the original indexed every list by shopname's length).
    meishi_all_list = [
        list(row)
        for row in zip(shopname, avgScore, allCommentNum, address,
                       avgPrice, two_url)
    ]
    return meishi_all_list, two_url
 
def two_level(meishi_all_list, two_url, headers):
    """Visit each shop's detail page and append phone and opening hours.

    *two_url* is kept for signature compatibility; the detail URL is
    read from column 5 of each row.  Rows are extended in place with two
    string columns and the result is persisted via ``writer_to_file``.
    """
    for row in meishi_all_list:
        detail_url = row[5]
        response = requests.get(detail_url, headers=headers)
        # response.text is already a str, so no str() wrapper is needed.
        phone = re.findall(r'"phone":(.*?),', response.text)
        openTime = re.findall(r'"openTime":(.*?),', response.text)
        # findall returns a list; store the first match (or '' when the
        # page carries no value) so the CSV cell is a plain string
        # instead of the repr of a Python list.
        row.append(phone[0] if phone else '')
        row.append(openTime[0] if openTime else '')
    writer_to_file(meishi_all_list)
    return meishi_all_list

After obtaining the required data, we need to store it in a structured way. Here, we use the csv.writer.writerows method

def writer_to_file(meishi_all_list):
    """Append every row of *meishi_all_list* to a.csv (UTF-8).

    The file is opened with newline='' as the csv module requires, and
    in append mode so repeated calls keep accumulating rows.
    """
    with open('a.csv', 'a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(meishi_all_list)

The whole process of thinking is just like this. Next, I will attach the complete code to you:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 20 00:50:47 2019
@author: HHX
"""
import csv
import requests
import json
import re
 
def one_level(url, headers):
    """Scrape one restaurant listing page.

    The page embeds its data as inline JSON, so every field is pulled
    out with a regular expression rather than an HTML parser.

    Returns a tuple ``(meishi_all_list, two_url)``:
    ``meishi_all_list`` holds rows of
    ``[name, avg_score, comment_count, address, avg_price, detail_url]``
    and ``two_url`` is the list of per-shop detail-page URLs.
    """
    # The unused ``meishi_dict`` local from the original was removed.
    response = requests.get(url, headers=headers)
    html = response.text
    avgScore = re.findall(r'"avgScore":(.*?),', html)
    shopname = re.findall(r'"frontImg".*?title":(.*?),', html)
    allCommentNum = re.findall(r'"allCommentNum":(\d+),', html)
    address = re.findall(r'"address":(.*?),', html)
    avgPrice = re.findall(r'"avgPrice":(\d+),', html)
    urlid = re.findall(r'"poiId":(\d+),', html)
    # The poiId alone determines each shop's detail-page URL.
    two_url = ['https://www.meituan.com/meishi/' + poi + '/' for poi in urlid]
    # zip() pairs the fields row by row and stops at the shortest list,
    # so a regex that matched fewer times can no longer raise IndexError
    # (the original indexed every list by shopname's length).
    meishi_all_list = [
        list(row)
        for row in zip(shopname, avgScore, allCommentNum, address,
                       avgPrice, two_url)
    ]
    return meishi_all_list, two_url
 
def two_level(meishi_all_list, two_url, headers):
    """Visit each shop's detail page and append phone and opening hours.

    *two_url* is kept for signature compatibility; the detail URL is
    read from column 5 of each row.  Rows are extended in place with two
    string columns and the result is persisted via ``writer_to_file``.
    """
    for row in meishi_all_list:
        detail_url = row[5]
        response = requests.get(detail_url, headers=headers)
        # response.text is already a str, so no str() wrapper is needed.
        phone = re.findall(r'"phone":(.*?),', response.text)
        openTime = re.findall(r'"openTime":(.*?),', response.text)
        # findall returns a list; store the first match (or '' when the
        # page carries no value) so the CSV cell is a plain string
        # instead of the repr of a Python list.
        row.append(phone[0] if phone else '')
        row.append(openTime[0] if openTime else '')
    writer_to_file(meishi_all_list)
    return meishi_all_list
 
def writer_to_file(meishi_all_list):
    """Append every row of *meishi_all_list* to food.csv (UTF-8).

    The file is opened with newline='' as the csv module requires, and
    in append mode so each crawled page adds to the same file.
    """
    with open('food.csv', 'a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(meishi_all_list)
        
def main():
    """Crawl listing pages 1-4, then each shop's detail page."""
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/69.0.3497.100 Safari/537.36'
        ),
    }
    # To be on the safe side, you can add your own cookie to headers.
    for page in range(1, 5):
        page_url = 'https://sz.meituan.com/meishi/pn{}/'.format(page)
        rows, detail_urls = one_level(page_url, headers)
        two_level(rows, detail_urls, headers)

if __name__=='__main__':
    main()

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 20 00:50:47 2019
@author: HHX
"""
import csv
import requests
import json
import re


def one_level(url, headers):
    """Scrape one restaurant listing page.

    The page embeds its data as inline JSON, so every field is pulled
    out with a regular expression rather than an HTML parser.

    Returns a tuple ``(meishi_all_list, two_url)``:
    ``meishi_all_list`` holds rows of
    ``[name, avg_score, comment_count, address, avg_price, detail_url]``
    and ``two_url`` is the list of per-shop detail-page URLs.
    """
    # The unused ``meishi_dict`` local from the original was removed.
    response = requests.get(url, headers=headers)
    html = response.text
    avgScore = re.findall(r'"avgScore":(.*?),', html)
    shopname = re.findall(r'"frontImg".*?title":(.*?),', html)
    allCommentNum = re.findall(r'"allCommentNum":(\d+),', html)
    address = re.findall(r'"address":(.*?),', html)
    avgPrice = re.findall(r'"avgPrice":(\d+),', html)
    urlid = re.findall(r'"poiId":(\d+),', html)
    print("Merchant name:", shopname)
    # The poiId alone determines each shop's detail-page URL.
    two_url = ['https://www.meituan.com/meishi/' + poi + '/' for poi in urlid]
    # zip() pairs the fields row by row and stops at the shortest list,
    # so a regex that matched fewer times can no longer raise IndexError
    # (the original indexed every list by shopname's length).
    meishi_all_list = [
        list(row)
        for row in zip(shopname, avgScore, allCommentNum, address,
                       avgPrice, two_url)
    ]
    return meishi_all_list, two_url


def two_level(meishi_all_list, two_url, headers):
    """Visit each shop's detail page and append phone and opening hours.

    *two_url* is kept for signature compatibility; the detail URL is
    read from column 5 of each row.  Rows are extended in place with two
    string columns and the result is persisted via ``writer_to_file``.
    """
    for row in meishi_all_list:
        detail_url = row[5]
        response = requests.get(detail_url, headers=headers)
        # response.text is already a str, so no str() wrapper is needed.
        phone = re.findall(r'"phone":(.*?),', response.text)
        openTime = re.findall(r'"openTime":(.*?),', response.text)
        # findall returns a list; store the first match (or '' when the
        # page carries no value) so the CSV cell is a plain string
        # instead of the repr of a Python list.
        row.append(phone[0] if phone else '')
        row.append(openTime[0] if openTime else '')
        # Progress output: still prints the raw match list, as before.
        print("Phone:", phone)
    writer_to_file(meishi_all_list)
    return meishi_all_list


def writer_to_file(meishi_all_list):
    """Append every row of *meishi_all_list* to food.csv (UTF-8).

    newline='' is required by the csv module; append mode lets each
    crawled page add rows to the same file.
    """
    with open('food.csv', 'a', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerows(meishi_all_list)


def main():
    """Crawl listing pages and then each shop's detail page.

    Check the city's total page count on Meituan and adjust the range
    below accordingly.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/69.0.3497.100 Safari/537.36'
        ),
    }
    # To be on the safe side, you can add your own cookie to headers.
    for page in range(1, 20):
        page_url = 'https://ts.meituan.com/meishi/pn{}/'.format(page)
        rows, detail_urls = one_level(page_url, headers)
        two_level(rows, detail_urls, headers)


if __name__ == '__main__':
    main()

If food.csv contains no merchant phone numbers after the program runs, you can try the following method.
Tip: log in to the Meituan website first, then run the program. Find a merchant's detail-page address in the food.csv file, for example: https://www.meituan.com/meishi/94057020/
Open that address in a browser to see whether a verification code is required. If so, enter it correctly and run the program again; you should then be able to get the merchants' phone numbers.

Tags: JSON Windows encoding Selenium

Posted on Mon, 22 Jun 2020 23:10:44 -0400 by jbille