Taobao product information crawling (logged in)

Thanks to those who helped with the Taobao login step.
The crawl succeeded; the source code follows:

# Objective: fetch Taobao search-result pages and extract each product's name and price
# Things to understand: 1. Taobao's search interface  2. How pagination works
# Technical route: requests + re
# http://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
# Program structure
# Step 1: submit the product search request and loop over the result pages
# Step 2: for each page, extract the product names and prices
# Step 3: print the collected information to the screen

import requests
import re


def getHtmlText(url, timeout=10):
    """Download *url* with browser-like headers and return the page text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body decoded with the encoding detected from its
        content, or an empty string when the request fails (connection
        error, timeout, or a non-2xx status).
    """
    # NOTE(security): the cookie below is a captured browser session that is
    # hard-coded into the source. It appears to be what lets the search page
    # be fetched as a logged-in user; it will expire and should not be
    # committed or shared -- load it from configuration or the environment.
    headers = {
        'authority': 's.taobao.com',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': 'https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200312&ie=utf8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'miid=253146491399089190; thw=cn; cna=o391FTBhsgUCAdrD5AnR+QjC; hng=CN%7Czh-CN%7CCNY%7C156; tracknick=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; tg=0; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; t=a7362f9a33953472d0663cbb296a53fd; enc=jXmwR%2BzLAdHsQMP0d%2F9pId1Dn%2BW%2FtphblF4%2FsFXBDkrBKBLkGD4tksHXhn5%2BPTVPEfnLekpkIzYNZENT3sCchA%3D%3D; _samesite_flag_=true; cookie2=1bb1c10d29aca83180c0390f0d1fb1fc; _tb_token_=33b66b43e3335; sgcookie=EBnq%2FRUJ6VqRYs7GbgSt0; unb=2269301708; uc3=id2=UUpngTzc2Y13gg%3D%3D&lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dBxd7CT335hOeshsM%3D&nk2=o688bJ2t2lOAtK8MXRkc9rS1xdZODw%3D%3D; csg=bb101fa2; lgc=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; cookie17=UUpngTzc2Y13gg%3D%3D; dnk=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; skt=d1b053176dd80fc6; existShop=MTU4Mzk4NzY5Ng%3D%3D; uc4=id4=0%40U2gtEEjyK2oSMoVT0y3FwsMQmuba&nk4=0%40ofkQgMIYV1w9DtinNnKgQnwEY%2B9mIA3bhXzykl%2Bh56ME; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=%E9%A2%968e; _nk_=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; cookie1=UNcJvhnwxmsnc%2BQXwkd900lS4%2BSdsCS20k5pE5xYJ2Q%3D; alitrackid=login.taobao.com; lastalitrackid=login.taobao.com; tfstk=cFMhBQjc8XPBkECPG6wIVOcoz-9AZZIURSFZQvf4ybSgh8kNiq5Nge_NRPune_1..; mt=ci=20_1; v=0; Hm_lvt_eaa57ca47dacb4ad4f5a257001a3457c=1583980126,1583987707; uc1=cookie16=Vq8l%2BKCLySLZMFWHxqs8fwqnEw%3D%3D&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=VFC%2FuZ9ayeYq2g%3D%3D&existShop=false&pas=0&cookie14=UoTUOafM2gaxgg%3D%3D&tag=8&lng=zh_CN; JSESSIONID=AF10B48836FA6F017CE4F6D93629F9C4; Hm_lpvt_eaa57ca47dacb4ad4f5a257001a3457c=1583989886; isg=BM3NGh5pr8U80wicTAKS00EO3OlHqgF8Ot6V2g9TpGTZBuy41_jeTHTUcJpgnhk0; l=dBEklFpHqPrWkOUQBOfNqASSiU_ONIdb8SFy7q0UFICPO7CHlfwOWZq1YKLMCnGVH6kWR3Rp9jjQBqLO1yCrnxv9-3k_J_DmndC..',
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as a failed crawl; programming
        # errors are no longer hidden by a bare ``except``.
        print("Crawl failure")
        return ""
    # Decode using the encoding guessed from the body rather than the one
    # announced in the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text

def parsePage(ilist, html):
    """Extract product titles and prices from *html* and append them to *ilist*.

    The page text is scanned for ``"raw_title":"..."`` and
    ``"view_price":"..."`` fields. Titles and prices are paired in order of
    appearance; if the two counts differ, the surplus entries are ignored.

    Args:
        ilist: List that receives one ``[title, price]`` entry per product.
            It is modified in place.
        html: Page text to scan. A non-string value (for example ``None``
            from a failed download) is reported as a parsing error and
            leaves *ilist* untouched.

    Returns:
        None.
    """
    if not isinstance(html, str):
        print("Parsing error")
        return

    # The decimal point is escaped and the fractional part is optional, so
    # integer prices of any length (including a single digit) are matched and
    # arbitrary separator characters are rejected.
    prices = re.findall(r'"view_price":"(\d+(?:\.\d*)?)"', html)
    # Non-greedy capture: the title ends at the first closing quote.
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    print(len(prices))

    for title, price_text in zip(titles, prices):
        # Convert explicitly instead of calling eval() on scraped text;
        # prices without a decimal point stay integers, others become floats.
        price = float(price_text) if '.' in price_text else int(price_text)
        ilist.append([title, price])


def printGoodsList(ilist, num):
    """Print the first *num* entries of *ilist* as a numbered table.

    Each entry is indexed as ``entry[0]`` (name column) and ``entry[1]``
    (price column). The table is framed by a line of ``=`` characters above
    the header and below the last row.
    """
    rule = "====================================================================================================="
    row_format = "{0:<3}\t{1:<30}\t{2:>6}"

    print(rule)
    print(row_format.format("Serial number", "Trade name", "Price"))
    for position, entry in enumerate(ilist, start=1):
        # Rows are numbered from 1; stop as soon as the limit is passed.
        if not position <= num:
            break
        print(row_format.format(position, entry[0], entry[1]))
    print(rule)


def main():
    """Crawl the first result pages for a search term and print a price table.

    Builds one search URL per page, downloads it with ``getHtmlText``, feeds
    the text to ``parsePage`` to accumulate ``[title, price]`` entries, and
    finally prints the first ``num`` entries with ``printGoodsList``.
    """
    goods = "A bag"   # search term
    depth = 2         # number of result pages to crawl
    num = 20          # number of rows to print
    page_size = 44    # result offset step between consecutive pages
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for page in range(depth):
        try:
            # The page offset is a separate query parameter named "s"; it
            # must be joined with "&", otherwise it becomes part of the
            # search term and every iteration requests the same results.
            url = start_url + '&s=' + str(page_size * page)
            html = getHtmlText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Keep crawling the remaining pages, but report what went wrong
            # instead of discarding the error silently.
            print("Skipping page", page, "-", exc)
            continue

    printGoodsList(infoList, num)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

The result is:

No. commodity name price
1. Backpack custom printed logo backpack custom pattern company gift custom made 128.0
2. Backpack custom printed logo backpack casual simple light bag custom made 118.0
3. Business backpack men's work business backpack computer bag custom logo 49.0
4. kk tree schoolbag primary school girls 6-12 years old children from grade one, two, three to grade six girls shoulder pack spine protection load reduction 119.0
5. kk tree schoolbag primary school boy, grade 1-3-4-5 child, backpack girl, 6-12-year-old, shoulder bag and ridge protection 119.0
6. Jasper backpack of JanSport flagship store men's and women's computer bag bag leather bottom TYP7 448.0
7. kipling women's bag large capacity canvas bag campus fashion simple leisure schoolbag backpack | classroom s 749.0
8. Mashalanti schoolbag women's Korean fashion 2019 new large capacity backpack fashion versatile women's anti-theft bag 299.0
9. Jasper backpack for men and women 15 inch computer bag schoolbag fashion leather TYP7 448.0
10. Dickies2019 new fashion brand backpack large capacity female and male carrying backpack schoolbag S014 189.0
11. Dickies2020 new fashion brand backpack large capacity female and male carrying backpack schoolbag S018 219.0
12. Schoolbag for primary school students and girls, grade 1-3-4-6 boys, load reduction and spine protection, light and large capacity children's backpack 99.0
13. CAMS suspension load reduction schoolbag for male middle school students, junior middle school students and female primary school students
14. OMI backpack women 2020 new all-around fashion Korean Oxford cloth school style small backpack bag 459.0
15. Kara sheep grade 1-4 integrated open, easy to clean, easy to store, load reduction and spine protection student bag 239.0
16. karayang junior high school students schoolbag female middle school students large capacity backpack Korean Edition primary school students light backpack 109.0
17. sunearth backpack women's Lotte super fire backpack men's College bag alleno runaway bag 158.0
18. North bag, double shoulder bag, female 2019 new backpack, college style, female student, large capacity schoolbag, lovely travel bag tide 75.0
19. Business men's backpack Korean fashion backpack simple computer bag leisure women's travel bag middle school student bag fashion 199
20. Japanese CILOCALA backpack, female transparent jelly bag, middle school students' schoolbag, traveling backpack, cloth bag 330.0

Published 15 original articles, won praise 5, visited 7596
Private letter follow

Tags: IE xml Windows encoding

Posted on Thu, 12 Mar 2020 07:42:05 -0400 by alcoholic1