Crawling Lagou with the requests library: sharing cookies with session() and fixing the list index out of range error

I ran into quite a few problems before everything finally worked.
At first I didn't realize that Lagou's cookies keep changing, so the crawl kept failing. After some searching I learned that a requests session can share cookies between requests, and with a session the crawl started returning normal data.
About half of the detail pages then raised list index out of range, and the failing pages were different on every run. My guess is that Lagou's backend data changes and sometimes returns an empty list. Guarding the indexing with an if/else lets the crawl continue, though a few records are still lost.
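
To make the cookie-sharing idea concrete, here is a minimal sketch using the same listing page and Ajax endpoint as the full script below. The point is simply that a requests session stores the cookies set by the first GET and sends them automatically with the later POST:

import requests

s = requests.session()
listing_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': listing_url
}

# The first GET sets Lagou's cookies on the session
s.get(listing_url, headers=headers)
# The POST to the Ajax endpoint reuses those cookies automatically
resp = s.post(ajax_url, headers=headers, data={'first': 'false', 'pn': 1, 'kd': 'python'})
print(resp.status_code)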

import requests
from lxml import etree
import re

# Share cookies across requests with a single session
s = requests.session()

headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

def main():
    # Visit the start page so the session picks up the cookies it needs
    ref_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    data_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    data = {
        'first': 'false',
        'pn': 1,
        'kd': 'python',
        'sid': '15fc1c7c57184b6ab5137cb384bfa498'
    }
    for x in range(1, 11):
        data['pn'] = x
        # Hit the start page on each iteration so the session holds fresh cookies
        res = s.get(ref_url, headers=headers_1)

        headers_2 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Referer': ref_url
        }
        get_data(data_url, headers_2, data)
        # Crawl only the first page for now; remove the break to crawl all ten pages
        break
        
# Get the URL of each job detail page from the Ajax response
def get_data(url, headers, data):
    resp = s.post(url, headers=headers, data=data)
    html = resp.json()
    results = html['content']['positionResult']['result']
    for result in results:
        position_id = result['positionId']
        data_detail_url = 'https://www.lagou.com/jobs/{}.html?show=ab8107555dd04359ad1490655d0066e3'.format(position_id)
        detail_data(data_detail_url)


def detail_data(url):
    res = s.get(url, headers=headers_1)
    text = res.text
    html = etree.HTML(text)
    a = html.xpath("//h2[@class='name']/text()")
    b = html.xpath("//dd[@class='job_request']//span/text()")
    # Without these guards the script raises list index out of range,
    # presumably because the page sometimes comes back with empty lists
    if a:
        name = a[0]
        print(name)
    if b:
        salary = b[0]
        city = re_sub(b[1])
        education = re_sub(b[3])
        print(salary, city, education)
    # Job description paragraphs
    c = html.xpath("//div[@class='job-detail']/p/text()")
    for d in c:
        print(d)



def re_sub(data):
    # Strip whitespace and slashes from the scraped span text
    data = re.sub(r'[\s/]', '', data)
    return data

if __name__ == '__main__':
    main()
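
Because the empty lists show up on different pages each run, an alternative to silently skipping a record would be to retry the detail page a few times before giving up. The following is only a sketch of that idea and not part of the original script; it reuses s, headers_1, and etree from above, and the retry count and pause are arbitrary choices:

import time

def fetch_detail_with_retry(url, max_tries=3):
    for _ in range(max_tries):
        res = s.get(url, headers=headers_1)
        html = etree.HTML(res.text)
        # If the job name is present, the page looks complete and can be parsed
        if html.xpath("//h2[@class='name']/text()"):
            return html
        time.sleep(1)  # brief pause before retrying
    return None  # still empty after max_tries attempts; this record is lost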
