Simulating a login to Douban with a Python crawler

Links to the original text: https://www.jianshu.com/p/c121c0280887

Approach
I. Key points for logging in to Douban

Analyze the real POST address: use the browser's developer tools (F12) to find the form data that is submitted, as shown below.

Actual operation
Goal: simulate logging in to Douban, including handling the verification code (CAPTCHA); reaching the personal home page counts as success.

Data: No data is scraped. The main purpose of this exercise is to learn how to simulate a login and handle verification codes. If you need to scrape data, you can do so by writing the appropriate extraction rules.

The successful login is shown as follows:


The main code of DouBan.py in the spiders folder is as follows:

# -*- coding: utf-8 -*-
import scrapy,urllib,re
from scrapy.http import Request,FormRequest
import ruokuai
'''
//What problems do you not understand? Python Learning Exchange Group: 821460695 to meet your needs, information has been uploaded group files, you can download!
'''
class DoubanSpider(scrapy.Spider):
    """Spider that logs in to Douban, solving the login CAPTCHA when one is shown.

    No data is scraped: ``get_content`` only reports whether the login
    appears to have succeeded.
    """

    name = "DouBan"
    allowed_domains = ["douban.com"]
    #start_urls = ['http://douban.com/']
    # Browser-like User-Agent sent with the login form submission.
    header={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"} #For login simulation

    # Account used to fill in the login form; kept in one place so both the
    # CAPTCHA and the no-CAPTCHA path submit the same credentials.
    form_email = "weisuen007@163.com"
    form_password = "weijc7789"

    # Pulls the recognised text out of the coding platform's XML reply.
    # Compiled once instead of on every parse() call.
    _RESULT_RE = re.compile(r'<Result>(.*?)</Result>')

    def start_requests(self):
        """Return the initial request for the Douban login page."""
        url='https://www.douban.com/accounts/login'
        # The value stored under "cookiejar" is an identifier; using different
        # identifiers lets a spider keep more than one cookie session.
        return [Request(url=url,meta={"cookiejar":1},callback=self.parse)]

    def parse(self, response):
        """Fill in and submit the login form found in *response*.

        If the page shows a CAPTCHA image, its URL is handed to
        ``ruokuai.get_captcha`` and the recognised text is submitted as
        ``captcha-solution``.

        Returns:
            A one-element list with the login ``FormRequest``, or an empty
            list when a CAPTCHA is present but no solution could be read
            from the platform's reply (the original code raised IndexError
            or TypeError in that case).
        """
        captcha=response.xpath('//* [@id="captcha_image"]/@src').extract()# Get a link to the validation code picture
        print(captcha)

        data={
            "form_email": self.form_email,
            "form_password": self.form_password,
            #"redir": "https://www.douban.com/people/151968962/",
        }

        if captcha:
            # A verification code is shown on the login page.
            # Manual alternative: download the image and type the code in.
            #urllib.urlretrieve(captcha[0],filename="C:/Users/pujinxiao/Desktop/learn/douban20170405/douban/douban/spiders/captcha.png")
            #captcha_value=raw_input('Look at captcha.png, enter the validation code:')

            # Solve the code with the Ruokuai coding platform. The code is a
            # run of letters of arbitrary length, so the success rate is low.
            raw_reply=ruokuai.get_captcha(captcha[0])
            # get_captcha may hand back None or an error document; treat any
            # reply without a <Result> element as "no solution".
            matches=self._RESULT_RE.findall(raw_reply or '')
            if not matches:
                print('Could not read a verification code from the reply: %r' % (raw_reply,))
                return []
            captcha_value=matches[0]
            print('The verification code is: %s' % captcha_value)
            data["captcha-solution"]=captcha_value
        else:
            # No verification code is shown on the login page.
            print('No verification code')

        print('In the process of landing......')
        # FormRequest.from_response() builds the login POST from the page's form.
        return [
            FormRequest.from_response(
                response,
                meta={"cookiejar":response.meta["cookiejar"]},
                headers=self.header,
                formdata=data,
                callback=self.get_content,
            )
        ]

    def get_content(self,response):
        """Report whether the login succeeded, judged by the page title.

        A response without any ``<title>`` text is reported as a failed
        login instead of raising IndexError.
        """
        titles=response.xpath('//title/text()').extract()
        # NOTE(review): u'Log bean' looks like a machine-translated form of
        # the login page's original title text; confirm the marker against
        # the real page, otherwise this check can never detect a failure.
        if not titles or u'Log bean' in titles[0]:
            print('Logon failed, please try again!')
        else:
            print('Login successfully')
            # Follow-up crawling can be continued from here.

The ruokuai.py code is as follows:

I use the Ruokuai coding platform in its URL recognition mode: the link address of the verification code picture is passed directly to the platform, which returns the value of the verification code.

# -*- coding: utf-8 -*-
import sys, hashlib, os, random, urllib, urllib2
from datetime import *
'''
//What problems do you not understand? Python Learning Exchange Group: 821460695 to meet your needs, information has been uploaded group files, you can download!
'''
class APIClient(object):
    """Small HTTP client for the coding platform's form-based API."""

    def http_request(self, url, paramDict):
        """POST *paramDict* to *url* as a URL-encoded form and return the reply body.

        Args:
            url: Endpoint to post to.
            paramDict: Mapping of form field names to values.

        Returns:
            The raw response body as returned by ``response.read()``.
        """
        # urlencode percent-escapes keys and values. Plain '%s=%s&' joining
        # corrupted the body whenever a value (for example an image URL with
        # its own query string) contained '&' or '='.
        post_content = urllib.urlencode(paramDict)
        req = urllib2.Request(url, data=post_content)
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        # The request already carries the form data, so it is not passed again.
        response = opener.open(req)
        try:
            return response.read()
        finally:
            response.close()

    def http_upload_image(self, url, paramKeys, paramDict, filebytes):
        """POST an image plus form fields to *url* as multipart/form-data.

        Args:
            url: Endpoint to post to.
            paramKeys: Field names to send, in the order they are written.
            paramDict: Mapping that supplies the value for each name in
                *paramKeys*.
            filebytes: Raw image bytes, sent as the ``image`` part.

        Returns:
            The response text.
        """
        # The multipart boundary is derived from the current timestamp.
        timestr = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        boundary = '------------' + hashlib.md5(timestr).hexdigest().lower()
        delimiter = ('\r\n--%s\r\n'%(boundary)).encode('ascii')

        parts = []
        for key in paramKeys:
            field = "Content-Disposition: form-data; name=\"%s\"\r\n\r\n%s"%(key, paramDict[key])
            parts.append(delimiter)
            parts.append(field.encode('utf8'))
        parts.append(delimiter)

        # File part: header, the image bytes, then the closing boundary.
        header = 'Content-Disposition: form-data; name=\"image\"; filename=\"%s\"\r\nContent-Type: image/gif\r\n\r\n'%('sample')
        parts.append(header.encode('utf8'))
        parts.append(filebytes)
        tailer = '\r\n--%s--\r\n'%(boundary)
        parts.append(tailer.encode('ascii'))
        bs = b''.join(parts)

        import requests
        headers = {'Content-Type':'multipart/form-data; boundary=%s'%boundary,
                   'Connection':'Keep-Alive',
                   'Expect':'100-continue',
                   }
        response = requests.post(url, params='', data=bs, headers=headers)
        return response.text

def arguments_to_dict(args):
    """Convert ``key=value`` command-line arguments into a dict.

    Args:
        args: Sequence shaped like ``sys.argv`` (index 0 is skipped), or
            ``None``.

    Returns:
        A dict mapping each key to its value. Arguments without an ``=``
        are ignored. An empty dict is returned when *args* is ``None`` or
        holds no arguments after index 0.
    """
    argDict = {}
    if args is None:
        return argDict

    count = len(args)
    if count <= 1:
        print('exit:need arguments.')
        return argDict

    # Visit every argument after index 0. The previous ``[1, count-1]``
    # list only looked at the first and the last one.
    for i in range(1, count):
        # Split on the first '=' only, so values that contain '='
        # themselves (such as URLs with a query string) are kept whole.
        pair = args[i].split('=', 1)
        if len(pair) < 2:
            continue
        argDict[pair[0]] = pair[1]

    return argDict

def get_captcha(image_url):
    """Prompt for an API action on the console, run it, and return the reply.

    The action name is read with ``raw_input``. ``url`` submits *image_url*
    for recognition; ``info``, ``register``, ``recharge``, ``report`` and
    ``upload`` prompt for their own inputs; ``help`` lists the actions and
    prompts again; ``exit`` stops.

    Args:
        image_url: Link to the verification code picture, used by the
            ``url`` action.

    Returns:
        The raw reply text of the chosen action, ``''`` for an unrecognised
        action, or ``None`` after ``exit``.
    """
    client = APIClient()

    def account_params():
        # Fields shared by the 'url' and 'upload' actions.
        return {
            'username': '********',
            'password': '********',
            'typeid': '2000',
            'timeout': '90',
            'softid': '76693',
            'softkey': 'ec2b5b2a576840619bc885a47a025ef6',
        }

    while 1:
        paramDict = {}
        result = ''
        act = raw_input('Please enter the typing method. url:')
        if act == 'info':
            paramDict['username'] = raw_input('username:')
            paramDict['password'] = raw_input('password:')
            result = client.http_request('http://api.ruokuai.com/info.xml', paramDict)
        elif act == 'register':
            paramDict['username'] = raw_input('username:')
            paramDict['password'] = raw_input('password:')
            paramDict['email'] = raw_input('email:')
            result = client.http_request('http://api.ruokuai.com/register.xml', paramDict)
        elif act == 'recharge':
            paramDict['username'] = raw_input('username:')
            paramDict['id'] = raw_input('id:')
            paramDict['password'] = raw_input('password:')
            result = client.http_request('http://api.ruokuai.com/recharge.xml', paramDict)
        elif act == 'url':
            paramDict = account_params()
            paramDict['imageurl'] = image_url
            result = client.http_request('http://api.ruokuai.com/create.xml', paramDict)
        elif act == 'report':
            paramDict['username'] = raw_input('username:')
            paramDict['password'] = raw_input('password:')
            paramDict['id'] = raw_input('id:')
            # NOTE(review): this posts to create.xml, the same endpoint as
            # 'url'; confirm whether a dedicated report endpoint is intended.
            result = client.http_request('http://api.ruokuai.com/create.xml', paramDict)
        elif act == 'upload':
            paramDict = account_params()
            paramKeys = ['username',
                 'password',
                 'typeid',
                 'timeout',
                 'softid',
                 'softkey'
                ]

            from PIL import Image
            imagePath = raw_input('Image Path:')
            # Image.open raises rather than returning None, so the failure
            # has to be caught for the retry prompt to be reachable.
            try:
                img = Image.open(imagePath)
            except IOError:
                print('get file error!')
                continue
            img.save("upload.gif", format="gif")
            # Close the converted file as soon as its bytes are read.
            with open("upload.gif", "rb") as gif_file:
                filebytes = gif_file.read()
            result = client.http_upload_image("http://api.ruokuai.com/create.xml", paramKeys, paramDict, filebytes)

        elif act == 'help':
            for command in ('info', 'register', 'recharge', 'url',
                            'report', 'upload', 'help', 'exit'):
                print(command)
            # Listing the actions produces no reply; ask again instead of
            # handing an empty string back to the caller.
            continue
        elif act == 'exit':
            break

        return result

Tags: xml ascii Python Windows

Posted on Mon, 07 Oct 2019 13:43:41 -0400 by balloontrader