Using Node.js to build a simple and easy-to-use crawler

1, Initialize project

npm init

2, Install required npm packages

yarn add superagent cheerio

  • superagent simulates the browser to send requests (logging in, for example) https://www.npmjs.com/package/superagent
  • cheerio parses static HTML https://www.npmjs.com/package/cheerio

3, Modify package.json script

  1. package.json
"scripts": {
    "start": "node index.js"
},
  2. index.js

console.log('123')

  3. Run it and try

npm run start

4, Import the required packages and visit Baidu as a simple test

As shown below, the response from visiting www.baidu.com is available in res, and the full HTML of the page can be read from res.text.

const superagent = require("superagent");
const cheerio = require("cheerio");


superagent.get('http://www.baidu.com/').end((err, res) => {
    if (err) {
        console.log(`Access failed - ${err}`)
    } else {
        console.log(res.text);
    }
});

5, Parse the obtained HTML

Through cheerio.load, we can parse the HTML we get and then operate on its elements.

For example, let's try to get the meta tag content of Baidu website:

const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');


superagent.get('http://www.baidu.com').end((err, res) => {
    if (err) {
        console.log(`Access failed - ${err}`)
    } else {
        const htmlText = res.text;
        const $ = cheerio.load(htmlText);
        $('meta').each((index, ele) => {
            console.log(index);
            console.log($(ele).attr('content'));
        })
    }
});

6, Grab Baidu pictures

Create a new image.handler.js file to handle the image logic.
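
The article doesn't show how index.js and this new file are wired together; a minimal skeleton, assuming an exported function named getBaiduImages (my own placeholder), might look like this:

// image.handler.js - a possible skeleton; getBaiduImages is a hypothetical name
const superagent = require('superagent');
const cheerio = require('cheerio');

function getBaiduImages(word) {
    // the request and parsing logic from the steps below goes here
}

module.exports = { getBaiduImages };

// index.js - require the module and kick off a search
const { getBaiduImages } = require('./image.handler');
getBaiduImages('ha ha');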

  1. Check the URL

Baidu search "ha ha" to see what changes have taken place in the url?

You can see that the following fields are the important ones:

https://image.baidu.com/search/index?tn=baiduimage&word=%B9%FE%B9%FE&ie=gbk

tn=baiduimage
word=encode('ha ha')
ie=gbk, which should refer to the content encoding format
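
For instance, here is a quick sketch of my own (not from the article) of how this search URL can be assembled; the code in the later steps uses ie=utf-8 together with encodeURIComponent:

const word = 'ha ha';
// encodeURIComponent encodes the search word; with ie=utf-8 the encoding matches the page's expectation
const url = `https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`;
console.log(url);
// https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=ha%20ha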

  2. Check DOM structure

What we ultimately want is to download the images, which raises a question: how do we get the URL of each image?

Right-click to view the page source. What we see there is the content we can crawl directly with superagent.

We can see that each image URL lives in a field called objURL, so we can match them later with a regular expression:

/"objURL":"(.*?)",/g
  3. Write code to access Baidu Images
const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = 'ha-ha';

superagent
    .get(`http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`)
    .end((err, res) => {
        if (err) {
            console.log(`Access failed - ${err}`)
        } else {
            const htmlText = res.text;
            const $ = cheerio.load(htmlText);
            console.log(htmlText);
        }
    });

Run it and try

<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <title>Baidu security verification</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0">
    <meta name="format-detection" content="telephone=no, email=no">
    <link rel="shortcut icon" href="https://www.baidu.com/favicon.ico" type="image/x-icon">
    <link rel="icon" sizes="any" mask href="https://www.baidu.com/img/baidu.svg">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
    <link rel="stylesheet" href="https://ppui-static-wap.cdn.bcebos.com/static/touch/css/api/mkdjump_0635445.css" />
</head>
<body>
    <div class="timeout hide">
        <div class="timeout-img"></div>
        <div class="timeout-title">The network suck up. Please try again later.</div>
        <button type="button" class="timeout-button">Return to home page</button>
    </div>
    <div class="timeout-feedback hide">
        <div class="timeout-feedback-icon"></div>
        <p class="timeout-feedback-title">Problem feedback</p>
    </div>

<script src="https://wappass.baidu.com/static/machine/js/api/mkd.js"></script>
<script src="https://ppui-static-wap.cdn.bcebos.com/static/touch/js/mkdjump_fbb9952.js"></script>
</body>
</html>

Why does something seem off? Why are we getting a Baidu security verification page?
My blind guess: this is Baidu's anti-crawling strategy.

What should we do? Simulate browser behavior as closely as possible! Let's try filling in the request headers.

  4. Add request headers

Open the browser's Network panel and copy the values of these request headers:

const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = 'ha-ha';

const headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"'
}


superagent
    .get(`http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`)
    .set('Accept', headers['Accept'])
    .set('Accept-Encoding', headers['Accept-Encoding'])
    .set('Accept-Language', headers['Accept-Language'])
    .set('Cache-Control', headers['Cache-Control'])
    .set('Connection', headers['Connection'])
    .set('User-Agent', headers['User-Agent'])
    .set('sec-ch-ua', headers['sec-ch-ua'])
    .end((err, res) => {
        if (err) {
            console.log(`Access failed - ${err}`)
        } else {
            const htmlText = res.text;
            const $ = cheerio.load(htmlText);
            console.log(htmlText);
        }
    });

As you can see, this time it works. Sweet!

  5. Get the imageUrlList
const htmlText = res.text;
const $ = cheerio.load(htmlText);
const imageMatches = htmlText.match(/"objURL":"(.*?)",/g);
const imageUrlList = imageMatches.map(item => {
    // run the regex for its capture; the legacy RegExp.$1 then holds the URL
    item.match(/:"(.*?)",/g)
    return RegExp.$1;
})

console.log(imageUrlList);
  6. Get the title list of the images
const titleMatches = htmlText.match(/"fromPageTitle":"(.*?)",/g);
const titleList = titleMatches.map(item => {
    // same trick: RegExp.$1 holds the captured title after the match
    item.match(/:"(.*?)",/g)
    return RegExp.$1;
})

console.log(titleList);
  7. Extract a common function

The code for getting the image URLs and the titles is practically identical, isn't it? Let's wrap it up into a single function.

Note that we need to build the regular expression dynamically, because the key is passed in as a parameter.

function getValueListByReg(str, key) {
    // build the regex dynamically from the key, e.g. /"objURL":"(.*?)"/g
    const reg = new RegExp(`"${key}":"(.*?)"`, 'g');
    const matchResult = str.match(reg);
    const resList = matchResult.map(item => {
        item.match(/:"(.*?)"/g) // RegExp.$1 holds the captured value
        return RegExp.$1;
    })
    return resList
}


const htmlText = res.text;
const $ = cheerio.load(htmlText);

const imageUrlList = getValueListByReg(htmlText, 'objURL')
console.log(imageUrlList);

const titleList = getValueListByReg(htmlText, 'fromPageTitle')
console.log(titleList);
  8. Remove the redundant content from the titles

As you can see, the titles we obtained still contain <strong> tags, which we strip out with string replacement:

const titleList = getValueListByReg(htmlText, 'fromPageTitle').map(item => item.replace("<strong>", '').replace("<\\/strong>", ''));
console.log(titleList);

The final, complete code:

const superagent = require("superagent");
const cheerio = require('cheerio');
console.log(1123);
const word = 'Kitty'
//  The encodeURIComponent() function encodes a string as a URI component.
const headers_defalult = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"'
    //Cookies are not required
    // sec is security policy related    
}
//Creating dynamic regular functions
function getValueListByReg(str,key){
    const reg = new RegExp(`"${key}":"(.*?)"`,'g');
    const resMatches = str.match(reg) //    (. *?) regular means to match any character to the next qualified character
    const reslist = resMatches.map(item =>{
        const res = item.match(/:"(.*?)"/g); 
        return RegExp.$1;
    })  
    return reslist
}
superagent.get(`https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=${encodeURIComponent(word)}`)
.set("Accept",headers_defalult['Accept'])
.set("Accept-Encoding",headers_defalult['Accept-Encoding'])
.set("Accept-Language",headers_defalult['Accept-Language'])
.set("Cache-Control",headers_defalult['Cache-Control'])
.set("Connection",headers_defalult['Connection'])
.set("User-Agent",headers_defalult['User-Agent'])
.set("sec-ch-ua",headers_defalult['sec-ch-ua'])
.end((err,res)=>{
if(err){
        console.log(`Access failed- ${err}`);
    }else{
        const htmlText = res.text;
        const imagelist = getValueListByReg(htmlText,'objURL')
        const titlelist = getValueListByReg(htmlText,'fromPageTitle').map(item => item.replace('<\\/strong>','').replace('<strong>',''))
        console.log(imagelist);
        console.log(titlelist);
        
    }
})
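
The article stops at printing the URL and title lists. As a rough sketch of a possible next step (the images folder and the file naming are my own assumptions), the matched URLs could be piped straight to disk, since a superagent request is a readable stream in Node:

const fs = require('fs');
const path = require('path');
const superagent = require('superagent');

// Sketch only: download each matched image URL into ./images (folder name is an assumption)
function downloadImages(imageUrlList) {
    const dir = path.join(__dirname, 'images');
    if (!fs.existsSync(dir)) fs.mkdirSync(dir);

    imageUrlList.forEach((url, index) => {
        const file = fs.createWriteStream(path.join(dir, `${index}.jpg`));
        superagent.get(url).pipe(file); // a superagent request can be piped to a writable stream
    });
}

// e.g. call downloadImages(imageUrlList) inside the .end() callback above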
