Crawling static pages, dynamically rendered pages, and JS-request data with the WebMagic crawler

This post covers crawling web data with WebMagic. For extracting content from a crawled page, see the previous blog post: https://segmentfault.com/a/1190000020005655
The official WebMagic documentation is at http://webmagic.io/docs/zh/ — node information can be extracted with different selectors.
Depending on how a page is generated, its content can be crawled in one of the following ways:
1. Static pages [the most common case]: the data can be crawled directly with WebMagic's conventional approach.
2. Dynamically generated pages: the page must first be rendered by a browser driver inside the crawler, and is then crawled.
3. Pages whose data comes from JS requests: construct the HTTP request directly to fetch the data.
The following sections show how to use WebMagic for each of these three crawling methods. The article is long, so jump to the [crawling method] you need.

1. Static page crawling
Example:

import com.boe.mps.jrj.dataas.entity.BigDeposit;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * Page processor that scrapes the large-deposit product list.
 * <p>
 * Each product lives in a container whose id is {@code circularcontainer<N>} (N starting at 0);
 * the number of containers is taken from the elements matching
 * {@code .ebdp-pc4promote-circularcontainer-wrapper}. The collected records are published
 * on the page under the field name {@code bigDeposit} and echoed to standard output.
 */
@Repository
public class BigDepositProcessor implements PageProcessor{

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Returns the shared site settings (3 retries, 100 ms sleep time). */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts one {@link BigDeposit} per product container found on the page.
     *
     * @param page the downloaded page to read from and to store the result list on
     */
    @Override
    public void process(Page page) {
        // One timestamp shared by every record taken from this page.
        String scrapedAt = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        Selectable html = page.getHtml();

        int productCount = html.$(".ebdp-pc4promote-circularcontainer-wrapper").nodes().size();
        List<BigDeposit> deposits = new ArrayList<>();
        for (int idx = 0; idx < productCount; idx++) {
            String container = "//*[@id=circularcontainer" + idx + "]";
            String row = container + "/div[2]/table/tbody/tr";

            // The row has two layouts: with at most 7 cells the price/grading sit in
            // cells 4 and 6, otherwise they are shifted one cell to the right.
            int cellCount = html.xpath(row + "/td").nodes().size();
            int priceCell;
            int gradingCell;
            if (cellCount > 7) {
                priceCell = 5;
                gradingCell = 7;
            } else {
                priceCell = 4;
                gradingCell = 6;
            }

            BigDeposit deposit = new BigDeposit();
            deposit.setItemName(html.xpath(container + "/div[1]/span[1]/span/text()").get());
            deposit.setItemRate(html.xpath(row + "/td[2]/a/text()").get());
            deposit.setUpdateTime(scrapedAt);
            deposit.setStartDepositPrice(html.xpath(row + "/td[" + priceCell + "]/text()").get());
            deposit.setGrading(html.xpath(row + "/td[" + gradingCell + "]/text()").get());
            deposits.add(deposit);
        }

        page.putField("bigDeposit", deposits);
        // Echo the crawled records to the console.
        for (BigDeposit deposit : deposits) {
            System.out.println(deposit);
        }
    }

    /**
     * Manual test entry point: crawls the large-deposit page with five threads and
     * prints the pipeline output to the console.
     */
    public static void main(String[] args) {
        // Page to crawl.
        String startUrl="https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_accountQueryFixedProductsOutOp&cmd=0&NormalOrBooking=0&IN_CURRFLAG=&IN_APPID=02&IN_SAVETYPE=&IN_BIGFLAG=1&JJGFLAG=0&Area_code=1001";
        Spider.create(new BigDepositProcessor())
                .addPipeline(new ConsolePipeline())
                .addUrl(startUrl)
                .thread(5)
                .run();
    }

}

/**
 * Large-deposit product record.
 * <p>
 * All scraped values are kept as the raw strings read from the page; accessors are
 * presumably generated by Lombok's {@code @Data} (the import is not shown here).
 */
@Data
public class BigDeposit {
    /** Record identifier; the crawler code shown alongside this class never sets it. */
    private Long id;
    /** Product name. */
    private String itemName;
    /** Product interest rate (%). */
    private String itemRate;
    /** Initial deposit amount (yuan). */
    private String startDepositPrice;
    /** Transaction differential. */
    private String grading;
    /** Update time, as a formatted timestamp string. */
    private String updateTime;
}

2. Dynamic pages: render with the Chrome driver first, then crawl
WebMagic calls chromedriver to render the page first, and then crawls the data.
Example:

import com.boe.mps.jrj.dataas.entity.Bond;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 *All bond products
 */
@Repository
public class BondProcessor implements PageProcessor{


    String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
    long lastTime = DateTime.now().getMillis();

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);
    @Override
    public Site getSite() {
        return site;
    }

    @Override
    public void process(Page page) {
        long execTime = DateTime.now().getMillis();
        //It takes more than one minute to reassign the update time, otherwise it will not be assigned! Solve the problem of page data and inconsistent execution time for multiple times
        if((execTime - lastTime) > 1000*60){
            updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
            lastTime = execTime;
        }
        //  Get pages
        String s = page.getHtml().xpath("//*[@id=lbInfo]").get();
        String sum = s.substring(s.indexOf("Total records:") + 5, s.indexOf("strip"));
        System.err.println("ss=="+sum);
        int total = Integer.parseInt(sum);
        for (int i = 0; i < total; i+=8) {
            String nextUrl="https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos="+(i+1);
            //  Page crawling, put the url of the next page into the crawler task list
            page.addTargetRequest(nextUrl);
        }
        List<Selectable> nodes1 = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div").nodes();
        System.out.println("tiaoshu="+nodes1.size());
        List<Bond> list = new ArrayList<>();
        for (int i = 1; i <=nodes1.size() ; i++) {//*[@id="ebdp-pc4promote-nationaldebtList"]/div[2]/div[1]/div[1]/div[1]/a
        Bond bond = new Bond();
        String s0 = page.getHtml().xpath("//*[@ id = ebdp-pc4promote-nationalebtlist] / div ["+ I +"] / div [3] "). $(". Ebdp-pc4promote-tuijian "). Get() = = null?" ":" recommendation ";
        String itemname = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/a/text()").get();
        String s1 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[1]/text()").get();
        String s2 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[2]/text()").get();
        String s3 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[1]/span[3]/text()").get();
        String tradingtime = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[1]/div[2]/span[2]/text()").get();
        String clientbuyingrate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[2]/span/text()").get();
        String clientbuyingprice=page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[3]/b/text()").get();
        String clientsellrate=page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[3]/div[2]/span/text()").get();
        String clientsellprice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[3]/div[3]/b/text()").get();
        String s4 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[4]/p/text()").get();
        String s5 = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[2]/div[1]/ul/li[2]/div[4]/p/b/text()").get();
        String couponrate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[1]/text()").get();
        String accruedInterest = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[2]/text()").get();
        String currentPaymentDate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[3]/text()").get();
        String currentInterestIncome = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[4]/text()").get();
        String interestFrequency = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[5]/text()").get();
        String couponBondValue = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[6]/text()").get();
        String expireDate = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[7]/text()").get();
        String holdExpireInterestIncome = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[8]/text()").get();
        String clientBuyingNetPrice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[9]/text()").get();
        String clientSellingNetPrice = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[10]/text()").get();
        String itemType = page.getHtml().xpath("//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+i+"]/div[1]/div[3]/dl/dd[11]/text()").get();
        bond.setItemName(itemname);
        bond.setItemFeature(s0+" "+s1+" "+s2+" "+s3);
        bond.setTradingHours(tradingtime);
        bond.setClientBuyingRate(clientbuyingrate);
        bond.setClientBuyingFullPrice(clientbuyingprice);
        bond.setClientSellingRate(clientsellrate);
        bond.setClientSellingFullPrice(clientsellprice);
        bond.setRemainTimeLimit(s4+s5);
        bond.setCouponRate(couponrate.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setAccruedInterest(accruedInterest.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setCurrentPaymentDate(currentPaymentDate.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setCurrentInterestIncome(currentInterestIncome.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setInterestFrequency(interestFrequency.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setCouponBondValue(couponBondValue.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setExpireDate(expireDate.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setHoldExpireInterestIncome(holdExpireInterestIncome.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setClientBuyingNetPrice(clientBuyingNetPrice.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setClientSellingNetPrice(clientSellingNetPrice.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setItemType(itemType.contains(": ")?couponrate.substring(couponrate.indexOf(": ")+1):"");
        bond.setUpdateTime(updateTime);
        list.add(bond);
        }

        page.putField("bond",list);
        list.forEach(e->{
            System.out.println(e);
        });
    }
//  Test example
    public static void main(String[] args) {
        String url="https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos=1";
        //  Get the configuration of chromedriver? Linux64 in the system
        System.setProperty("selenuim_config", "/home/myfile/tool_station/chromedriver_linux64/config.ini");
        //  Set the crawling mode of selenium downloader driver
        Spider.create(new BondProcessor()).thread(1)
                .addPipeline(new ConsolePipeline())
                .addUrl(url)
                .setDownloader(new SeleniumDownloader("/home/myfile/tool_station/chromedriver_linux64/chromedriver").setSleepTime(1000))
                .run();
    }
}

Downloading and configuring the Chrome driver

chromedriver download location: [http://chromedriver.storage.googleapis.com/index.html](http://chromedriver.storage.googleapis.com/index.html)
[Please download the chromedriver package whose version matches your browser]
The config.ini file is configured as follows:
driver=chrome
#chrome_exec_path=/usr/bin/google-chrome-stable
chrome_driver_loglevel=DEBUG

//The entity class used in this example is as follows:
/**
 * Bond product record.
 * <p>
 * All scraped values are kept as strings; accessors are presumably generated by
 * Lombok's {@code @Data} (the import is not shown here).
 */
@Data
public class Bond {
    /** Record identifier; the crawler code shown alongside this class never sets it. */
    private Long id;
    /** Name of the bond. */
    private String itemName;
    /** Yield to maturity for customer purchases. */
    private String clientBuyingRate;
    /** Full price of a customer buy transaction. */
    private String clientBuyingFullPrice;
    /** Yield for customer sales. */
    private String clientSellingRate;
    /** Full price of a customer sell transaction. */
    private String clientSellingFullPrice;
    /** Coupon rate. */
    private String couponRate;
    /** Accrued interest. */
    private String accruedInterest;
    /** Current interest payment date. */
    private String currentPaymentDate;
    /** Current interest income. */
    private String currentInterestIncome;
    /** Interest payment frequency. */
    private String interestFrequency;
    /** Bond denomination (face value). */
    private String couponBondValue;
    /** Due (maturity) date. */
    private String expireDate;
    /** Interest income when held to maturity. */
    private String holdExpireInterestIncome;
    /** Net price for customer purchases. */
    private String clientBuyingNetPrice;
    /** Net price for customer sales. */
    private String clientSellingNetPrice;
    /** Bond type. */
    private String itemType;
    /** Characteristics (feature tags) of the bond. */
    private String itemFeature;
    /** Remaining term. */
    private String remainTimeLimit;
    /** Trading hours. */
    private String tradingHours;
    /** Update time, as a formatted timestamp string. */
    private String updateTime;
    }

3. Dynamically rendered pages whose data is returned by JS requests
Example:

import com.boe.mps.jrj.dataas.entity.ExchangeMarket;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.JsonPathSelector;

import java.util.*;

/**
 * Page processor for the exchange-market quotes returned as JSON.
 * <p>
 * The response body is expected to hold the quote entries under the JSON path
 * {@code $.rf}; each entry becomes one {@link ExchangeMarket}. The records are
 * published on the page under the field name {@code exchangeMarket}.
 */
@Repository
public class ExchangeMarketProcessor implements PageProcessor{

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Returns the shared site settings (3 retries, 100 ms sleep time). */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Converts every entry under {@code $.rf} of the response into a record.
     *
     * @param page the downloaded response to read from and to store the result list on
     */
    @Override
    public void process(Page page) {
        // One timestamp shared by every record taken from this response.
        String stamp = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        System.out.println("==" + page.getJson().get());

        // The POST response carries the quote entries under "rf".
        JsonPathSelector entrySelector = new JsonPathSelector("$.rf");
        List<ExchangeMarket> markets = new ArrayList<>();
        for (String rawEntry : entrySelector.selectList(page.getRawText())) {
            markets.add(toExchangeMarket(new Json(rawEntry), stamp));
        }
        page.putField("exchangeMarket", markets);
    }

    /** Maps one JSON quote entry onto a record stamped with the given update time. */
    private static ExchangeMarket toExchangeMarket(Json entry, String stamp) {
        ExchangeMarket market = new ExchangeMarket();
        market.setItemName(entry.jsonPath("$.proName").get());
        market.setRisefall(entry.jsonPath("$.riseSign").get());
        market.setBankBuyingPrice(entry.jsonPath("$.buyRate").get());
        market.setBankSellingPrice(entry.jsonPath("$.sellRate").get());
        market.setMiddlePrice(entry.jsonPath("$.middPrice").get());
        market.setDayRisefallRange(entry.jsonPath("$.openprice_dr").get());
        market.setDayRisefallValue(entry.jsonPath("$.openprice_dv").get());
        market.setYearRisefallRange(entry.jsonPath("$.openprice_yr").get());
        market.setUpdateTime(stamp);
        return market;
    }

    /**
     * Manual test entry point: sends the form-encoded POST request and prints the
     * pipeline output to the console.
     */
    public static void main(String[] args) {
        // Form parameters of the POST request.
        Map<String, Object> formFields = new HashMap<>();
        formFields.put("tranCode","A00513");

        Request quoteRequest = new Request("https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet");
        quoteRequest.setMethod(HttpConstant.Method.POST);
        quoteRequest.setRequestBody(HttpRequestBody.form(formFields,"utf-8"));

        Spider.create(new ExchangeMarketProcessor())
                .addPipeline(new ConsolePipeline())
                .addRequest(quoteRequest)
                .thread(1)
                .run();
    }
}

The entity class used in this example is as follows:

/**
 * Exchange-market quote record.
 * <p>
 * All values are kept as strings; accessors are presumably generated by Lombok's
 * {@code @Data} (the import is not shown here).
 */
@Data
public class ExchangeMarket {
    /** Primary key. */
    private Long id;
    /**
     * Variety (product name).
     */
    private String itemName;
    /**
     * Rise/fall indicator.
     */
    private String risefall;
    /**
     * Bank buying price.
     */
    private String bankBuyingPrice;
    /**
     * Bank selling price.
     */
    private String bankSellingPrice;
    /**
     * Middle price.
     */
    private String middlePrice;
    /**
     * Rise/fall value for the day.
     */
    private String dayRisefallValue;
    /**
     * Rise/fall range for the day.
     */
    private String dayRisefallRange;
    /**
     * Rise/fall range for the year.
     */
    private String yearRisefallRange;
    /**
     * Update time, as a formatted timestamp string.
     */
    private String updateTime;
    }

The request URL for this example [see the code above]: https://mybank.icbc.com.cn/ct...
JS requests can be tried out in Postman first to confirm the request method and the required parameters.

Tags: Java JSON JSP Selenium

Posted on Tue, 03 Dec 2019 16:43:42 -0500 by wikedawsum