Python Selenium crawler for SEO traffic boosting, Article 1: Baidu search

SEO practitioners and webmasters know that once a website has accumulated a large amount of original content, a modest amount of white-hat traffic boosting can still help promote the site's keyword rankings. This article introduces a simple demo that drives traffic through Baidu search.

1, Demo system design idea

Simulate real users to search for website keywords in Baidu, find a snapshot of their company's website by turning the page in the search results, and then click the snapshot link to jump to the company's website, so as to improve the popularity and ranking of website keywords by simulating this scene process.

2, Demo system features

1. Website snapshot address can be set: by configuring the website snapshot address, you can brush different website keywords

2. The number of times of brushing per day can be set: by configuring the number of times of brushing, the frequency of brushing different keywords on different websites can be realized

3. Ranking keywords can be set: ranking keywords can be read from files, and keyword addition, deletion and modification are supported.

4. Page number of search results can be set: you can configure the page number of search results. If it is set to 5, the company website snapshot will be found from the first 5 pages of search results. If it is set to 10, the company website snapshot will be found from the first 10 pages of search results.

5. Support manual suspension of brush amount

6. Support recording brush log

3, Demo system main code

Code snippet 1. Create WebDriver browser driver. The specific code is as follows:

#Create browser drive
def createWebDriver():
    """Build and return a headless Chrome WebDriver prepared for quiet scraping.

    The driver is configured to skip image loading (faster page loads) and to
    hide the ``navigator.webdriver`` flag so the visit is less likely to be
    classified as automated traffic.
    """
    opts = webdriver.ChromeOptions()
    # Optional tweaks kept for reference:
    #   opts.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
    #   opts.add_argument('--disable-gpu')        # disable graphics card
    #   opts.add_argument('--disable-infobars')   # hide "controlled by automation" bar
    #   opts.add_argument('--User-Agent=...')     # rotate request headers
    #   opts.add_argument('--proxy-server=http://%s' % getProxyId())  # IP proxy

    # Page encoding
    opts.add_argument('lang=zh_CN.UTF-8')
    # Skip image downloads to speed up page loads
    opts.add_argument('blink-settings=imagesEnabled=false')
    # Run without a visible browser window
    opts.add_argument('--headless')

    driver = webdriver.Chrome(options=opts, keep_alive=True)

    # Mask the webdriver flag on every new document so the visit is not
    # flagged as Selenium traffic (and therefore excluded from analytics)
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
    })

    return driver

Code segment 2. Timed task thread is used to execute timed tasks concurrently. The specific code is as follows:

# Worker thread
class JobThread(threading.Thread):
    """Worker thread that idles for a configured delay, then runs the job once.

    The idle loop ticks once per second so the operator can abort the run
    early via ``myframe.stop_refresh_page_thread``.
    """

    def __init__(self, index, waitime):
        super().__init__()
        self.index = index        # thread number, used only for logging
        self.waittime = waitime   # seconds to wait before triggering the job
        self.bgntime = 0          # seconds actually waited so far

    def run(self):
        # Count down in 1-second ticks; bail out early on manual stop
        while self.bgntime < self.waittime and not myframe.stop_refresh_page_thread:
            time.sleep(1)
            self.bgntime += 1
        # Trigger the task once the wait is over (or was interrupted)
        runjob()
        # Record how long this thread actually waited
        my_logger_info(logger,'thread%d bgntime is %d second'%(self.index,self.bgntime))

# Manual stop or wait for the specified time
def runIdleSomeTime(idletime):
    """Sleep in 1-second ticks until *idletime* seconds elapse or the
    operator raises the manual stop flag (``myframe.stop_refresh_page_thread``)."""
    elapsed = 0
    while elapsed < idletime and not myframe.stop_refresh_page_thread:
        time.sleep(1)
        elapsed += 1

Code snippet 3. Enter keywords for Baidu search and find website snapshots from the search results. The specific codes are as follows:

# Find the website snapshot, find it and click open
def findSiteSnapshot(driver, siteurl):
    """Search Baidu for each configured keyword, page through the results,
    and click any result that links to *siteurl* and carries a Baidu snapshot.

    driver  -- a selenium WebDriver (e.g. from createWebDriver())
    siteurl -- the company site host to match in result links
               (a leading "www." is stripped before matching)

    Honors the manual stop flag ``myframe.stop_refresh_page_thread`` at every
    level of the loop. Progress and match counts are written to the log.
    """
    # Load ranking keywords (semicolon-separated list configured in the UI)
    wdary = myframe.nowtask_keywordlist.split(";")
    for wd_s in wdary:
        # 1. Load the Baidu home page and submit the keyword search
        wd = wd_s
        url = "https://www.baidu.com/"
        driver.get(url)
        locator = (By.ID, 'kw')
        WebDriverWait(driver, 1, 0.5).until(EC.presence_of_element_located(locator))
        # Selenium 4 removed the find_element_by_* helpers; use find_element(By..., ...)
        driver.find_element(By.ID, "kw").send_keys(wd)
        submitinput = driver.find_element(By.ID, "su")
        driver.execute_script("arguments[0].click();", submitinput)

        # Remember the search-results window so focus can be restored after a
        # result click (which may open a new tab/window)
        mainpagehandle = driver.current_window_handle

        locator = (By.LINK_TEXT, "next page >")
        WebDriverWait(driver, 3, 0.5).until(EC.presence_of_element_located(locator))

        sitelink_s = None
        kuaizhaostr = "Baidu snapshot"
        includeurl = False
        iskuaizhao = False
        matchnum = 0
        my_logger_info(logger, "<<Start site'%s'key word'%s'Snapshot record search" % (siteurl, wd_s))
        # Page through the first N result pages (N configured in the UI)
        pagenum = int(myframe.pagenuminput.GetValue())
        for pageindex in range(1, pagenum + 1):
            # Idle a random interval to mimic a human reading the page
            runIdleSomeTime(random.randint(10, myframe.pagewaittime))
            # Parse the current result page
            pagesource = driver.page_source
            soup = BeautifulSoup(pagesource, "html.parser")
            # Result entries use one of two known CSS class combinations
            # depending on the query; try the xpath-log variant first
            recorddiv = soup.find_all(class_="result c-container xpath-log new-pmd")
            if len(recorddiv) == 0:
                recorddiv = soup.find_all(class_="result c-container new-pmd")
            # Inspect each result record on the current page
            for recordindex in range(len(recorddiv)):
                includeurl = False
                iskuaizhao = False
                for link in recorddiv[recordindex].find_all("a"):
                    # A match needs both a "Baidu snapshot" link and a link
                    # whose text contains the target host
                    if link.text == kuaizhaostr:
                        iskuaizhao = True
                    if link.text.find(siteurl.replace("www.", "") + "/") > -1:
                        sitelink_s = link
                        includeurl = True
                if iskuaizhao and includeurl:
                    my_logger_info(logger, "Match No%d Page%d Records succeeded" % (pageindex, recordindex + 1))
                    try:
                        sitelink = driver.find_element(By.LINK_TEXT, sitelink_s.text)
                        driver.execute_script("arguments[0].click();", sitelink)
                        # Return focus to the search-results window captured
                        # above (the original switched to window_handles[0],
                        # which is the same window only by accident)
                        driver.switch_to.window(mainpagehandle)
                        matchnum += 1
                    except Exception:
                        # Log and continue with the next record; one failed
                        # click should not abort the whole keyword run
                        my_logger_info(logger, traceback.format_exc())
                # Support manual stop of brushing amount
                if myframe.stop_refresh_page_thread:
                    break
            # Support manual stop of brushing amount
            if myframe.stop_refresh_page_thread:
                break
            # Advance to the next result page, or stop if there is none
            nextpagelink = driver.find_element(By.LINK_TEXT, "next page >")
            if nextpagelink and nextpagelink.is_displayed():
                driver.execute_script("arguments[0].click();", nextpagelink)
            else:
                break

        # Support manual stop of brushing amount
        if myframe.stop_refresh_page_thread:
            break
        # Report how many snapshot records were matched for this keyword
        my_logger_info(logger, "website'%s'key word'%s'Total snapshot records%d individual" % (siteurl, wd_s, matchnum))
        my_logger_info(logger, ">>End site'%s'key word'%s'Snapshot record search" % (siteurl, wd_s))
    # Log that the run was stopped manually, if it was
    if myframe.stop_refresh_page_thread:
        my_logger_info(logger, "Brush amount stopped")

4, Demo system operation screenshot

 

Students who need Demo source code can scan the following wechat QR code for payment,

 

When making the payment, note down the transfer order number, then add the WeChat ID sfjsffj928. Send the verification message in the following format:

Purchase brush amount Demo source code - transfer doc No

After verification, the Demo source code will be sent to your wechat.

 

Tags: Python Selenium

Posted on Fri, 03 Sep 2021 17:06:28 -0400 by forum