Crawling all the novels on Biquge with Python and Beautiful Soup

This is an exercise: use a Python script to crawl the free novels on the Biquge site.

Environment: Python 3
Libraries: requests, BeautifulSoup (bs4)
Data source: http://www.biqukan.cc

The principle is to disguise the script's HTTP requests as those of a normal browser, fetch the pages like an ordinary reader, and then parse the returned HTML with bs4 to extract the useful data.
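The whole pattern fits in a few lines (a minimal sketch; the User-Agent value is just one of the headers from the config below, and error handling is omitted):

#!/usr/bin/python3
# coding: utf-8
import requests
from bs4 import BeautifulSoup

# Pretend to be a normal browser by sending a realistic User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
req = requests.get('http://www.biqukan.cc', headers=headers)

# The site serves GBK-encoded pages; decode before parsing
bs = BeautifulSoup(req.content.decode('gbk', errors='ignore'), 'html.parser')
print(bs.title.get_text())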

1. Config file

It contains the masqueraded request headers and the data source configuration (you can simply hard-code these if you never plan to add other data sources).

#!/usr/bin/python3
# coding: utf-8

source = {
    'biquge': {
        'base_url': 'http://www.biqukan.cc',
        'category_min': 1,
        'category_max': 2,
        'category_url': 'http://www.biqukan.cc/fenlei{id}/1.html'
    }
}

header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'}
]

config.py file

2. Crawling the novels in every category

#!/usr/bin/python3
# coding: utf-8

import random
import requests
import hashlib
import time
from bs4 import BeautifulSoup

from config import source
from config import header

##
# Collect novel names and links from every category listing
#
def fiction():
    url = source['biquge']['category_url']
    _list = {}
    for i in range(source['biquge']['category_min'], source['biquge']['category_max']):
        req = requests.get(url.replace('{id}', str(i)), headers=random.choice(header))
        bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

        while True:
            # Collect the novels on the current page
            _page = _cur_page(bs)
            print('page.length = %d' % len(_page))
            _list.update(_page)

            # Follow the "next page" link in the pagination bar, if there is one
            page_bar = bs.find('ul', id='pagelink')
            next_page = page_bar.find('a', 'next') if page_bar is not None else None
            if next_page is None:
                break

            req = requests.get(next_page.attrs['href'], headers=random.choice(header))
            bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

            # Take a break so we do not hammer the server
            time.sleep(random.random())

    return _list

##
# Extract all novel entries from the current page
#
def _cur_page(bs):
    _list = {}

    # Top list (li.list-group-item entries)
    li_tags = bs.find_all('li', 'list-group-item')
    if not li_tags:
        return _list

    for item in li_tags:
        a_tag = item.find('a')
        _item = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}

        # Author
        _item['author'] = item.find('small').get_text().replace('/ ', '')

        # Reader count
        _item['readers'] = item.find('span').get_text()

        # Key each novel by the MD5 of its link so duplicates collapse
        key = hashlib.md5(_item['link'].encode('utf-8')).hexdigest()
        _list[key] = _item

    # List of recent updates (table rows)
    tr_tags = bs.find_all('tr')
    if not tr_tags or len(tr_tags) <= 1:
        return _list

    for item in tr_tags:
        a_tag = item.find('a')
        if a_tag is None:
            continue

        _item = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}

        # Author
        _item['author'] = item.find('td', 'text-muted').get_text()

        # Status (the last <td> in the row)
        tds = item.find_all('td')
        _item['status'] = tds[-1].get_text()

        key = hashlib.md5(_item['link'].encode('utf-8')).hexdigest()
        if key not in _list:
            _list[key] = _item
        else:
            _list[key]['status'] = _item['status']

    return _list

if __name__ == "__main__":
    _temp = fiction()
    print('done')

fiction.py file
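For reference, each entry in the returned dict is keyed by the MD5 hash of the novel's link; the sample record below is made up for illustration:

{
    '0800fc577294c34e0b28ad2839435945': {
        'name': 'Some Novel',
        'link': 'http://www.biqukan.cc/book/47583/',
        'author': 'Some Author',
        'readers': '12345',
        'status': 'serializing'
    }
}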

3. Crawling the novel introduction

#!/usr/bin/python3
# coding: utf-8

import random
import requests
from bs4 import BeautifulSoup

from config import header

##
# Grab the introduction page and return a structure like:
# Title: Do You Know? Do You Know? It Should Be Green, Fat, Red and Thin
# Author: Guan Xin Ze Luan
# Category: urban romance
# Word count: 2 million
# Reader count: 2w+
# Status: serializing / finished
# Cover: url
# Summary: The original novel behind the TV drama starring Zhao Liying and
#   Feng Shaofeng, produced by Daylight Entertainment with Hou Hongliang as
#   producer and launched on September 6, 2017. A house-fighting story and a
#   practical guide to surviving ancient times: a slacker finds herself in an
#   era where a noblewoman's fate is set by her family, collective punishment
#   is common, and a single disaster can ruin everything. To live well and
#   with dignity, Minglan feels, the pressure is enormous; the old days are
#   just too dangerous.
#
def summary(url):
    _result = {'title': '', 'author': '', 'category': '', 'words': '',
               'readers': '', 'status': '', 'cover_img': '', 'summary': ''}

    req = requests.get(url, headers=random.choice(header))
    bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

    title_tag = bs.find('h1', 'bookTitle')
    if title_tag is not None:
        _result['title'] = title_tag.get_text()

    book_tag = bs.find('p', 'booktag')
    if book_tag is not None:
        a_tags = book_tag.find_all('a')
        _result['author'] = a_tags[0].get_text()
        _result['category'] = a_tags[1].get_text()

        span_tags = book_tag.find_all('span')
        _result['words'] = span_tags[0].get_text()
        _result['readers'] = span_tags[1].get_text()
        _result['status'] = span_tags[2].get_text()

    intro_tag = bs.find('p', id='bookIntro')
    if intro_tag is not None:
        _result['cover_img'] = intro_tag.find('img').attrs['src']
        # Collapse the template's indentation and line breaks into single spaces
        _result['summary'] = ' '.join(intro_tag.get_text().split())

    return _result

if __name__ == "__main__":
    _temp = summary('http://www.biqukan.cc/book/47583/')
    print(_temp)

summary.py file

4. Crawling the novel catalogue

#!/usr/bin/python3
# coding: utf-8

import random
import requests
from bs4 import BeautifulSoup

from config import header

##
# Grab the chapter catalogue
#
def catalog(url):
    _list = []
    req = requests.get(url, headers=random.choice(header))
    bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

    all_list = bs.find('div', id='list-chapterAll')
    if all_list is None:
        return _list

    list_tag = all_list.find('dl', 'panel-chapterlist')
    if list_tag is None:
        return _list

    for k in list_tag.find_all('a'):
        # Chapter hrefs are relative, so prepend the book url
        _list.append({'name': k.get_text(), 'link': url + k.attrs['href']})

    return _list

if __name__ == "__main__":
    _temp = catalog('http://www.biqukan.cc/book/47583/')
    print(_temp)

catalog.py file

5. Crawling the text of the novel

#!/usr/bin/python3
# coding: utf-8

import random
import requests
from bs4 import BeautifulSoup

from config import header

##
# Grab the text of a chapter (a long chapter spans several pages)
#
def detail(url):
    per_article_page_limit = 3  # on this site a chapter spans at most 3 pages
    title = ''
    content = ''
    for i in range(1, per_article_page_limit + 1):
        # Page 1 is 12592815.html, page 2 is 12592815_2.html, and so on
        part_url = '' if i == 1 else '_%s' % i

        req = requests.get(url.replace('.html', part_url + '.html'),
                           headers=random.choice(header))
        bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

        # Chapter title, taken from the active breadcrumb entry
        if len(title) <= 0:
            title = bs.find('li', 'active').get_text()

        content_tag = bs.find('div', id='htmlContent')
        if content_tag is None:
            break

        # Drop the "next page" hint before extracting the text
        next_tag = content_tag.find('p', 'text-danger')
        if next_tag is not None:
            next_tag.clear()

        # Strip the site's promo line; the string must match the page text
        # verbatim (shown here in the translated form used by this post)
        content += content_tag.get_text().replace('-->>', '').replace(
            'Remember in a second www.biqukan.cc]，Update quickly, no pop-up window, free reading!', '')
    return content

def filter_blank(content):
    # Strip indentation spaces and drop the blank lines between paragraphs
    _temp = [line.replace(' ', '') for line in content.split('\r\n')]
    return ''.join(line for line in _temp if line)

if __name__ == "__main__":
    _temp = detail('http://www.biqukan.cc/book/20461/12592815.html')
    print(filter_blank(_temp))

article.py file

Summary

There is no data-saving module here. To turn this into a complete project, you only need to persist the novel data structure (which keeps disk usage small): given a novel's url, the scripts above can fetch its introduction, catalogue, and the text of each chapter on demand.

If you want to do better, you can also cache the introductions, catalogues, and chapter texts themselves, provided you have enough storage; a sketch of the glue code follows.
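As a rough sketch of that glue (the output file name and the idea of dumping everything to JSON are my own additions, not part of the original scripts):

#!/usr/bin/python3
# coding: utf-8
import json

from fiction import fiction
from summary import summary
from catalog import catalog

# Crawl the category listings, enrich each novel with its introduction
# and chapter list, then persist everything as one JSON file
novels = fiction()
for key, novel in novels.items():
    novel['summary'] = summary(novel['link'])
    novel['catalog'] = catalog(novel['link'])

with open('novels.json', 'w', encoding='utf-8') as f:
    json.dump(novels, f, ensure_ascii=False, indent=2)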
