Reader website crawler: author publication-count statistics

The first approach:

Manually construct the URL of each issue from its known pattern (see the sketch after this outline)

Crawl twice: the first crawl de-duplicates the authors with a set and initializes the result dictionary; the second crawl counts every occurrence against that result

Convert the result to a list of small dictionaries (one record per author) and sort the list as a whole by the times key

Write the result to a txt file
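
The "pattern" here: issues from 2010 through issue 13 of 2012 live at 1_YYYY_NN.html, and later issues at YYYY_NN/index.html, exactly as the scripts below encode it. A minimal sketch of that construction factored into a standalone generator (the name iter_issue_urls is mine, not from the scripts):

BaseUrl = "http://www.52duzhe.com/"

def iter_issue_urls():
    for year in range(2010, 2018):
        for issue in range(1, 25):
            stem = "{}_{:02d}".format(year, issue)          # e.g. "2010_01"
            if year in (2010, 2011, 2012) and not (year == 2012 and issue >= 14):
                yield BaseUrl + "1_" + stem + ".html"       # old page layout
            else:
                yield BaseUrl + stem + "/index.html"        # new page layout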

from bs4 import BeautifulSoup as Be
import requests as req
import os

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    try:
        r = req.get(url,headers={'user-agent':'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html,'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)   # returns None implicitly on failure


def Search_each_Per(tag):
    aut_set = set()
    a_list = []
    for i in range(2010,2018):
        for j in range(1,25):
            if j<10:
                ExtraUrl = str(i)+'_0'+str(j)
            else:
                ExtraUrl = str(i)+'_'+str(j)
            if i in [2010,2011,2012]:
                if(i == 2012 and j>=14):
                    url = BaseUrl + ExtraUrl + r"/index.html"
                else:
                    url = BaseUrl + '1_' + ExtraUrl + ".html"
            else:
                url = BaseUrl + ExtraUrl + r"/index.html"
            soup = Do_soup(url)          # call the fetch helper
            if soup is None:             # skip issues that failed to download
                continue
            per_aut_list = soup.find_all('td',class_="author")
            if tag==1:
                for k in per_aut_list:
                    aut_set.add(k.string)
                print("{}year{}Period author entered".format(i,j))
            else:
                for k in per_aut_list:
                    a_list.append(k.string)
    if tag==1:
        return list(aut_set)    # return the de-duplicated author list
    else:
        return a_list           # return the list with duplicates, used for counting
    
def main():
    author_list0 = Search_each_Per(1)   # tag 1: get the de-duplicated author list
    print("Fetching the list with duplicates, please wait...")
    a_list = Search_each_Per(0)         # tag 0: get the list that keeps duplicates
    result = {}                         #Dictionary of results
    for i in author_list0:
        result[str(i)] = 0     #Initialize statistics
        for j in a_list:
            if i==j:
                result[str(i)] += 1
    # sort the results in descending order of publication count
    print("Sorting results in descending order of publication count...")
    att = []              # container for the sortable records
    for key,value in result.items():
        j={}
        j["author"]=key
        j["times"]=value
        att.append(j)
    att.sort(key = lambda x:x["times"],reverse = True)
    # write the results to a text file
    print("Writing results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "Reader author result 1.txt")   # join path and file name properly
    new = open(filename,"w",errors='ignore')   # ignore characters the local encoding cannot represent ("illegal multibyte sequence")
    for i in att:
        author = i["author"]
        times = i["times"]
        print(author)
        print(times)
        if author is None:                       # avoid TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'
            new.write("None" +"\t" + str(times) + "\n")
        else:
            new.write(author +"\t" + str(times) + "\n")
    new.close()
    print("Completion statistics")

main()
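
For comparison only (not part of the original script): the two crawls (one to de-duplicate, one to count) and the nested counting loops in main() could be collapsed into a single tally with collections.Counter. A minimal sketch, where a_list stands in for the duplicate-bearing list returned by Search_each_Per(0):

from collections import Counter

a_list = ["Zhang", "Li", "Zhang", None, "Zhang"]      # sample data standing in for Search_each_Per(0)

counts = Counter(a_list)                              # one-pass tally, no second crawl needed
att = [{"author": name, "times": times} for name, times in counts.items()]
att.sort(key=lambda x: x["times"], reverse=True)
print(att)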

The second approach:

Manually construct the URL of each issue from its known pattern

Crawl only once, keep the records (small dictionaries) in a list, and use a flag to check whether an author is already in the list: if not, add a new record; if so, increment its times by 1

Return the list directly and sort the dictionaries (the records) as a whole by the times key

Write the result to a txt file

from bs4 import BeautifulSoup as Be
import requests as req
import os

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    try:
        r = req.get(url,headers={'user-agent':'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html,'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)   # returns None implicitly on failure


def Search_each_Per():
    obj_list = []
    for i in range(2010,2018):
        for j in range(1,25):
            if j<10:
                ExtraUrl = str(i)+'_0'+str(j)
            else:
                ExtraUrl = str(i)+'_'+str(j)
            if i in [2010,2011,2012]:
                if(i == 2012 and j>=14):
                    url = BaseUrl + ExtraUrl + r"/index.html"
                else:
                    url = BaseUrl + '1_' + ExtraUrl + ".html"
            else:
                url = BaseUrl + ExtraUrl + r"/index.html"
            soup = Do_soup(url)          # call the fetch helper
            if soup is None:             # skip issues that failed to download
                continue
            per_aut_list = soup.find_all('td',class_="author")
            for it in per_aut_list:               # don't reuse i here; it would shadow the outer loop variable
                tag = 0
                for jk in obj_list:
                    if(jk["author"] == it.string):
                        jk["times"] += 1
                        tag = 1
                        break
                if(tag == 0):
                    obj = {"author":it.string,"times":1}
                    obj_list.append(obj)
    return obj_list
    
def main():
    print("Creating result object list, please wait patiently...")
    obj_list = Search_each_Per()          #Accept result list
                                          # sort the results in descending order of publication count
    print("Sorting results in descending order of publication count...")
    obj_list.sort(key = lambda x:x["times"],reverse = True)
                                          # write the results to a text file
    print("Writing results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "Reader author results 3.txt")   # join path and file name properly
    new = open(filename,"w",errors='ignore')  # ignore characters the local encoding cannot represent ("illegal multibyte sequence")
    for i in obj_list:
        author = i["author"]
        times = i["times"]
        print(author)
        print(times)
        if author is None:                       # avoid TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'
            new.write("None" +"\t" + str(times) + "\n")
        else:
            new.write(author +"\t" + str(times) + "\n")
    new.close()
    print("Completion statistics")

main()
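
A note on the design choice: the inner loop over obj_list makes every lookup linear, so the whole count is roughly quadratic in the number of distinct authors. A plain dictionary keyed by author name gives the same records with constant-time lookups; a minimal sketch, with per_aut_list standing in for one page's td.author strings:

per_aut_list = ["Zhang", "Li", "Zhang"]              # sample author strings for illustration

tally = {}                                           # author name -> times, replaces the inner scan
for name in per_aut_list:
    tally[name] = tally.get(name, 0) + 1

obj_list = [{"author": k, "times": v} for k, v in tally.items()]
obj_list.sort(key=lambda x: x["times"], reverse=True)
print(obj_list)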

The third approach:

Use a class to create objects (records)

Crawl the links to every issue directly from the home page, put them in a list, then traverse the list and query each issue's authors

While crawling, keep the objects (records) in a list and use a flag to check whether an object for the author already exists: if not, instantiate one and append it; if so, increment its times by 1

Return the list directly and sort the objects (the records) by their times attribute

Write the result to a txt file

from bs4 import BeautifulSoup as Be
import requests as req
import os

class Author(object):
    def __init__(self,name):
        self.name = name
        self.times = 1

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    try:
        r = req.get(url,headers={'user-agent':'Mozilla/5.0'})   # same UA header as the other two scripts
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html,'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)   # returns None implicitly on failure


def Search_each_Per():
    url_list = []
    obj_list = []
    soup = Do_soup(BaseUrl)
    if soup is None:                    # home page failed to download, nothing to crawl
        return obj_list
    link = soup.select(".booklist a")   # collect the per-issue links from the home page
    for item in link:
        url = BaseUrl + item["href"]
        url_list.append(url)
    for url in url_list:
        soup = Do_soup(url)          # call the fetch helper
        if soup is None:             # skip issues that failed to download
            continue
        per_aut_list = soup.find_all('td',class_="author")
        for i in per_aut_list:
            tag = 0
            for j in obj_list:
                if(j.name == i.string):
                    j.times += 1
                    tag = 1
                    break
            if(tag == 0):
                obj = Author(i.string)
                obj_list.append(obj)
    return obj_list
    
def main():
    print("Creating object list, please wait...........")
    obj_list = Search_each_Per()
                                                        # sort the results in descending order of publication count
    print("Sorting results in descending order of publication count...")
    obj_list.sort(key = lambda obj:obj.times,reverse = True)
    # write the results to a text file
    print("Writing results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "Reader author results 2.txt")   # join path and file name properly
    new = open(filename,"w",errors="ignore")         # ignore characters the local encoding cannot represent ("illegal multibyte sequence")
    for i in obj_list:
        author = i.name
        times = i.times
        print(author)
        print(times)
        if author is None:
            new.write("None" +"\t" + str(times) + "\n")
        else:
            new.write(author +"\t" + str(times) + "\n")
    new.close()
    print("Completion statistics")

main()
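
Since the Author class above only carries data, it could also be written as a dataclass, and the sort could use operator.attrgetter instead of a lambda. A minimal sketch with made-up sample data, not taken from the original script:

from dataclasses import dataclass
from operator import attrgetter

@dataclass
class Author:
    name: str
    times: int = 1                                   # a new author starts with one appearance

authors = [Author("Zhang", 3), Author("Li", 5), Author("Wang", 1)]   # sample data
authors.sort(key=attrgetter("times"), reverse=True)
for a in authors:
    print(a.name, a.times, sep="\t")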

Tags: Python encoding Lambda network Attribute

Posted on Thu, 16 Jan 2020 03:15:10 -0500 by FoTo50