US News university ranking data acquisition

  Obtaining the US News university ranking data is a little complicated: the US News 2022 Best Global Universities Rankings web pages are loaded dynamically, and the amount of data is relatively large. Data acquisition is divided into three steps:
1. Obtain basic information about each university; 2. Obtain the details of each university; 3. Obtain the ranking indicators of each university for the various disciplines.

Get basic information


   Open the browser's F12 developer tools and keep scrolling down the page; you will see requests whose name starts with "search?format=json&page=". The response to each of these requests contains basic information about 10 universities.


  Looking at the headers of such a request, we can find its "Request URL". Sending a request to that URL returns the corresponding response. There are 2005 universities in total at 10 per page, so it takes only 201 requests.

#encoding=utf-8
import requests
import time

# Download the basic information of every university from the US News search
# endpoint and store it in collegeInfo.txt, one semicolon-separated record per
# line: city;country;id;name;rank;stat 1;stat 2;url

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

# The context manager closes the file even when a request or a record raises.
with open('collegeInfo.txt', 'w', encoding='utf-8') as fp:
    # Pages 1..201 are requested, each returning up to 10 universities.
    for page in range(1, 202):
        response = requests.get(
            url='https://www.usnews.com/education/best-global-universities/search?format=json&page=' + str(page),
            headers=headers,
            timeout=30,  # do not hang forever on a stalled connection
        )
        if response.status_code != 200:
            print('response error')
            break

        re_text = response.json()
        # A response without "items" contributes no records.
        for i in re_text.get("items") or []:
            strConcat = ';'.join([
                i['city'],
                i['country_name'],
                str(i['id']),
                i['name'],
                str(i['ranks'][0]['value']),
                str(i["stats"][0]['value']),
                str(i['stats'][1]['value']),
                i['url'],
            ])
            print(strConcat)
            # write() with an explicit newline: writelines() adds no line
            # terminator, which fused all records into a single line.
            fp.write(strConcat + '\n')

        # Pause between pages to keep the request rate low.
        time.sleep(3)

Get details

    The code above obtains the basic information of all the universities. I save it to a text file, "collegeInfo.txt", and then import it into Excel, selecting the semicolon as the separator, to get the school names and web links of the universities. The school names and web page links are then saved to two text files, "collegeNames.txt" and "collegeLinks.txt", for the subsequent retrieval of each university's details.
  What are the details? Take Harvard University as an example:


  The details include the "University Data" and "Rankings" sections shown above. Basically every university has these two sections, but some data items may be missing. Therefore, for each university I start from a dictionary with empty values and fill in whatever data is present. Each university corresponds to one DataFrame, and finally the DataFrames of all universities are concatenated.

# -*- coding: utf-8 -*-
import lxml
from lxml import etree
import requests
import time
import pandas as pd
import random

# Scrape the "University Data" and "Rankings" sections of every university
# page listed in collegeLinks.txt (names come from collegeNames.txt, one per
# line in the same order) and write one row per university to
# UniversityDetail.xlsx.

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

# Output columns after 'University'. Every university starts with all of them
# empty, so a page lacking some items still produces a complete row.
detailColumns = [
    'Total number of students',
    'Number of international students',
    'Total number of academic staff',
    'Number of international staff',
    'Number of undergraduate degrees awarded',
    "Number of master's degrees awarded",
    'Number of doctoral degrees awarded',
    'Number of research only staff',
    'Number of new undergraduate students',
    "Number of new master's students",
    'Number of new doctoral students',
    'Best Global Universities',
    'Best Global Universities in Region',
    'Best Global Universities in Country',
    'Agricultural Sciences',
    'Arts and Humanities',
    'Biology and Biochemistry',
    'Biotechnology and Applied Microbiology',
    'Cardiac and Cardiovascular Systems',
    'Cell Biology',
    'Chemical Engineering',
    'Chemistry',
    'Civil Engineering',
    'Clinical Medicine',
    'Computer Science',
    'Condensed Matter Physics',
    'Economics and Business',
    'Electrical and Electronic Engineering',
    'Endocrinology and Metabolism',
    'Energy and Fuels',
    'Engineering',
    'Environment/Ecology',
    'Food Science and Technology',
    'Gastroenterology and Hepatology',
    'Geosciences',
    'Immunology',
    'Infectious Diseases',
    'Materials Science',
    'Mathematics',
    'Mechanical Engineering',
    'Microbiology',
    'Molecular Biology and Genetics',
    'Nanoscience and Nanotechnology',
    'Neuroscience and Behavior',
    'Oncology',
    'Optics',
    'Pharmacology and Toxicology',
    'Physical Chemistry',
    'Physics',
    'Plant and Animal Science',
    'Polymer Science',
    'Psychiatry/Psychology',
    'Public, Environmental and Occupational Health',
    'Radiology, Nuclear Medicine and Medical Imaging',
    'Social Sciences and Public Health',
    'Space Science',
    'Surgery',
]

# Ranking labels that all share the 'Best Global Universities in Region' column.
regionLabels = {
    'Best Global Universities in Asia',
    'Best Global Universities in Africa',
    'Best Global Universities in Australia/New Zealand',
    'Best Global Universities in Europe',
    'Best Global Universities in Latin America',
}

dfli = []
cnt = 1

# Context managers make sure both input files are closed, even on an error.
with open('collegeNames.txt', 'r', encoding='utf-8') as fp_name, \
        open('collegeLinks.txt', 'r', encoding='utf-8') as fp_links:
    while True:
        schoolName = fp_name.readline()
        schoolName = schoolName.split('\n')[0]
        if not schoolName:
            break

        # readline() keeps the line terminator; strip it so that it does not
        # become part of the requested URL.
        schoolUrl = fp_links.readline().strip()
        if not schoolUrl:
            print('missing link for ' + schoolName)
            break

        dataDict = {'University': schoolName, **dict.fromkeys(detailColumns, '')}

        response = requests.get(url=schoolUrl, headers=headers, timeout=30)
        if response.status_code != 200:
            # Stop at the first failed page; what was collected so far is
            # still written out below.
            print('response error ' + str(response.status_code) + ': ' + schoolUrl)
            break

        html = response.text.encode('utf-8')
        result = etree.HTML(html)

        # --- "University Data": label/value pairs in two <p> elements ---
        for dataItem in result.xpath('//*[@id="uniData"]/div/div'):
            labelNodes = dataItem.xpath('./p[1]')
            valueNodes = dataItem.xpath('./p[2]')
            if not labelNodes or not valueNodes:
                continue
            label = labelNodes[0].text
            value = valueNodes[0].text
            if label is None or value is None:
                continue
            # Drop thousands separators, e.g. '21,261' -> '21261'.
            valueConcat = value.replace(',', '')

            print(str(cnt) + ' ' + schoolName + ' ' + label + ': ' + valueConcat)
            dataDict[label] = valueConcat

        # --- "Rankings": overall/regional block followed by subject block ---
        for section in result.xpath('//*[@id="rankings"]/div'):
            sectionClass = section.attrib.get('class')
            if sectionClass == 'mb5':
                subUl = section.xpath('./div/ul')
            elif sectionClass == 'subject-rankings':
                subUl = section.xpath('./ul')
            else:
                # Any other <div> ends the ranking lists.
                break

            for subLi in subUl:
                for items in subLi.xpath('./li'):
                    rankValue = items.xpath('./a/div/strong/text()')
                    labelNodes = items.xpath('./a/strong')
                    if not rankValue or not labelNodes or labelNodes[0].text is None:
                        continue
                    # The rank is either the first text node or, when that
                    # node is a bare '#', the one after it.
                    if rankValue[0] == '#' and len(rankValue) > 1:
                        rank = rankValue[1]
                    else:
                        rank = rankValue[0]
                    label = labelNodes[0].text
                    if label in dataDict:
                        dataDict[label] = rank
                    elif label in regionLabels:
                        dataDict['Best Global Universities in Region'] = rank
                    else:
                        # Every remaining label is stored as the country
                        # ranking (presumably 'Best Global Universities in
                        # <country>' -- confirm against the page).
                        dataDict['Best Global Universities in Country'] = rank
                    print(str(cnt) + ' ' + schoolName + ' ' + label + ': ' + rank)

        df = pd.DataFrame(dataDict, index=[0])
        dfli.append(df)
        cnt = cnt + 1

        # Random 1-3 second pause between pages.
        time.sleep(random.random()*2+1)

if dfli:
    dfli = pd.concat(dfli)
    dfli.to_excel('UniversityDetail.xlsx', index=False)
else:
    # pd.concat() raises ValueError on an empty list.
    print('no university details collected; nothing written')

Ranking indicators of various disciplines and Universities

  Each discipline also has its own ranking. The data look roughly as follows:

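   While traversing the web page of each university, you can find the ranking indicators of multiple disciplines on each page. Keep a list of DataFrames for each discipline, then concatenate each list and write it to its own Excel file. Note that this script also reads the country of each university from "collegeCountries.txt", one per line in the same order as the name and link files.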

from lxml import etree
import requests
import time
import re
import pandas
import random

from requests.api import head

# Maps each ranking category heading to its running index (0..43), in the
# order the categories are listed here.
fpDict = {
    categoryName: position
    for position, categoryName in enumerate((
        'Global Universities',
        'Agricultural Sciences',
        'Arts and Humanities',
        'Biology and Biochemistry',
        'Biotechnology and Applied Microbiology',
        'Cardiac and Cardiovascular Systems',
        'Cell Biology',
        'Chemical Engineering',
        'Chemistry',
        'Civil Engineering',
        'Clinical Medicine',
        'Computer Science',
        'Condensed Matter Physics',
        'Economics and Business',
        'Electrical and Electronic Engineering',
        'Endocrinology and Metabolism',
        'Energy and Fuels',
        'Engineering',
        'Environment/Ecology',
        'Food Science and Technology',
        'Gastroenterology and Hepatology',
        'Geosciences',
        'Immunology',
        'Infectious Diseases',
        'Materials Science',
        'Mathematics',
        'Mechanical Engineering',
        'Microbiology',
        'Molecular Biology and Genetics',
        'Nanoscience and Nanotechnology',
        'Neuroscience and Behavior',
        'Oncology',
        'Optics',
        'Pharmacology and Toxicology',
        'Physical Chemistry',
        'Physics',
        'Plant and Animal Science',
        'Polymer Science',
        'Psychiatry/Psychology',
        'Public, Environmental and Occupational Health',
        'Radiology, Nuclear Medicine and Medical Imaging',
        'Social Sciences and Public Health',
        'Space Science',
        'Surgery',
    ))
}

# Scrape the per-category ranking indicators of every university page and
# write one Excel workbook per category ('<category>.xlsx', with '/' in the
# category name replaced by '_').
#
# Inputs, one entry per line and in the same order in all three files:
# collegeNames.txt, collegeCountries.txt, collegeLinks.txt.

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

# Indicator columns present in every row, after University/Country/Rank.
indicatorColumns = [
    'overall score',
    'global research reputation',
    'regional research reputation',
    'publications',
    'books',
    'conferences',
    'normalized citation impact',
    'total citations',
    'number of publications that are among the 10% most cited',
    'percentage of total publications that are among the 10% most cited',
    'international collaboration - relative to country',
    'international collaboration',
    'number of highly cited papers that are among the top 1% most cited',
    'percentage of highly cited papers that are among the top 1% most cited',
]

# One list of single-row DataFrames per known category. A dict keyed by the
# category replaces the numbered dfli0..dfli43 variables and the eval() lookup,
# which raised NameError for any heading missing from fpDict.
dfliByCategory = {category: [] for category in fpDict}

cnt = 1

# Context managers make sure the three input files are closed, even on error.
with open('collegeNames.txt', 'r', encoding='utf-8') as fp_name, \
        open('collegeCountries.txt', 'r', encoding='utf-8') as fp_country, \
        open('collegeLinks.txt', 'r', encoding='utf-8') as fp_links:
    while True:
        schoolName = fp_name.readline()
        schoolName = schoolName.split('\n')[0]
        if not schoolName:
            break

        schoolCountry = fp_country.readline()
        schoolCountry = schoolCountry.split('\n')[0]
        # readline() keeps the line terminator; strip it so that it does not
        # become part of the requested URL.
        schoolUrl = fp_links.readline().strip()
        if not schoolUrl:
            print('missing link for ' + schoolName)
            break

        response = requests.get(url=schoolUrl, headers=headers, timeout=30)

        if response.status_code == 200:
            html = response.text.encode('utf-8')
            result = etree.HTML(html)

            for box in result.xpath('//*[@id="indicators"]/div[2]/div'):
                headings = box.xpath('./button/h3')
                if not headings or headings[0].text is None:
                    continue
                category = headings[0].text
                if category not in dfliByCategory:
                    print('unknown category skipped: ' + category)
                    continue

                # The rank is either the first text node or, when that node
                # is a bare '#', the one after it; left empty when absent.
                rank = box.xpath('./div/ul/li/a/div/strong/text()')
                if not rank:
                    rankValue = ''
                elif rank[0] == '#' and len(rank) > 1:
                    rankValue = rank[1]
                else:
                    rankValue = rank[0]

                indDict = {'University': schoolName,
                           'Country': schoolCountry,
                           'Rank': rankValue,
                           **dict.fromkeys(indicatorColumns, '')}

                for eachInd in box.xpath('./div/div'):
                    labelNodes = eachInd.xpath('./p[1]')
                    valueNodes = eachInd.xpath('./p[2]')
                    if not labelNodes or not valueNodes:
                        continue
                    label = labelNodes[0].text
                    value = valueNodes[0].text
                    if label is None or value is None:
                        continue
                    # Remove the '<category> ' prefix from the label. A plain
                    # string replace avoids treating the category as a regex.
                    label = label.replace(category + ' ', '').lower()
                    value = value.replace('#', '')
                    # 'global score' and 'overall score' share one column.
                    if label == 'global score' or label == 'overall score':
                        indDict['overall score'] = value
                    else:
                        indDict[label] = value
                    print(str(cnt) + ' | ' + schoolName + ' | ' + category + ' | ' + label + ': ' + value)

                dfliByCategory[category].append(pandas.DataFrame(indDict, index=[0]))
        else:
            # A failed page is skipped; the remaining universities still run.
            print('response error ' + str(response.status_code) + ': ' + schoolUrl)

        # Random 1-3 second pause between pages.
        time.sleep(random.random()*2+1)
        cnt = cnt + 1

for category, frames in dfliByCategory.items():
    if not frames:
        # pandas.concat() raises ValueError on an empty list.
        print('no data for ' + category + '; no file written')
        continue
    # '/' cannot appear in a file name, e.g. 'Environment_Ecology.xlsx'.
    fileName = category.replace('/', '_') + '.xlsx'
    pandas.concat(frames).to_excel(fileName, index=False)

Tags: Python request

Posted on Wed, 10 Nov 2021 15:10:45 -0500 by aruns