Ranking analysis of application access regions

Application access region ranking

Topic content: given Nginx access logs (multiple files, estimated at about 66 GB in total) covering a period of time, find the five IPs with the highest access counts as quickly as possible. Submit a script or an executable program that takes the path of the files as a command-line parameter. Output the 5 IPs in descending order of access count, one IP per line.

Known notes:
  1. Linux Centos7 server, limited to 2G memory, 4-core CPU
  2. The nginx access log is placed in the specified directory, and the file content format
   '$remote_addr\t-\t$remote_user\t$time_local\t'
                '$http_x_forwarded_for\t$tcpinfo_rtt\t$tcpinfo_rttvar\t$tcpinfo_snd_cwnd\t$tcpinfo_rcv_space\t'
                    '$request_method\t$host\t$request_uri\t$server_protocol\t$request_length\t$request_time\t'
                    '$status\t$body_bytes_sent\t$bytes_sent\t'
                    '$http_referer\t$http_user_agent\t'
                    '$connection\t$connection_requests\t'
                    '$upstream_addr\t$upstream_status\t$upstream_response_length\t$upstream_response_time\t'
                    '$scheme\t$ssl_session_reused';
    10.0.0.1 - - 22/Oct/2019:00:00:05 +0800 - 45250 5000 20 14600 POST api.immomo.com /v1/welcome/logs?fr=123456789 HTTP/1.1 567 0.029 200 96 651 - MomoChat/8.20.2 ios/1878 (iPhone 7 Plus; iOS 11.0.3; zh_CN; iPhone9,2; S1) 93983365152 15 10.0.0.1:9000 200 101 0.029 https .
  3. Not limited to the implementation language, but cannot rely on any open source third-party dependency or service
The program takes exactly one input parameter: the path of the folder containing the access logs.
  4. The program must create the result file under the directory it is run from. The file contains the five IP addresses sorted in descending order of access count, each followed by its access count.
For example:
10.12.12.1    10000
102.12.12.2   9999
...
Judging rules:
The winner is the one with accurate statistics and the shortest time
2-core 4G mechanical hard disk

Solving problems

The solution code at the bottom of this article implements Idea 1.
 Idea 1: (2.1) convert each IP to its decimal value and count its occurrences in a hash map; (2.2) distribute the entries over N heaps by taking the decimal value mod N; (2.3) take the top 10 of each of the N heaps and merge them into one aggregate heap; (2.4) output the top 10 of the aggregate heap.
 Idea 2: could we combine the two values into one very large number (the access count followed by the decimal IP), heap-sort those numbers, and read the result set off directly? That would avoid building a custom structure.

Performance discussion points

Time consuming analysis

    Note that the machine configuration given in this topic is 2 cores and 4 GB of memory.
    Measured on 5 GB of test data, heap sorting takes about 300 ms.
    processLine and CalculateIp take a few seconds and leave little room for optimization.
    ReadLine takes 90% of the time, so this article focuses on the file-reading I/O performance of ReadLine!
    Would it be faster to read with multiple threads? Keep reading~

How fast is single thread / multi thread reading and writing files?

1. Why is single-threaded sequential reading the fastest form of disk I/O?
       If multiple threads read at once, the disk head has to keep seeking back and forth, which makes reading slower than with a single thread.
 2. Linux performs read-ahead on sequential reads!
3. For random reads, multithreading is about N times faster than a single thread (depending on the number of threads).
4. With multithreaded I/O on the same file, we need locking even if we read in seek + read/write mode.
5. What if we open a separate file descriptor (file object) for each thread: can that improve I/O? We would have N file objects in the kernel but still only one inode object, and file reads and writes ultimately go through the inode. So it won't improve I/O.

Conclusion: when reading large files, a single thread is better than multiple threads~

Implementation code

package main

import (
    "bufio"
    "container/heap"
    "fmt"
    "io"
    "os"
    "runtime"
    "strconv"
    "strings"
    "time"
)
// N is the number of shard heaps; initHeap creates heaps keyed 0..N, where
// the extra heap at key N is the one polyHeap merges into.
const N = 256

var (
    // GlobalIp maps a heap index to its heap.
    GlobalIp map[int64]*IpQueue

    // GlobalNum maps an IP in decimal form to its access frequency.
    GlobalNum map[int64]int64
)

// ReadLine reads the file at filePth line by line and hands every line to
// hookfn. A line passed to hookfn still carries its trailing '\n'; only a
// final line that is not newline-terminated arrives without one.
//
// hookfn is never called with an empty slice and never with the partial data
// of a failed read, so callbacks that index into the line do not have to
// defend against the zero-length read that precedes io.EOF.
//
// It returns nil once the whole file has been consumed, or the first error
// encountered while opening or reading the file.
func ReadLine(filePth string, hookfn func([]byte)) error {
    f, err := os.Open(filePth)
    if err != nil {
        return err
    }
    defer f.Close()

    bfRd := bufio.NewReader(f)
    for {
        line, err := bfRd.ReadBytes('\n')
        switch {
        case err == nil:
            hookfn(line)
        case err == io.EOF:
            // EOF directly after the last '\n' yields no data; an
            // unterminated last line yields data together with EOF.
            if len(line) > 0 {
                hookfn(line)
            }
            return nil
        default:
            return err
        }
    }
}

// initHeap (re)creates the global state: an empty frequency map and N+1
// heaps keyed 0..N. Every heap starts out holding a single placeholder item
// with ip "0.0.0.0" and num -1.
func initHeap() {
    GlobalNum = map[int64]int64{}
    GlobalIp = map[int64]*IpQueue{}

    for idx := int64(0); idx <= N; idx++ {
        pq := IpQueue{&Item{ip: "0.0.0.0", num: -1}}
        heap.Init(&pq)
        GlobalIp[idx] = &pq // each key gets its own heap
    }
}

//2.1 directly change IP to decimal hash count times
func processLine(line []byte) {

    var result int
    for i := 7; i <= 15; i++ {
        if line[i] == '\t' || line[i] == '-' {
            result = i
            break
        }
    }
    str := string(line[0:result])

    ipv4 := CalculateIp(string(str))

    GlobalNum[int64(ipv4)]++
}

// handleHash is step 2.2: every (ip, count) pair of GlobalNum is pushed onto
// the heap selected by the IP's decimal value mod N. The elapsed time in
// milliseconds is printed afterwards.
func handleHash() {
    nowMs := func() int64 { return time.Now().UnixNano() / 1000000 }

    begin := nowMs()
    for addr, hits := range GlobalNum {
        shard := GlobalIp[addr%N]
        heap.Push(shard, &Item{ip: RevIp(addr), num: hits})
    }
    finish := nowMs()

    fmt.Println("Total heap time ms:", finish-begin)
}

// polyHeap is step 2.3: it moves up to the 10 largest items of each of the
// heaps 0..N-1 into the aggregate heap stored under key N.
func polyHeap() {
    for shard := int64(0); shard < N; shard++ {
        src := GlobalIp[shard]
        // Stop after 10 items or as soon as this heap runs dry.
        for moved := 0; moved < 10 && src.Len() > 0; moved++ {
            top := heap.Pop(src).(*Item)
            heap.Push(GlobalIp[N], top)
        }
    }
}

// printResult is step 2.4: it pops the aggregate heap stored under key N and
// prints up to 10 entries in descending order of access count.
//
// Output stops early when the heap runs out of items or when a placeholder
// (negative count, as seeded by initHeap) surfaces. Real counts are always
// positive and the heap yields the largest count first, so everything after
// the first placeholder is a placeholder too. A missing aggregate heap
// prints nothing.
func printResult() {
    merged := GlobalIp[N]
    if merged == nil {
        return
    }
    for printed := 0; printed < 10 && merged.Len() > 0; printed++ {
        item := heap.Pop(merged).(*Item)
        if item.num < 0 {
            break
        }
        fmt.Printf("Number of occurrences:%d|IP:%s \n", item.num, item.ip)
    }
}

// CalculateIp converts a dotted-quad IPv4 string such as "10.0.0.1" into its
// decimal value (a*256^3 + b*256^2 + c*256 + d).
//
// Malformed input yields 0, the same value as "0.0.0.0", instead of a panic
// or an arbitrary number. Input is malformed when it does not have exactly
// four dot-separated fields or when a field is not a decimal number in
// 0..255.
func CalculateIp(str string) int64 {
    octets := strings.Split(str, ".")
    if len(octets) != 4 {
        return 0
    }

    var sum int64
    for _, octet := range octets {
        // bitSize 8 makes ParseUint reject anything outside 0..255.
        v, err := strconv.ParseUint(octet, 10, 8)
        if err != nil {
            return 0
        }
        sum = sum<<8 | int64(v)
    }
    return sum
}

// RevIp renders the decimal form of an IP as a dotted-quad string. It is the
// inverse of CalculateIp for values in 0..2^32-1.
func RevIp(ip int64) string {
    // Peel the fields off from the most significant end; quotient and
    // remainder replace the repeated subtractions.
    first, rest := ip/16777216, ip%16777216
    second, rest := rest/65536, rest%65536
    third, fourth := rest/256, rest%256

    return fmt.Sprintf("%d.%d.%d.%d", first, second, third, fourth)
}

// Item is one heap entry: an IP in dotted-quad form and its access count.
type Item struct {
    ip  string
    num int64
}

// IpQueue implements heap.Interface as a max-heap over Item.num, so the item
// with the highest access count is popped first.
type IpQueue []*Item

// Len reports the number of items in the queue.
func (pq IpQueue) Len() int { return len(pq) }

// Less orders by descending num; this is what turns container/heap's
// min-heap into a max-heap.
func (pq IpQueue) Less(i, j int) bool {
    return pq[i].num > pq[j].num
}

// Swap exchanges the items at positions i and j.
func (pq IpQueue) Swap(i, j int) {
    pq[i], pq[j] = pq[j], pq[i]
}

// Push appends x, which must be an *Item. It is meant to be called through
// heap.Push, which restores the heap order afterwards.
func (pq *IpQueue) Push(x interface{}) {
    item := x.(*Item)
    *pq = append(*pq, item)
}

// Pop removes and returns the last element. It is meant to be called through
// heap.Pop, which moves the top item to the end first.
func (pq *IpQueue) Pop() interface{} {
    old := *pq
    n := len(old)
    item := old[n-1]
    old[n-1] = nil // drop the reference so the backing array does not pin the item
    *pq = old[0 : n-1]
    return item
}

// main counts the accesses per IP in one access-log file and prints the most
// frequent IPs followed by the total run time in milliseconds.
//
// The log file is taken from the first command-line argument. Without an
// argument the original hard-coded sample path is used. A file that cannot
// be opened or read aborts the run with exit status 1 instead of silently
// producing results from incomplete data.
func main() {
    runtime.GOMAXPROCS(2)
    timestamp := time.Now().UnixNano() / 1000000

    logPath := "/Users/admin/Downloads/api.immomo.com-access_10-01.log"
    if len(os.Args) > 1 {
        logPath = os.Args[1]
    }

    // Initialization
    initHeap()

    // Read the file serially and count every IP in the hash map.
    if err := ReadLine(logPath, processLine); err != nil {
        fmt.Fprintf(os.Stderr, "reading %s: %v\n", logPath, err)
        os.Exit(1)
    }

    // Spread the counters over the N shard heaps.
    handleHash()

    // Merge the top entries of every shard heap into the aggregate heap.
    polyHeap()

    // Print results
    printResult()

    fmt.Println(time.Now().UnixNano()/1000000 - timestamp)
}

Ending

Thanks to Xin Huimin (Momo) and Li Gengyong for their warm support and discussion.

Tags: Go Nginx Linux iOS less

Posted on Tue, 05 Nov 2019 06:12:24 -0500 by Brentley_11