numpy of data analysis

numpy of data analysis

Introduction to numpy

What is numpy?

A basic library for scientific calculation in python, which focuses on numerical calculation. It is also the basic library of most python scientific calculation libraries. It is mostly used to perform numerical operations on large and multi-dimensional arrays.

Data type related

import numpy as np
import random

# Build ndarrays from Python sequences; type() of every result is numpy.ndarray
t1 = np.array([1, 2, 3])
print(t1)
print(type(t1))

# np.array also accepts other sequences, here a range object
t2 = np.array(range(10))
print(t2)
print(type(t2))

# np.arange(start, stop, step): 4, 7, 10, 13, 16 (stop is exclusive)
t3 = np.arange(4, 18, 3)
print(t3)
print(type(t3))
print(t3.dtype)  # default integer dtype; the exact width depends on platform / numpy version

# Choose the element type explicitly with dtype=
t4 = np.array(range(1, 4), dtype="float32")
print(t4)
print(t4.dtype)

# bool dtype: non-zero values become True, zero becomes False
t5 = np.array([1, 0, 1, 0, 0, 0, 1, 1, 1], dtype=bool)
print(t5)
print(t5.dtype)

# Convert to another dtype with astype(); the result is bound to a new name, t5 is not reassigned
t6 = t5.astype("int8")
print(t6)
print(t6.dtype)

# Floating-point values: six random floats from random.random(), i.e. in [0.0, 1.0)
t7 = np.array([random.random() for i in range(6)])
print(t7)
print(t7.dtype)

# Round every element to 3 decimal places
t8 = np.round(t7, 3)
print(t8)

Array shape

import numpy as np

# Inspect the shape of an array
t1 = np.arange(12)
print(t1)
print(t1.shape)  # (12,): a 1-D array has a single axis

t2 = np.array([[1, 2, 3], [4, 5, 6]])
print(t2)
print(t2.shape)  # (2, 3)
# shape[0] is the number of rows, shape[1] the number of columns
print("row", t2.shape[0])
print("column", t2.shape[1])

t3 = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 9, 0]]])
print(t3)
print(t3.shape)  # (2, 2, 3)

# Change the shape with reshape(); the total element count (24) must stay the same
t4 = np.arange(24)
print(t4)

t5 = t4.reshape((4, 6))
print(t5)
print(t5.shape)

t6 = t4.reshape((3, 2, 4))
print(t6)
print(t6.shape)

# flatten() collapses the array back to one dimension, row by row
t7 = t6.flatten()
print(t7)
print(t7.shape)

Array calculation

When performing addition, subtraction, multiplication and division between arrays, the following principles shall be met:

Broadcasting rule: two arrays are broadcast-compatible if, comparing their shapes from the trailing (last) dimension backwards, each pair of axis lengths either matches or one of the two lengths is 1. Broadcasting is then carried out along the missing dimensions and/or the dimensions of length 1.

In short, the trailing dimensions of the two arrays' shapes must match (or be 1).

import numpy as np

# 4x6 array holding 10..33
t1 = np.arange(10, 34).reshape(4, 6)
print(t1)

# Array + scalar: the scalar is applied to every element
t2 = t1 + 2
print(t2)

t3 = np.arange(100, 124).reshape(4, 6)
# Arithmetic between two arrays of the same shape works element by element
t4 = t1 + t3
print(t4)
t5 = t1 - t3
print(t5)
t6 = t1 * t3
print(t6)
t7 = t1 / t3  # true division
print(t7)
t8 = t3 // t1  # floor division
print(t8)
t9 = np.arange(0, 6) ** 2  # element-wise power: 0, 1, 4, 9, 16, 25
print(t9)

# Arithmetic with an array that matches only the row length or only the column length:
# the smaller array is broadcast across every row (or every column)
a1 = np.arange(10, 34).reshape(4, 6)
a2 = np.arange(0, 6)  # shape (6,): same length as one row of a1
a3 = np.arange(4).reshape(4, 1)  # shape (4, 1): one value per row of a1
a5 = a1 - a2  # a2 is subtracted from each row
print(a5)
a6 = a1 - a3  # a3 is subtracted from each column
print(a6)

numpy transpose

Axis: in numpy, it can be understood as direction, which is expressed by 0,1,2... Numbers. For a one-dimensional array, there is only one 0 axis, for a two-dimensional array (shape(2,2)), there are 0 and 1 axes, and for a three-dimensional array (shape(2,2,3)), there are 0,1,2 axes.

Transposing: transpose(), .T, or swapaxes()

t1 = np.arange(12).reshape(3, 4)
# Three ways to transpose this 2-D array; each result has shape (4, 3)
t2 = t1.transpose()
t3 = t1.T
t4 = t1.swapaxes(1, 0)  # exchange axis 1 and axis 0

numpy index and slice

a1 = np.arange(48).reshape(6, 8)
print(a1)
# Take a single row (index 2, i.e. the third row)
print(a1[2])
# Take consecutive rows: index 4 through the last row
print(a1[4:])
# Take non-adjacent rows: every second row starting at index 2,
# then an explicit list of row indices
print(a1[2::2])
print(a1[[0, 3, 4]])

# Take columns: ":" keeps every row, the second index selects the columns
print("Take column: ", a1[:,1])
print("Take 3 columns and subsequent elements:\n", a1[:,2:])
print("Take 3,5,6 Columns:\n", a1[:,[2, 4, 5]])

# Take a single element: row index 2, column index 3 (third row, fourth column)
print(a1[2, 3])
print(type(a1[2,3]))  # a numpy integer scalar, e.g. <class 'numpy.int32'>; the exact width depends on platform / numpy version

# Take a rectangular sub-block: rows 3-5 and columns 2-4 counted from 1 (indices 2:5 and 1:4)
print(a1[2:5, 1:4])

# Take individual points by pairing the two index lists element-wise: (0, 0), (2, 1), (4, 5)
b = a1[[0,2,4], [0,1,5]]
print(b)

Numeric modification and bool index

Assign a value at the corresponding index slice

import numpy as np

a = np.arange(48).reshape(6, 8)
print(a)
# Comparing an array with a scalar yields a boolean array of the same shape
b1 = a < 10
print(b1)
# Boolean-mask assignment: every element where the condition holds is overwritten
a[a < 10] = 123
print(a)
# Paired index lists address individual points: (2, 0), (3, 1), (4, 3)
a[[2, 3, 4],[0, 1, 3]] = 1
print(a)

# Plain assignment does not copy: b2 is another name for the same array as a,
# so the two assignments below modify a as well
b2 = a
b2[b2 < 10] = 0
b2[b2 > 10] = 10  # elements exactly equal to 10 are left untouched
print(b2)

Ternary operator where

a = np.arange(48).reshape(6, 8)
print(a)
# Two-step version with boolean-mask assignment. It works on a copy so that
# `a` keeps its original values for the np.where() comparison below.
b = a.copy()
b[b < 20] = 0
b[b >= 20] = 100
print(b)
# That is: values below 20 become 0, values greater than or equal to 20 become 100.
# The same thing as a single "ternary" expression: np.where(condition, x, y)
# picks x where the condition is True and y elsewhere. It returns a NEW array
# and leaves its input untouched, so the result has to be assigned.
c = np.where(a < 20, 0, 100)
print(c)

Crop clip

a = np.reshape(np.arange(48), (6, 8))
print(a)
# Clamp the values into [15, 30]: anything below 15 becomes 15,
# anything above 30 becomes 30; `a` itself is left unchanged
b = np.clip(a, 15, 30)
print(b)

Array splicing

  • np.vstack((t1, t2)) # vertical splicing
  • np.hstack((t1, t2)) # horizontal splicing
t1 = np.arange(12).reshape(2, 6)
t2 = np.arange(24, 36).reshape(2, 6)
# vstack puts t2 underneath t1 -> shape (4, 6)
t3 = np.vstack([t1, t2])
# hstack puts t2 to the right of t1 -> shape (2, 12)
t4 = np.hstack([t1, t2])
# Print the inputs first, then both stacked results, each under its own heading
for _template, _array in (
    ("t1\n{}", t1),
    ("t2\n{}", t2),
    ("t1 and t2 Vertical splicing\n {}", t3),
    ("t1 and t2 Horizontal splicing\n {}", t4),
):
    print(_template.format(_array))

The direction of segmentation is just opposite to that of splicing

Row column exchange of array

Index assignment is enough

a = np.arange(48).reshape(6, 8)
print(a)
# Row exchange: swap the rows at index 1 and index 2.
# Note: a1 = a does not copy, so a1 is the same array as a and a changes too.
a1 = a
a1[[1, 2],:] = a1[[2, 1],:]
print(a1)
# Column exchange: swap the columns at index 3 and index 7.
# a2 is again the same array as a, so it already contains the row swap above.
a2 = a
a2[:,[3, 7]] = a2[:,[7, 3]]
print(a2)

Positions of the maximum / minimum, all-0 / all-1 arrays, identity matrix

  • Gets the location of the maximum and minimum values
    1. np.argmax(t, axis=0)
    2. np.argmin(t, axis=1)
  • Create an array of all zeros: np.zeros((3,4))
  • Create an array of all 1: np.ones((3,4))
  • Create a square array (identity matrix) with 1 on the diagonal: np.eye(3)
import numpy as np
import random

# 4x8 array of random integers drawn with random.randint(0, 100) (both ends included)
a = np.array([random.randint(0, 100) for _ in range(32)]).reshape(4, 8)
print(a)
# axis=0 searches down the rows, so there is one result per column (8 values):
# the row index at which that column reaches its maximum
a1 = np.argmax(a, axis=0)
print("Row index of each column's maximum:\n{}".format(a1))
# axis=1 searches along the columns, so there is one result per row (4 values):
# the column index at which that row reaches its maximum
a2 = np.argmax(a, axis=1)
print("Column index of each row's maximum:\n{}".format(a2))

# Create an array filled with 0
a3 = np.zeros((3, 4))
print("All 0 arrays:\n {}".format(a3))

# Create an array filled with 1
a4 = np.ones((3, 4))
print("Full 1 array:\n {}".format(a4))

# Create a square array with 1 on the diagonal and 0 elsewhere (identity matrix)
a5 = np.eye(6)
print("Identity matrix:\n {}".format(a5))

# Scaling the identity matrix puts -1 on the diagonal
a6 = a5 * (-1)
print(a6)

numpy generates random numbers

function | explanation
.rand(d0, d1, ..., dn) | Create an array of shape (d0, ..., dn) of uniformly distributed random floating-point numbers in the range 0-1
.randn(d0, d1, ..., dn) | Create an array of shape (d0, ..., dn) of random floating-point numbers drawn from the standard normal distribution (mean 0, standard deviation 1)
.randint(low, high, (shape)) | Draw random integers between the given lower and upper limits (low inclusive, high exclusive); the result has shape `shape`
.uniform(low, high, (size)) | Generate an array of uniformly distributed random numbers; low is the start value, high is the end value, size is the shape
.normal(loc, scale, (size)) | Draw random samples from the specified normal distribution; the centre of the distribution is loc (the mean), the standard deviation is scale, and the shape is size
.seed(s) | Random number seed, s is the given seed value. Because the computer generates pseudo-random numbers, the same random numbers can be generated each time by setting the same seed

copy and view of numpy

  • a = b does not replicate at all, and a and b interact
  • a = b [:], the operation of the view, a slice, will create a new object a, but the data of a is completely kept by b, and their data changes are consistent
  • a = b.copy(), copy, a and b do not affect each other
import numpy as np

np.random.seed(10)  # fixed seed so every run prints the same numbers
a = np.random.randint(10, 20, (5, 8))
print(a)

# A slice is a view: b is a new object, but it shares its data with a
b = a[3:, 3:]
print("b = a[3:, 3:]\n", b)
b += 1000  # in-place update, so the matching elements of a change too
print("a Change of: \n", a)
print("b Changes in:\n", b)

# copy() gives c its own data: changing c leaves a untouched
c = a.copy()
c -= 1000
print("c = a.copy() - 1000: \n", c)
print("a Value of:\n", a)

nan and inf in numpy

nan (NAN, NaN): "not a number", i.e. a value that is not a valid number

When will nan appear in numpy:

  • When we read the local file as float, nan will appear if it is missing
  • As an inappropriate calculation (such as infinity minus infinity)

inf (-inf, inf): infinity. inf means positive infinity, -inf means negative infinity

When will inf (either -inf or +inf) appear:

  • For example, a number divided by 0 (an error will be reported directly in python, and an inf or - inf in numpy)

How to make a nan or inf:

d = np.inf  # positive infinity
print(type(d))  # np.inf is a plain Python float
e = np.nan  # "not a number"
print(type(e))  # np.nan is a float as well

nan's attention points

  • Two nan are not equal

    np.nan != np.nan (True)

  • You can use the above characteristics to determine the number of nan in the array

    np.count_nonzero(t != t)

  • How to judge whether a number is nan? Judge by np.isnan(a) and return bool type. For example, you want to replace nan with 0

    t[np.isnan(t)] = 0

  • nan and any value calculation is nan

# nan is not equal to anything, not even to itself, so this prints True
print(np.nan != np.nan)
f = np.array([[1.9, 2.7, np.nan], [np.nan, np.nan, 4.9]])
print(f)
# f != f is True exactly at the nan positions, so counting the non-zero
# (True) entries gives the number of nan values (3 here)
nan_counts = np.count_nonzero(f != f)
print(nan_counts)

# Replace every nan with 0, working on a copy so f stays unchanged
f1 = f.copy()
f1[np.isnan(f1)] = 0
print(f1)

# Any arithmetic involving nan yields nan: every element of f2 becomes nan
f2 = f.copy()
f2 -= np.nan
print(f2)

Common statistical functions in numpy

effect | function
Sum | t.sum(axis=None)
Mean | t.mean(axis=None), greatly affected by outliers
Median | np.median(t, axis=None)
Maximum | t.max(axis=None)
Minimum | t.min(axis=None)
Range | np.ptp(t, axis=None), i.e. the difference between the maximum value and the minimum value
Standard deviation | t.std(axis=None)

By default (axis=None) the statistic is computed over all elements of the multidimensional array. If axis is specified, the statistic is computed along that axis and one result per remaining position is returned

import numpy as np

# 4x6 array holding 0..23
a = np.arange(24).reshape(4, 6)
print(a)
# axis=0 collapses the rows: one sum per column (6 values)
print("Summation:\n", a.sum(axis=0))
# No axis given: a single mean over all 24 elements
print("Find the mean: \n", a.mean())
# axis=1 collapses the columns: one mean per row (4 values)
print(a.mean(axis=1))
print("Median:\n", np.median(a, axis=1))
print("Maximum:\n", a.max())
print("Minimum:\n", a.min(axis=0))
# ptp = "peak to peak": maximum minus minimum, here computed for each row
print("Range:\n", np.ptp(a, axis=1))
print("standard deviation: \n", a.std())

Small exercise: replace each nan with the mean of its column

import numpy as np

def fill_ndarray(a):
    """Replace every NaN in a 2-D array with the mean of its column, in place.

    The mean of a column is computed from its non-NaN entries only. A column
    that contains no NaN is left untouched. A column that consists entirely
    of NaN has no values to average, so it is also left as it is (instead of
    taking the mean of an empty selection, which emits a RuntimeWarning).

    Args:
        a: 2-D numpy array; it is modified in place.

    Returns:
        The same array object ``a``, with the NaN entries filled.
    """
    for col_idx in range(a.shape[1]):
        # Basic slicing returns a view, so writes to `column` land in `a`.
        column = a[:, col_idx]
        # NaN is the only value that is not equal to itself.
        nan_mask = column != column
        nan_count = np.count_nonzero(nan_mask)
        if nan_count == 0 or nan_count == column.size:
            # Nothing to fill, or nothing to compute a mean from.
            continue
        column[nan_mask] = column[~nan_mask].mean()
    return a

if __name__ == '__main__':
    # Demo: build a 4x6 float array, blank out a 2x3 patch with nan,
    # then fill the patch with the column means.
    a = np.arange(24, dtype="float").reshape(4, 6)
    print(a)
    a[1:3, 2:5] = np.nan  # rows 1-2, columns 2-4
    print(a)
    a = fill_ndarray(a)
    print("assignment nan Column mean: \n", a)

summary

Tags: Python Machine Learning Data Analysis numpy

Posted on Thu, 30 Sep 2021 14:22:14 -0400 by dominant