0%

NumPy+Pandas+Matplotlib学习

NumPy基础

NumPy几个重要定义

  • axis:维度
  • rank:维度数
  • shape:各维度的元素数
  • size:数组内所有元素的数量

创建数组

1、创建一个自定义大小和内容的数组

1
np_array = np.array([[1,2,3,4],[5,6,7,8]])

2、创建一个全为0、长度为5的一维数组(`(5)`等同于整数5;若需要1*5的二维数组,应写成`(1,5)`)

1
zero_array = np.zeros((5))

3、创建一个全为1的4*4的数组

1
one_array = np.ones((4,4))

4、创建全为一个数的3*3数组

1
full_array = np.full((3,3),7)

5、创建一个空的2*3数组,数组值来自内存任意数

1
empty_array = np.empty((2,3))

6、改变输出数组的形状

1
2
x = np.array([[1,2,3],[4,5,6]])
y = np.reshape(x,(3,2))

7、将数组读取改为一维

1
# ravel() flattens the array to one dimension (ndarray has no method called "rival").
z = y.ravel()

注意,以上两种方法只是改变了数组的输出方式,并没有复制成为新的数组,如果改变y、z的内容,x内容也会相应改变。

8、复制数组到新的数组

1
array3 = array1.copy()

9、生成(0,1)随机数

1
random_array = np.random.random((5,5))

10、生成整数随机数

1
rand1 = np.random.randint(100,size=(5,2))

11、生成服从正态分布的随机数

1
rand3 = np.random.randn(100)

12、生成4阶(4*4)单位矩阵

1
eye_array = np.eye(4)

13、生成[1,5)区间内步长为1的等差数列(不包含5)

1
array = np.arange(1,5)

14、生成规定步长的等差数列

1
array = np.arange(1,5,0.5)

15、将[0,5]区间等分,取10个点(包含两个端点)

1
array = np.linspace(0,5,10)

16、自定义函数处理数组

1
2
3
4
def my_function(z, y, x):
    """Return ``x * y + z``; works on scalars and on index arrays alike."""
    return x * y + z


# np.fromfunction passes one index array per axis, each of shape (3, 2, 10),
# so array[z, y, x] == x * y + z.
array = np.fromfunction(my_function, (3, 2, 10))

索引

1、二维数组索引

1
2
3
4
5
6
7
# Indexing a 2-D array: single element, assignment, fancy indexing and slicing.
matrix1 = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(matrix1)
print(matrix1[0,2])  # element at row 0, column 2
matrix1[0,1] = 100  # overwrite the element at row 0, column 1
print(matrix1)
print(matrix1[[0,2,2],[0,1,0]])  # fancy indexing: elements (0,0), (2,1) and (2,0)
print(matrix1[(0,2),2:5])  # rows 0 and 2, columns from 2 on (the slice is clipped to the array width)

2、高维数组索引

1
2
3
4
5
6
# Indexing a 3-D array of shape (4, 5, 4); "..." stands for all remaining axes.
matrix_3d = np.arange(80).reshape(4,5,4)
print(matrix_3d)
print(matrix_3d[2,...])  # the whole (5, 4) sub-array at index 2 of the first axis
print(matrix_3d[0,2,...])  # row 2 of sub-array 0
print(matrix_3d[3,...,2])  # column 2 of sub-array 3
print(matrix_3d[3,:,2])  # same selection as the line above, written with an explicit slice

3、布尔索引

1
2
3
4
5
6
7
8
9
10
11
# Boolean indexing: a mask of the same shape selects the elements where it is True.
matrix3 = np.array([[1,2],[3,4],[5,6]])
print(matrix3)
bool_idx = (matrix3 > 2)  # element-wise comparison gives a boolean array
print(bool_idx)
print(matrix3[bool_idx])  # 1-D array of the elements greater than 2
print(matrix3%2 == 1)  # mask of the odd elements

matrix4 = np.arange(36).reshape(3,12)
rows_on = [True, False, True]
cols_on = [True, False, True, False, True, False, True, False, True, False, True, False]
print(matrix4[np.ix_(rows_on,cols_on)])  # np.ix_ combines the two masks: rows 0 and 2, every even column

4、提取数组元素

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
two_dimensional_list = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Iterating over a 2-D array yields one row per step.
for item in two_dimensional_list:
    print(item)

for row in two_dimensional_list:
    print(row)

# Iterating over the transpose yields the columns instead.
for column in two_dimensional_list.T:
    print(column)

three_dimentiional_list = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[10, 11, 12], [13, 14, 15], [16, 17, 18]]])

# .flat is a 1-D iterator that visits every single element.
for item in three_dimentiional_list.flat:
    print(item)

# Iterating over a 3-D array yields the 2-D sub-arrays along the first axis.
for item in three_dimentiional_list:
    print(item)

数学函数、统计、线性代数

1、Arithmetic operations:+ - * / // % **

1
2
3
4
5
6
7
8
# Element-wise arithmetic on two arrays of the same shape.
matrix1 = np.array([[1, 2, 3], [4, 5, 6]])
matrix2 = np.array([[4, 5, 6], [7, 8, 9]])
print(np.add(matrix1, matrix2)) # element-wise sum (+)
print(np.subtract(matrix1, matrix2)) # element-wise difference (-)
print(np.multiply(matrix1, matrix2)) # element-wise product (*), not a matrix product
print(np.floor_divide(matrix1, matrix2)) # element-wise floor division (//), not true division
print(np.power(matrix1, matrix2)) # element-wise power (**)
print(np.mod(matrix1, matrix2)) # element-wise remainder (%)

2、function

1
2
3
4
5
6
7
8
9
10
11
rand_matrix = np.array([[20,15.4,-12.8,-1.1,-8.8],[-18.3,-19.6,20.2,-15.5,43.1]])
print(np.isnan(rand_matrix))  # element-wise test for NaN
print(np.sign(rand_matrix))  # -1, 0 or 1 depending on the sign of each element
print(np.nonzero(rand_matrix))  # indices (one array per axis) of the non-zero elements
print(np.ceil(rand_matrix))  # round up to the nearest integer
print(np.cumsum(rand_matrix))  # cumulative sum over the flattened array
print(np.diff(rand_matrix))  # differences between neighbours along the last axis
# The square root of a negative float is NaN and raises a RuntimeWarning,
# so take the root of the absolute values instead.
abs_roots = np.sqrt(np.abs(rand_matrix))
print(abs_roots)
print(np.square(rand_matrix))  # element-wise square
print(np.sin(rand_matrix))  # element-wise sine
print(np.sort(rand_matrix))  # sort each row (last axis) in ascending order

3、统计

1
2
3
4
5
6
7
8
9
10
# Basic statistics on a (2, 5) array holding the integers -5 .. 4.
matrix = np.arange(-5,5).reshape(2,5)
print(matrix.mean()) # mean of all elements
print(np.median(matrix)) # median of all elements
print(np.argmax(matrix)) # index of the maximum in the flattened array
print(np.argmin(matrix)) # index of the minimum in the flattened array
print(np.sum(matrix)) # sum of all elements
print(np.abs(matrix)) # element-wise absolute value
print(np.max(matrix)) # largest element
print(np.sum(matrix,axis=0)) # sum down each column
print(np.sum(matrix,axis=1)) # sum across each row

4、线性代数

1
2
3
4
5
6
7
8
9
# np.arange(16).reshape(4, 4) on its own is singular (rank 2, determinant 0) and
# therefore has no inverse; adding the identity makes it invertible (determinant -49).
matrix1 = np.arange(16).reshape(4,4) + np.eye(4)
matrix2 = np.arange(8).reshape(2,4)
print(matrix1.T)  # transpose
print(matrix1.dot(matrix2.T))  # matrix product: (4, 4) x (4, 2) -> (4, 2)
print(np.linalg.inv(matrix1))  # inverse matrix
print(np.linalg.det(matrix1))  # determinant
eigenvalues, eigenvectors = np.linalg.eig(matrix1)  # eigenvalues and eigenvectors (as columns)
print(eigenvalues)
print(eigenvectors)

5、求解线性方程

1
2
3
4
# Solve the linear system 2x + 6y = 6, 5x + 3y = -9.
coeffs = np.array([[2,6],[5,3]]) # coefficient matrix
depvars = np.array([6,-9]) # right-hand-side constants
solution = np.linalg.solve(coeffs,depvars) # solve coeffs @ solution == depvars
print(solution)

合并、分割、广播

1、交换维度

1
2
3
4
5
# Reordering the axes of an array.
m1 = np.ones((1,2,3))
print(np.transpose(m1,(1,0,2))) # swap the first two axes: shape (1, 2, 3) -> (2, 1, 3)
m2 = np.ones((2,3,4,5))
print(np.transpose(m2)) # without an axes argument the axis order is reversed: shape (5, 4, 3, 2)
m3 = m2.swapaxes(1,2) # exchange axes 1 and 2: shape (2, 4, 3, 5)

2、合并

1
2
3
4
5
6
7
8
9
10
11
12
13
# Joining two (1, 5) arrays in different ways.
array1 = np.full((1,5),1.0)
array2 = np.full((1,5),2.0)

stake_array = np.vstack((array1,array2)) # stack vertically (row-wise): shape (2, 5)
print(stake_array)
stake_array = np.hstack((array1,array2)) # stack horizontally (column-wise): shape (1, 10)
print(stake_array)
stake_array = np.concatenate((array1,array2),axis=0) # join along axis 0 (same result as vstack here)
print(stake_array)
stake_array = np.concatenate((array1,array2),axis=1) # join along axis 1 (same result as hstack here)
print(stake_array)
stake_array = np.stack((array1,array2)) # join along a NEW first axis: shape (2, 1, 5); inputs must share one shape
print(stake_array)

3、分割

1
2
3
4
5
6
# Splitting an array into equal parts, and repeating an array with tile.
big_matrix = np.arange(36).reshape(9,4)
m1,m2,m3 = np.vsplit(big_matrix,3) # split by rows into three (3, 4) arrays
m4,m5 = np.hsplit(big_matrix,2) # split by columns into two (9, 2) arrays

m1 = np.arange(4)
m2 = np.tile(m1,(4,1)) # repeat m1 4 times along the first axis and once along the second: shape (4, 4)

4、广播。广播的前提:两个数组的形状必须能够转化为一致,才能进行逐元素计算。 - 规则1、如果两个数组的维度数不同,那么维度数较少的数组的形状会在最左边补1 - 规则2、如果两个数组的形状在某个维度上不匹配,但其中一个数组在该维度上的大小为1,那么该数组会沿这个维度扩展,以匹配另一个数组的形状 - 规则3、如果两个数组的形状在某个维度上不匹配,且两者在该维度上的大小都不为1,则引发异常

1
2
3
4
5
6
7
8
# Broadcasting: the smaller operand is stretched to the shape of the larger one.
m2 = np.arange(6).reshape(2,3)
print(m2+[[10],[20]]) # (2, 1) operand is stretched to [[10,10,10],[20,20,20]]
print(m2+[10,20,30]) # (3,) operand is stretched to [[10,20,30],[10,20,30]]
print(m2+10) # scalar is stretched to [[10,10,10],[10,10,10]]

m3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
m4 = np.array([1,0,1])
print(m3+m4) # m4 is stretched to 4 rows x 3 columns and then added

Pandas基础

Series和DataFrame

1、Series

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Building pandas Series from an array and from a dict.
random_items = np.random.randint(25,size=10)    # 10 random integers in [0, 25)
series_data = pd.Series(random_items) # wrap the array in a Series (default integer index)
print(series_data[0]) # element whose index label is 0

letter_index = ['a','b','c','d','e','f','g','h','i','j'] # custom index labels
new_series = pd.Series(random_items,index=letter_index) # same data with the custom index
print(new_series) # show the Series


data_dict = {'a':1,'b':2,'c':3,'d':4,'e':5} # source dictionary
dict_series = pd.Series(data_dict) # dict keys become the index
print(dict_series) # show the Series
dict_series = pd.Series(data_dict,name = 'series_name') # same, with a name attached
print(dict_series) # show the Series
dict_series = dict_series.rename('new_name') # rename() returns a renamed Series
print(dict_series) # show the Series
print(dict_series.median()) # median of the values
print(dict_series>dict_series.median()) # boolean mask: True where the value exceeds the median

2、DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
d = {'one':[1,2,3,4]
,'two':[4,3,2,1]} # 创建一个字典
dict_df = pd.DataFrame(d) # 将字典转换为数据框
print(dict_df)
print(dict_df['one']) # 输出one列
print(dict_df.shape) # 输出数据框的行数和列数

d_data = {
'one':pd.Series([1,2,3,],name = 'col_one',index = ['a','b','c']),
'two':pd.Series([1,2,3,4],name = 'col_two',index = ['a','b','c','d'])
}
df = pd.DataFrame(d_data)
print(df)
new_df = df.reset_index(drop = True) # drop = True 丢弃原来的索引
print(new_df)
print(df)
df.reset_index(drop = True,inplace = True) # drop = True 丢弃原来的索引
df.reset_index(drop = False,inplace = True) # drop = False 保留原来的索引
3、大型表格索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Raw string: "\D" and "\c" are not valid escape sequences, so the backslashes of
# the Windows path must be taken literally.
country_info = pd.read_csv(r'D:\Downloads\country.csv')
print(country_info.columns)  # column names
print(country_info.head(3))  # first 3 rows
print(country_info.head())  # first 5 rows (default)
print(country_info.tail())  # last 5 rows (default)
print(country_info['Region'])  # a single column
print(country_info[['Region','Country']])  # several columns
print(country_info.iloc[100])  # one row by position
print(country_info.iloc[[2,3]])  # several rows by position
print(country_info.iloc[[2,3],[0,3]])  # rows and columns by position
country_info['Country'] = country_info['Country'].str.strip()  # trim leading/trailing whitespace
country_info.set_index(['Country'],drop=True,inplace=True)  # use Country as the index
print(country_info.loc['China'])  # one row by label
print(country_info.loc[['China','India']])  # several rows by label
print(country_info.loc[['China','India'],['Region','Population']])  # rows and columns by label
print(country_info.loc['China':'India','Region':'Deathrate'])  # label slices include both end points

  • HOMEWORK

使用read_csv获取country数据,读取前十行的数据,只选取Country、Birthrate和Service,将其中的数据变成DataFrame,并使用to_csv函数将结果存到本地的country.csv文件中。

1
2
3
4
5
6
# Raw string keeps the backslashes of the Windows path literal.
country_info = pd.read_csv(r'D:\Downloads\country.csv')
top_ten = country_info.head(10)  # the exercise asks for the first ten rows only
print(top_ten)
df_country = top_ten[['Country','Birthrate','Service']]  # selecting a list of columns already gives a DataFrame
print(df_country)
# NOTE(review): this writes over the file that was just read; use another file name
# if the full data set is still needed afterwards.
df_country.to_csv(r'D:\Downloads\country.csv',index=False)  # save without the index column

Filtering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def _as_number(column):
    """Return ``column`` parsed as floats; cells that cannot be parsed become NaN."""
    # NOTE(review): assumes decimals are written with a comma (e.g. "0,23") - confirm against the CSV.
    return pd.to_numeric(column.astype(str).str.replace(',', '.', regex=False), errors='coerce')


# Raw string keeps the backslashes of the Windows path literal.
country_info = pd.read_csv(r"D:\Downloads\country.csv")

# Comparing text with '<' or '>' is lexicographic ('9' > '1000'), so the columns
# are converted to numbers before any threshold is applied.
net_migration = _as_number(country_info['Net migration'])
death_rate = _as_number(country_info['Deathrate'])

print(net_migration == 0)  # boolean mask: net migration equal to 0
zero_migration_filter = (net_migration == 0)
print(country_info[zero_migration_filter])  # rows with zero net migration

print(country_info.loc[zero_migration_filter,['Region','Country','Net migration']])  # same rows, selected columns
low_migration_filter = (net_migration < 100)  # net migration below 100
print(country_info.loc[low_migration_filter,['Region','Population']])
and_filter = (death_rate > 1000) & (country_info['Population'] < 10000)  # both conditions hold
print(country_info.loc[and_filter,['Region','Population']])
or_filter = (death_rate > 1000) | (country_info['Population'] < 10000)  # at least one condition holds
print(country_info.loc[or_filter,['Region','Population']])
population_filter = (country_info['Population'] > 100000)  # population above 100000
print(country_info.loc[~population_filter,['Region','Population']])  # "~" negates the mask: population <= 100000

countries = ['China','Japan','United States','Russia','India']
infilter = country_info['Country'].isin(countries)  # membership test against the list
print(country_info.loc[infilter,['Region','Population']])
str_filter = country_info['Country'].str.contains("A")  # names containing an upper-case "A"
print(country_info.loc[~str_filter,['Region','Population']])  # names WITHOUT an upper-case "A"
str_filter2 = country_info['Country'].str.contains("A|Z")  # regex: upper-case "A" or "Z"
print(country_info.loc[str_filter2,['Region','Population']])
str_filter3 = country_info['Country'].str.contains("[a-m]")  # regex: any LOWER-case letter from "a" to "m"
print(country_info.loc[str_filter3,['Region','Population']])

排序和增删查改

1、排序搜索

1
2
3
4
5
6
7
8
9
10
11
12
# Raw string: "\D" and "\s" are not valid escape sequences.
survey_df = pd.read_csv(r"D:\Downloads\small_survey_results.csv")

survey_df.set_index('Respondent', inplace=True)  # use Respondent as the index
print(survey_df.sort_index())  # sort by index
print(survey_df.sort_values(by='Age',ascending = False))  # sort by Age, largest first
# Sort by Age descending; ties are broken by YearsCode ascending.
print(survey_df.sort_values(by=['Age','YearsCode'],ascending = [False,True])[['Age','YearsCode']])
# Same ordering: 0 and 1 are accepted in place of False and True.
print(survey_df.sort_values(by=['Age','YearsCode'],ascending = [0,1])[['Age','YearsCode']])
print(survey_df['Age'].sort_values())  # sort a single column
print(survey_df['ConvertedComp'].nlargest(10))  # 10 largest values of the column
print(survey_df['ConvertedComp'].nsmallest(10))  # 10 smallest values of the column
richest_users = survey_df.nlargest(10,'ConvertedComp')  # full rows of the 10 largest ConvertedComp values
print(richest_users[['ConvertedComp','DevType','EdLevel']])  # show three columns of those rows

2、增删改查

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Raw string: "\D" and "\s" are not valid escape sequences.
survey_df = pd.read_csv(r"D:\Downloads\small_survey_results.csv")

# items() yields (column name, column Series) pairs; only the first one is shown.
for col_name, col_data in survey_df.items():
    print(col_name)
    print(col_data)
    break

# iterrows() yields (index label, row Series) pairs; only the first one is shown.
for row_index, row_data in survey_df.iterrows():
    print(row_index)
    print(row_data)
    break

# Transpose: every row becomes a column (building from items() would only
# recreate the original frame). Equivalent to survey_df.T.
inversed_df = pd.DataFrame({idx: values for idx, values in survey_df.iterrows()})
print(inversed_df.shape)

# NOTE(review): rename() silently skips keys that are not column names - confirm
# that 'Orgsize' matches the CSV header exactly.
survey_df.rename(columns={'Age': 'user age', 'Orgsize': 'organization size'}, inplace=True)
print(survey_df.columns)

survey_df.columns = survey_df.columns.str.replace(' ', '_')  # spaces -> underscores
print(survey_df.columns)

survey_df.columns = [col.lower() for col in survey_df.columns]  # lower-case every column name
print(survey_df.columns)

print(survey_df['jobfactors'].str.lower())  # lower-case the strings of one column

survey_df.loc[2, 'trans'] = 'Yes'  # set a single cell
print(survey_df.loc[2])

survey_df.loc[2, ['trans', 'user_age', 'country']] = ['Yes', 29, 'China']  # set several cells of one row
print(survey_df.loc[2])

survey_df.loc[survey_df['user_age'] < 18, ['age_group']] = 'young'  # conditional assignment
print(survey_df.loc[survey_df['user_age'] < 18])

survey_df.loc[survey_df['user_age'] >= 18, ['age_group']] = 'adult'
print(survey_df.loc[survey_df['user_age'] >= 18])

survey_df['gen_col'] = survey_df['gender'] + survey_df['sexuality'] + survey_df['trans']  # new column from others
print(survey_df['gen_col'])

survey_df['job_factors'] = survey_df['jobfactors'].str.split(';')  # each cell becomes a list
print(survey_df['job_factors'])

# expand=True spreads the pieces over separate columns.
# NOTE(review): the assignment needs exactly three resulting columns - confirm against the data.
survey_df[['job-fac1', 'job-fac2', 'job-fac3']] = survey_df['jobfactors'].str.split(';', expand=True)
print(survey_df[['job-fac1', 'job-fac2', 'job-fac3']])

# Adding rows: pd.concat returns a NEW frame (DataFrame.append was removed in
# pandas 2.0 and _append is private), so the result has to be kept or printed.
extra_row = pd.DataFrame([{'user_age': 30, 'country': 'China'}])
print(pd.concat([survey_df, extra_row], ignore_index=True))

new_survey = {'user_age': 25, 'country': 'China', 'age1stcode': 24}
new_survey_df = pd.DataFrame(new_survey, index=[0])
print(pd.concat([survey_df, new_survey_df], ignore_index=True, sort=False))

# drop(..., inplace=True) returns None, so drop first and print the frame afterwards.
survey_df.drop(columns=['soaccount'], inplace=True)
print(survey_df.columns)

print(survey_df.drop(index=4))  # drop one row by label (returns a copy)

age_filter = survey_df['user_age'] < 50
print(survey_df.drop(index=survey_df[age_filter].index))  # drop every row matching the condition

3、apply,map,applymap,replace

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Raw string: "\D" and "\s" are not valid escape sequences.
survey_df = pd.read_csv(r"D:\Downloads\small_survey_results.csv")
dtypes = survey_df.dtypes  # dtype of every column
str_cols = [col_name for col_name in dtypes.index if dtypes[col_name] == 'object']  # names of the object columns
survey_df[str_cols] = survey_df[str_cols].astype(str)  # make every cell a str (missing values become 'nan')

print(survey_df['Hobbyist'].apply(len))  # Series.apply calls the function once per value


def upper_case(col_value):
    """Return ``col_value`` converted to upper case."""
    return col_value.upper()


print(survey_df['Hobbyist'].apply(upper_case))  # upper-case every value

survey_df['Hobbyist'] = survey_df['Hobbyist'].apply(lambda x: x.upper())  # same with a lambda, stored back
print(survey_df['Hobbyist'])

print(survey_df.apply(len, axis='columns'))  # DataFrame.apply over rows: number of fields in each row
print(survey_df.apply(lambda x: x.min()))  # DataFrame.apply over columns: minimum of each column
map_dict = {'YES': True, 'NO': False}
print(survey_df['Hobbyist'].map(map_dict))  # map: values missing from the dict become NaN

# NOTE(review): pandas 2.1+ deprecates applymap in favour of DataFrame.map.
print(survey_df[str_cols].applymap(len))  # applymap calls the function on every cell

map_dict = {'YES': True}
print(survey_df['Hobbyist'].replace(map_dict))  # replace: values missing from the dict are left unchanged

聚合、分组、数据清理

1、聚合、分组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Raw string: "\D" and "\s" are not valid escape sequences.
survey_df = pd.read_csv(r"D:\Downloads\small_survey_results.csv")

print(survey_df['ConvertedComp'].median())  # median of the column
print(survey_df['ConvertedComp'].describe())  # summary statistics
print(survey_df['Hobbyist'].value_counts(normalize=True))  # normalize=True gives proportions instead of counts
# Group by a plain column name: with a one-element list, newer pandas versions
# expect a tuple key in get_group().
country_groups = survey_df.groupby('Country')
print(country_groups.get_group('India'))  # rows of one group
print(survey_df.groupby('Country').count())  # non-null count per column and country
print(survey_df.groupby('Country').get_group('China')['OpSys'].value_counts())  # OpSys counts for China
print(country_groups['OpSys'].value_counts().loc['China'])  # same result through the grouped value_counts
print(country_groups['ConvertedComp'].median().loc['Germany'])  # median compensation for Germany
print(country_groups['ConvertedComp'].agg(['median','mean']))  # several aggregates at once
country_filter = survey_df['Country'] == 'United States'
print(survey_df.loc[country_filter, 'LanguageWorkedWith'].str.contains('Python').sum())  # Python users in the US
country_groups = survey_df.groupby('Country').sum(numeric_only=True)  # per-country sums of the numeric columns only
print(country_groups)

country_groups = survey_df.groupby('Country')
# Name both Series explicitly so the resulting column names do not depend on the
# pandas version (value_counts() names its result differently across versions).
country_users_python = country_groups['LanguageWorkedWith'].apply(
    lambda x: x.str.contains('Python').sum()).rename('NumOfPythonUsers')
country_respondents = survey_df['Country'].value_counts().rename('NumOfUsers')
concated_df = pd.concat([country_respondents, country_users_python], axis='columns', sort=False)  # align on country
print(concated_df)

2、数据清理

1
2
3
4
5
6
7
8
9
10
11
12
# Raw string: "\D" and "\s" are not valid escape sequences.
survey_df = pd.read_csv(r"D:\Downloads\small_survey_results.csv")
# The missing-value demos work on a copy so that the in-place replacement below
# leaves survey_df untouched.
small_survey_df = survey_df.copy()

print(small_survey_df.dropna())  # drop rows containing any missing value
print(small_survey_df.dropna(axis='index',how='all'))  # drop rows where every value is missing
print(small_survey_df.dropna(axis='index',how='all',subset=['CompFreq','CompTotal']))  # drop rows missing both columns
small_survey_df.replace('No',np.nan,inplace=True)  # treat 'No' as a missing value
print(small_survey_df.isna())  # boolean mask of the missing values
print(small_survey_df.fillna('Missing'))  # fill missing values with the text 'Missing'
# Assign the result back: an in-place replace on a selected column is chained
# assignment and may not update the frame.
survey_df['YearsCode'] = survey_df['YearsCode'].replace('Less than 1 year',0)
print(survey_df)
# NOTE(review): astype(float) raises if other non-numeric labels remain in the column - confirm.
survey_df['YearsCode'] = survey_df['YearsCode'].astype(float)
print(survey_df['YearsCode'].mean())  # mean years of coding

3、实际应用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# survey_df is the survey DataFrame loaded in the previous examples.
framework_df = survey_df['WebframeWorkedWith'].str.split(';', expand=True)  # one framework per cell
framework_df.fillna('None', inplace=True)  # placeholder text for the empty cells
country_groups = survey_df.groupby('Country')
district_frameworks = np.unique(framework_df.values)  # every distinct framework name (plus 'None')

framework_sum_array = []
for framework in district_frameworks:
    # regex=False: names such as "ASP.NET" must be matched literally, not as patterns.
    new_df = country_groups['WebframeWorkedWith'].apply(
        lambda x: x.str.contains(framework, regex=False).sum())  # users of this framework per country
    new_df.name = framework
    framework_sum_array.append(new_df)

# Name the counts 'Country' explicitly; the default name of value_counts()
# differs between pandas versions.
user_count = survey_df['Country'].value_counts().rename('Country')
concated_df = pd.concat([user_count] + framework_sum_array, axis='columns')  # one column per framework
print(concated_df)

# idxmax(axis=1) returns, for each country, the column label holding the largest count.
most_popular_df = concated_df.drop(columns=['Country', 'None']).idxmax(axis=1)
most_popular_df.name = 'most_popular_framework'
# The selected column must use the same spelling as the Series name above.
final_df = concated_df.join(most_popular_df)[['Country', 'most_popular_framework']]

横向合并和纵向合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
movies = pd.DataFrame({'movie_id':[1,2,3,5,7],
                       'title':['t1','t2','t3','t5','t7'],
                       'description':['d1','d2','d3','d5','d7']})
ratings = pd.DataFrame({'user_id':[1,2,7,9,11],
                        'movie_id':[1,2,3,5,6],
                        'title':['t1','t2','t3','t5','t6'],
                        'rating':[2,3,1,5,4],
                        'time':['t1','t2','t4','t4','t1']})
print(pd.merge(movies,ratings))  # default: inner join on all shared columns (movie_id, title)
print(pd.merge(movies,ratings,on=['movie_id','title']))  # explicit join keys
print(pd.merge(movies,ratings,left_on='movie_id',right_on='user_id'))  # differently named keys on each side
print(pd.merge(movies,ratings,left_index=True,right_index=True))  # join on the row index of both frames
print(pd.merge(movies,ratings,on=['movie_id'],suffixes=('_left','_right')))  # suffixes for the clashing 'title' column
print(pd.merge(movies,ratings,on=['movie_id','title'],how='outer',indicator='indicator'))  # outer join + origin column
# join() matches movies['movie_id'] against the INDEX of ratings, not its movie_id column.
print(movies.join(ratings,on='movie_id',lsuffix='_left',rsuffix='_right'))
print(pd.concat([movies,ratings]))  # stack rows, keeping the original index labels
print(pd.concat([movies,ratings],ignore_index=True))  # stack rows with a fresh 0..n-1 index
print(pd.concat([movies,ratings],join='inner',axis=1))  # place side by side, aligned on the index
# DataFrame.append was removed in pandas 2.0 and _append is private; pd.concat is
# the public way to stack several frames.
combined = pd.concat([movies, ratings, movies], ignore_index=True)
print(combined)

Pandas数据可视化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
usa_city_population = pd.DataFrame({
    'population': [8175133,3792621,2695598,2100263,19354922],},
    index=['New York','Los Angeles','Chicago','Houston','Phoenix'])
axs = usa_city_population.plot.pie(y='population')  # pie chart
china_city_population = pd.DataFrame(
    {'population': [15773658,2180357, 1208360, 1035837, 10039107]},
    index=['广州','上海','北京','天津','重庆'])
axs = china_city_population.plot.pie(y='population',figsize = (5,5))  # pie chart with an explicit size

top_city_population = pd.DataFrame({
    'usa':usa_city_population['population'].values,  # column label is shown in the legend
    'china':china_city_population['population'].values},
    index=['top1','top2','top3','top4','top5'])
axs = top_city_population.plot.bar(rot=0)  # grouped bar chart, one group per row
top_city_population.reset_index().plot.bar(x='index',y=['china'],rot=0)  # bar chart of a single column

data=pd.Series(np.random.randn(1000),index=np.arange(1000))
axs = data.cumsum().plot()  # line chart of a random walk
data=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=['A','B','C','D'])

data=data.cumsum()
# Draw the first scatter once and keep its axes so the second one can share them.
ax1 = data.plot.scatter(x='A',y='B',color='Green',label = 'Class1')
data.plot.scatter(x='A',y='C',color='Red',label = 'Class2',ax=ax1)  # overlay on the same axes

data.plot.box()  # box plot of every column

data = np.abs(data)
axs = data.plot.area(figsize=(12,4),subplots=True)  # one area chart per column
plt.show()

Matplotlib基础

figure

1
2
3
4
5
6
7
8
x1 = np.linspace(-5,5,50)

y1 = 2*x1
y2 = x1**2-10

plt.figure(figsize=(5,6))
plt.plot(x1,y1)
plt.show()
plot
1
2
plt.plot(x1,y2,color="green",linewidth=2,linestyle='--',label='y=x^2-10') # 绘制y=x^2-10曲线
plt.show()
plot
1
2
3
4
5
# 画两条曲线
fix,axe = plt.subplots()
axe.plot(x1,y1,label='y=2x')
axe.plot(x1,y2,label='y=x^2-10')
plt.show()
subplot
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 子图
fig=plt.figure()
plot=fig.add_subplot(121) # 1行2列第1个
plot.plot(x1,y1)
plot=fig.add_subplot(122) # 1行2列第2个
plot.plot(x1,y2)
plt.show()

#另一种方法
plt.subplot(1,2,1)
plt.plot(x1,y1)
plt.subplot(1,2,2)
plt.plot(x1,y2)
plt.show()
subplot

坐标轴和边框

1
2
3
4
5
6
7
8
9
x1 = np.linspace(-5,5,50)
y1 = 2*x1
y2 = x1**2-10

plt.plot(x1,y1)
plt.plot(x1,y2)

plt.title("Title",color='red',fontsize=20)
plt.show()
标题
1
2
3
plt.xlabel('x轴')
plt.ylabel('y轴')
plt.show()
坐标轴标题
1
2
3
4
5
6
7
8
9
10
plt.xlim((-2,2))
plt.ylim((-5,10))
plt.show()

#另一种方法
fig,axe=plt.subplots()
axe.plot(x1,y1)
axe.plot(x1,y2)
axe.set_xlim(-2,2)
axe.set_ylim(-5,10)
限制坐标轴范围
1
2
3
4
5
6
7
8
#设置刻度线和边框
fig,axe=plt.subplots()
axe.plot(x1,y1)
axe.plot(x1,y2)
axe.set_xlim(-2,2)
axe.set_ylim(-5,10)
axe.set_xticks(np.linspace(-2,2,3)) # 设置x轴刻度
axe.tick_params(direction='in',length=5,width=2,color='b') # 设置刻度线的方向、长度、宽度和颜色
设置刻度线和边框
1
2
3
4
5
6
7
fig,axe=plt.subplots()
axe.plot(x1,y1)
axe.plot(x1,y2)
axe.spines['left'].set_position(('axes',0.5))
axe.spines['bottom'].set_position(('axes',0.5)) # 设置坐标轴位置
axe.spines['right'].set_color('none')
axe.spines['top'].set_color('none') # 隐藏坐标轴
设置边框

图例、标注

1
2
3
plt.plot(x1,y1,label='y=2x')
plt.plot(x1,y2,label='y=x^2-10')  # label matches the plotted curve y2 = x1**2 - 10
# loc=0 picks the best position automatically; the other arguments set a title,
# a shadow, a two-column layout and a gray background.
plt.legend(loc=0,title="legend title",shadow=True,ncol=2,facecolor='gray')
设置图例
1
2
3
4
5
6
#text无指向性标注
plt.plot(x1,y1)
plt.plot(x1,y2)
plt.text(-1,5,"two functions",family = "Times New Roman",fontsize = 12,style = "italic",color = "r",weight = "black",bbox = dict(boxstyle="round",facecolor = "none",ec = "b"))

plt.show()
无指向性标注
1
2
3
4
#annotate指向性标注
plt.plot(x1,y1)
plt.plot(x1,y2)
plt.annotate("y=2x",xy=(1,2),xytext=(2,0),arrowprops=dict(arrowstyle="->",linestyle="--",connectionstyle="arc3,rad=.5"),bbox=dict(boxstyle="round,pad=0.5",fc="none",ec="gray")) #xytext是注释的位置,xy是指向的位置
指向性标注

多图合并、折线图、散点图、柱状图、直方图、面积图、堆叠面积图、箱型图、饼图、热力图、3D图

多图合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
plt.figure(figsize=(8, 5))
plt.subplot(2, 3, 2) # 将窗口分为2行3列,当前位置为2
plt.plot([0, 10], [0, 10])
plt.title("2,3,2")
plt.subplot(2, 3, 4)
plt.plot([0, 10], [0, 10])
plt.title("2,3,4")
plt.subplot(2, 3, 5)
plt.plot([0, 10], [0, 10])
plt.title("2,3,5")
plt.tight_layout() # 紧凑图像

#另一种方法
fig, axe = plt.subplots(nrows = 2, ncols = 2)
axe[0][1].plot([0, 10], [0, 10])
plt.show()

# 修改高度
fig, axe = plt.subplots(nrows = 2, ncols = 2, figsize = (10, 4)) # inches (width, height)
axe[1][1].plot([0, 10], [0, 10])
plt.show()

# 修改高度
fig, axe = plt.subplots(nrows = 2, ncols = 1)
axe[1].plot([0, 10], [0, 10])
plt.show()

# 绘制多图
fig, axe = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
plt.tight_layout()
axe[0][0].set_title("1st subplot")
axe[1][1].set_title("4th subplot")
axe[1][0].plot([0,10], [0, 10])
plt.tight_layout() # 紧凑图像
plt.show()

# 改变宽度
# Top row: two cells of a 2x2 grid.
plt.subplot(2, 2, 1)
plt.title("2,2,1")
plt.plot([0, 10], [0, 10])

plt.subplot(2, 2, 2)
plt.plot([0, 10], [0, 10])
plt.title("2,2,2")  # title states the grid position of this subplot

# Bottom row: second cell of a 2x1 grid, so it spans the full width.
plt.subplot(2, 1, 2)
plt.plot([0, 10], [0, 10])
plt.title("2,1,2")

plt.tight_layout()  # reduce overlap between the subplots
plt.show()

#使用GridSpec
# A 2x2 grid whose columns have widths 1:2 and whose rows have heights 3:1.
from matplotlib.gridspec import GridSpec
fig = plt.figure(dpi=100) # the resolution in dots per inch
gs = GridSpec(2, 2, width_ratios=[1, 2], height_ratios=[3, 1])

ax1 = fig.add_subplot(gs[0]) # gs[0] is the first cell; gs[0,:] would span the whole first row, gs[:,0] the whole first column
ax1.text(0.5, 0.5, "1st plot",verticalalignment='center', ha='center')
ax1.plot([0,1], [0, 1])
ax2 = fig.add_subplot(gs[1])
ax2.text(0.5,0.5,"2nd plot",verticalalignment='center', ha='center')
ax3 = fig.add_subplot(gs[2])
ax3.text(0.5,0.5,"3rd plot",verticalalignment='center', ha='center')
ax4 = fig.add_subplot(gs[3])
ax4.text(0.5,0.5,"4th plot",verticalalignment='center', ha='center')
plt.show()

# 图中图
x = np.linspace(-3, 3, 100)
y = np.sin(x)

fig, ax1 = plt.subplots()
ax1.plot(x, y, 'orange')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_title('Big 1')

# add_axes takes [left, bottom, width, height] as fractions of the figure size.
left, bottom, width, height = 0.25, 0.6, 0.2, 0.2
ax2 = fig.add_axes([left, bottom, width, height])
# Plot data with the same length as x; y2 from the earlier examples was built
# from a 50-point grid and does not match this 100-point x.
ax2.plot(x, y, 'g')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('small 1')

# plt.axes creates a second inset and makes it the current axes.
plt.axes([0.65, 0.2, 0.2, 0.25])
plt.plot(x, x ** 2, 'r')
plt.xlabel('x')
plt.ylabel('y')
plt.title('small 2')
plt.show()

折线图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#折线图
x = np.linspace(-5, 5, 25)

fig, axe = plt.subplots(figsize= (8, 5))
axe.plot(x, x + 1, linestyle='-', color='r', marker='x', label="l1") # solid line style, red
axe.plot(x, x + 2, linestyle='--', color='y', marker='s', label="l2") # dashed line style, yellow, square
axe.plot(x, x + 3, linestyle='-.', color='m', marker='|', label="l3") # dash-dot line style, magenta, |
axe.plot(x, x + 4, linestyle=':', color='g', marker='v', label="l4") # dotted line style, green
axe.plot(x, x + 5, linestyle='-', color='b', marker='*', label="l5") # solid, blue,star
axe.plot(x, x + 6, linestyle='-', color='c', marker='o', label="l6") # solid, cyan, circle
axe.legend()
plt.show()

x = np.linspace(-4, 4, 100)
fig, axe = plt.subplots(figsize= (8, 5))
axe.plot(x, np.sin(x), '--b' ,x + 1, np.sin(x), '.r') # dashed line blue, point marker red
plt.show()

散点图

1
2
3
4
5
6
7
8
9
10
11
#散点图
# scatter allows us to control the properties of each individual data point, including size, color, edge color, and more.
fig, axe = plt.subplots(figsize= (8, 5))
rng = np.random.RandomState(66)  # fixed seed for a reproducible figure
x = rng.randn(50)
y = rng.randn(50)
colors = rng.randn(50)
# Marker sizes are areas and must not be negative, so use the magnitude of the normal draws.
sizes = np.abs(rng.randn(50)) * 500
axe.grid()
axe.scatter(x, y, c=colors, s=sizes, alpha=0.5)  # alpha sets the transparency
plt.show()

柱状图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# 柱状对比
fig, axe = plt.subplots()

label = ["Jan", "Feb", "Mar", "Apr", "May", "Jun"]
index = np.arange(len(label))
values1 = [100, 150, 300, 220, 660, 320]
values2 = [200, 160, 200, 300, 800, 400]
bar_width = 0.4
axe.bar(index, values1, width=bar_width)
axe.bar(index + bar_width, values2, width=bar_width)  # second series sits right next to the first
# Bars are centred on index and index + bar_width, so each pair is centred on index + bar_width / 2.
axe.set_xticks(index + bar_width / 2)
axe.set_xticklabels(label)

plt.show()

#柱状总和
fig, axe = plt.subplots()
label = ["Jan", "Feb", "Mar", "Apr", "May", "Jun"]
index = np.arange(len(label))
values1 = [100, 150, 300, 220, 660, 320]
values2 = [200, 160, 200, 300, 800, 400]
axe.bar(index, values1)
axe.bar(index, values2, bottom=values1)

plt.show()

#横向柱状总和barh
fig, axe = plt.subplots()
label = ["Jan", "Feb", "Mar", "Apr", "May", "Jun"]
index = np.arange(len(label))
values1 = [100, 150, 300, 220, 660, 320]
values2 = [200, 160, 200, 300, 800, 400]
axe.barh(index, values1)
axe.barh(index, values2, left=values1)
axe.set_yticks(index)
axe.set_yticklabels(label)
plt.show()

直方图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# 显示多个直方图
fig, axe = plt.subplots(figsize = (8, 5))

np.random.seed(66)
data1 = np.random.normal(-1, 1, 5000)
data2 = np.random.normal(-2, 1, 5000)
data3 = np.random.normal(-1, 2, 5000)
axe.hist(data1, bins=50, density=True, alpha=0.35, label="data1")
axe.hist(data2, bins=50, density=True, alpha=0.35, label="data2")
axe.hist(data3, bins=50, density=True, alpha=0.35, label="data3")

axe.legend()
plt.show()

#做出外边界曲线
fig, axe = plt.subplots(figsize = (8, 5))

sigma = 1
mu = 0
np.random.seed(66)
data = np.random.normal(mu, sigma, 5000)
n, bins, _ = axe.hist(data, bins=50, alpha=0.35, density=True)
# n: the values of the histogram bins, bins: the edges of the bins

y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *
np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
axe.plot(bins, y, '--r')

plt.show()

面积图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#面积图,并填充不同颜色
fig, axe = plt.subplots(figsize = (8, 5))

x = np.arange(0, 5, 0.01)
y = np.sin(x * np.pi)
axe.fill_between(x, y, where=(y > 0), facecolor = 'b', alpha = 0.7)
axe.fill_between(x, y, where=(y < 0), facecolor = 'g', alpha = 0.7)

plt.show()

#堆叠面积图stackplot
fig, axe = plt.subplots(nrows=2, figsize = (8, 5))

x = [1, 2, 3, 4, 5, 6]
y = [1, 3, 5, 7, 9, 11]
np.random.seed(66)
y1 = y + np.random.randint(1, 5, 6) # low = 1, high = 5, size = 6
y2 = y + np.random.randint(1, 8, 6)
y3 = y + np.random.randint(1, 5, 6)
y4 = y + np.random.randint(1, 30, 6)
y5 = y + np.random.randint(1, 5, 6)
y6 = y + np.random.randint(1, 20, 6)
y7 = y + np.random.randint(1, 10, 6)

labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun"]

axe[0].stackplot(x, y1, y2, y3, y4, y5, y6, y7, baseline="sym") # 基本线将会水平对称与0 (the baseline stack is going to be symmetric around the horizontal 0 line)
axe[0].set_xticks(x)
axe[0].set_xticklabels(labels)
axe[0].set_title("Symmetric")

axe[1].stackplot(x, y1, y2, y3, y4, y5, y6, y7, baseline="wiggle") # 重设基本线,以至于最小化平方斜率的总和 (minimizes the sum of the squared slopes.)
axe[1].set_xticks(x)
axe[1].set_xticklabels(labels)
axe[1].set_title("Wiggle")

plt.tight_layout()
plt.show()

箱型图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# One axes per variant: drawing a vertical and a horizontal box plot on the same
# axes would overlay them and mix up the tick labels.
fig, (axe_default, axe_horizontal, axe_styled) = plt.subplots(ncols=3, figsize = (15, 5))

np.random.seed(66)
labels = ["Label1", "Label2", "Label3", "Label4"]
values = []
values.append(np.random.normal(100, 20, 200))  # mean = 100, std = 20 for 200 values
values.append(np.random.normal(100, 100, 200))
values.append(np.random.normal(150, 50, 200))
values.append(np.random.normal(150, 70, 200))

axe_default.boxplot(values, labels=labels)  # vertical boxes
axe_horizontal.boxplot(values, labels=labels, vert=False)  # horizontal boxes
axe_styled.boxplot(values, labels=labels, patch_artist=True,
                   boxprops=dict(facecolor='teal', color='r'))  # filled boxes with a red outline
plt.tight_layout()
plt.show()

饼图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
fig, axe = plt.subplots(figsize = (8, 5))

labels = ["P1", "P2", "P3", "P4", "P5", "P6"]
labels2 = ["S1", "S2", "S3"]
values = [200, 300, 88, 66, 110, 168]
values2 = [500, 100, 200]
explode = [0,0,0,0.3,0,0]
explode2 = [0, 0.1, 0]

# outer circle
# pctdistance is used to control the distance between the center of the circle and percentage value.
# labeldistance is used to control the distance between the center of the circle and the label.
axe.pie(values, radius=1.5, wedgeprops=dict(width=0.5), autopct='%.2f%%',
pctdistance=0.8, labels=labels, labeldistance=1.05, explode=explode)
# inner circle
axe.pie(values2, radius=1, wedgeprops=dict(width=0.5), autopct='%.2f%%',
pctdistance=0.8, labels=labels2, labeldistance=0.3, explode=explode2)
plt.show()

热力图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
fig, axe = plt.subplots(figsize = (8, 5))

# Sample data (the labels and the matrix were not defined in the snippet itself):
# values has one row per y label and one column per x label.
xlabels = ["x1", "x2", "x3", "x4"]
ylabels = ["y1", "y2", "y3"]
values = np.random.RandomState(66).randint(0, 100, size=(len(ylabels), len(xlabels)))

axe.set_xticks(np.arange(len(xlabels)))
axe.set_yticks(np.arange(len(ylabels)))
axe.set_xticklabels(xlabels)
axe.set_yticklabels(ylabels)
im = axe.imshow(values)

# imshow draws values[row, col] at x = col, y = row, so the text uses the same mapping.
for row, col in np.ndindex(values.shape):
    axe.text(col, row, values[row, col],
             horizontalalignment="center", verticalalignment="center", color="w")  # cell value
axe.figure.colorbar(im, ax=axe)  # colour scale, added once after the cells are annotated

plt.show()

3D图

1
2
3
4
5
6
7
8
9
10
11
fig = plt.figure(figsize = (8, 5))
axes = plt.axes(projection = "3d")

X = np.linspace(-5, 5, 200)
Y = np.linspace(-5, 5, 200)
X, Y = np.meshgrid(X, Y)
Z = np.cos(np.sqrt(X**2 + Y**2))

surf = axes.plot_surface(X, Y, Z, cmap=plt.get_cmap("plasma"))
plt.colorbar(surf)
plt.show()