population

国家统计局的第六次人口普查(2010年分年龄、性别人口数据)

  • 2010年第六次全国人口普查的对象是:普查标准时点在中华人民共和国境内的自然人以及在中华人民共和国境外但未定居的中国公民,不包括在中华人民共和国境内短期停留的港澳台居民和外籍人员
  • 本资料是按照实际登记直接汇总的数据,不包括人口普查漏登的人口;不包括230万现役军人;不包括465万难以确定常住地的人口

导入依赖包并爬取国家统计局数据

In [1040]:
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import matplotlib
import matplotlib.pyplot as plt
from pylab import * 
from matplotlib.font_manager import *
forcn = FontProperties(fname='/jupyterfile/simsun.ttf', size=14)
matplotlib.rcParams['axes.unicode_minus'] = False
url='http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/html/A0107a.htm'
headers=('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener=urllib.request.build_opener()
opener.addheaders=[headers]
html=opener.open(url)
soup=BeautifulSoup(html, 'lxml')
pretdata=soup.prettify()
data=pd.read_html(pretdata)
df=pd.DataFrame(data)
d = df.iloc[0,0]

设定人口统计图表函数

In [1041]:
def popu(a):
    area=d.iloc[a, 0].replace(' ','')
    dmale=d.iloc[a:a+1, [5,8,11,14,17,20,23,26,29,32,35,38,41,44,47,50,53,56,59,62,65,68]]
    dfemale=d.iloc[a:a+1, [6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,66,69]]
    idx = ['1-','1-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+']
    r=[]
    for i in np.arange(len(idx)):
        j = round(float(dmale.iloc[0, i])/float(dfemale.iloc[0, i]),2)
        r.append(j)
    data_bar_m = [-1*float(item) for item in list(dmale.iloc[0])]
    data_bar_f = [float(item) for item in list(dfemale.iloc[0])]
    fig = plt.figure(figsize=(14,6))
    l1=plt.barh(idx,data_bar_m , color='b',alpha=0.5,label='男', edgecolor='white')
    l2=plt.barh(idx,data_bar_f , color='g',alpha=0.5,label='女', edgecolor='white')
    plt.grid(True)
    plt.legend(handles=[l1, l2 ], loc = 2, frameon=False, prop=forcn)
    plt.xlabel('2010 YEAR')
    plt.title('2010 POPULATION'+' '+ area +'\n来源:www.jasper.wang', fontproperties=forcn)
    for i, xy in enumerate(zip(data_bar_m, idx)):
        plt.annotate(r[i], xy=xy, xytext=(-30, -5), textcoords='offset points')    
    plt.show()

全国人口数据

In [1042]:
popu(5)

上海人口数据

In [1043]:
popu(14)

江西人口数据

In [1044]:
popu(19)

新疆人口数据

In [1045]:
popu(36)

国家统计局1997年至2016年人口出生率与死亡率

准备实际数据

In [1046]:
x = ['2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003','2002','2001','2000','1999','1998','1997']
y1=[12.95,12.07,12.37,12.08,12.10,11.93,11.90,11.95,12.14,12.10,12.09,12.40,12.29,12.41,12.86,13.38,14.03,14.64,15.64,16.57]
y2=[7.09,7.11,7.16,7.16,7.15,7.14,7.11,7.08,7.06,6.93,6.81,6.51,6.42,6.40,6.41,6.43,6.45,6.46,6.50,6.51]
x.reverse()
y1.reverse()
y2.reverse()
fig = plt.figure(figsize=(14,6))
l1, = plt.plot(x, y1, 'r-', label="Birth rate")
l2, = plt.plot(x, y2, 'g--', label="mortality rate")
plt.show()

编制人口预测二次指数平滑函数

In [1047]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import statsmodels.api as sm 
In [1048]:
def exSm(a,origin,originx,t):
    es=[]
    for i in range(len(originx)):
        pre=0.
        if not i == 0:
            pre = a * origin[i]+(1-a)*es[i-1]
        else:
            pre = a * origin[i] + (1-a)*(origin[0]+origin[1]+origin[2])/3
        es.append(round(pre,2))
    pred=a*origin[-1]+(1-a)*es[-1]
    es2=[]
    es2.append(es[0])
    for j in range(1, len(originx)):
        pre2=a * es[j] + (1-a) * es2[j-1]
        es2.append(round(pre2,2))        
    at = 2*es[-1]-es2[-1]
    bt = a/(1-a) * (es[-1]-es2[-1])
    Y_n1 = round((at+bt*1),2)
    Y_n2 = round((at+bt*2),2)
    Y_n3 = round((at+bt*3),2)
    Y_n4 = round((at+bt*4),2)
    newyear = ['2017','2018','2019','2020']
    originx_pre = originx + newyear
    i=[]
    i.append(origin[-1])
    i.append(origin[-1])
    i.append(origin[-1])
    i.append(origin[-1])
    origin_pre = origin + i
    es2.append(Y_n1)
    es2.append(Y_n2)
    es2.append(Y_n3)
    es2.append(Y_n4)
    x = originx_pre
    y1= origin_pre
    y2=es2
    xmark=[originx[-1],originx[-1]]
    ymark=[min(es2)*0.95,max(es2)*1.05]    
    fig = plt.figure(figsize=(14,6))
    l1, = plt.plot(x, y1, 'r--', label="origin", marker ='o')
    l2, = plt.plot(x, y2, 'b-', label="$ES^{(2)}$", marker ='o')
    l4, = plt.plot(xmark, ymark, 'g-')
    plt.legend(handles=[l1, l2], loc = 0, frameon=False, fontsize=16)
    plt.grid(False)
#     plt.text(originx[-1],(min(es2)+max(es2))/2, '  www.jasper.wang')
    plt.title('2020 PREDICTION'+' '+ t +'\n来源:www.jasper.wang', fontproperties=forcn)
    plt.show()   
    mm=pd.DataFrame(x,columns=['x'])
    mm=pd.concat([mm,pd.DataFrame(y1,columns=['y1'])],axis=1)
    mm=pd.concat([mm,pd.DataFrame(y2,columns=['y2'])],axis=1)
    print(mm)

计算人口出生率预测值并绘制图表

In [1049]:
a=0.5
originx= ['2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003','2002','2001','2000','1999','1998','1997']
origin=[12.95,12.07,12.37,12.08,12.10,11.93,11.90,11.95,12.14,12.10,12.09,12.40,12.29,12.41,12.86,13.38,14.03,14.64,15.64,16.57]
originx.reverse()
origin.reverse()
exSm(a,origin,originx,'出生率')
       x     y1     y2
0   1997  16.57  16.09
1   1998  15.64  15.98
2   1999  14.64  15.62
3   2000  14.03  15.13
4   2001  13.38  14.57
5   2002  12.86  14.00
6   2003  12.41  13.46
7   2004  12.29  13.04
8   2005  12.40  12.77
9   2006  12.09  12.53
10  2007  12.10  12.36
11  2008  12.14  12.27
12  2009  11.95  12.16
13  2010  11.90  12.07
14  2011  11.93  12.02
15  2012  12.10  12.02
16  2013  12.08  12.04
17  2014  12.37  12.12
18  2015  12.07  12.13
19  2016  12.95  12.34
20  2017  12.95  12.94
21  2018  12.95  13.14
22  2019  12.95  13.34
23  2020  12.95  13.54

计算人口死亡率预测值并绘制图表

In [1050]:
a=0.5
originx= ['2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003','2002','2001','2000','1999','1998','1997']
origin=[7.09,7.11,7.16,7.16,7.15,7.14,7.11,7.08,7.06,6.93,6.81,6.51,6.42,6.40,6.41,6.43,6.45,6.46,6.50,6.51]
originx.reverse()
origin.reverse()
exSm(a,origin,originx,'死亡率')
       x    y1    y2
0   1997  6.51  6.50
1   1998  6.50  6.50
2   1999  6.46  6.49
3   2000  6.45  6.47
4   2001  6.43  6.46
5   2002  6.41  6.45
6   2003  6.40  6.44
7   2004  6.42  6.43
8   2005  6.51  6.45
9   2006  6.81  6.54
10  2007  6.93  6.66
11  2008  7.06  6.79
12  2009  7.08  6.89
13  2010  7.11  6.97
14  2011  7.14  7.03
15  2012  7.15  7.08
16  2013  7.16  7.11
17  2014  7.16  7.13
18  2015  7.11  7.13
19  2016  7.09  7.12
20  2017  7.09  7.09
21  2018  7.09  7.08
22  2019  7.09  7.07
23  2020  7.09  7.06

计算各年增加及减少人口数

  人口出生率 =(年内出生人数/年内平均总人口数)×1000‰
  人口死亡率 =(年内死亡人数/年内平均总人口数)×1000‰

年份 人口普查数据 实际出生率 预测出生率 实际死亡率 预测死亡率
2010 1332810869 11.9 12.07 7.11 6.97
2011 - 11.93 12.02 7.14 7.03
2012 - 12.1 12.02 7.15 7.08
2013 - 12.08 12.04 7.16 7.11
2014 - 12.37 12.12 7.16 7.13
2015 - 12.07 12.13 7.11 7.13
2016 - 12.95 12.34 7.09 7.12
2017 - - 12.94 - 7.09
2018 - - 13.14 - 7.08
2019 - - 13.34 - 7.07
2020 - - 13.54 - 7.06

预测计算从2010年至2020年各年度总人口数

  年内平均总人口数以上年数为准,即2011年平均总人口数以1332810869为基数计算。
  2016年以前(含)按照实际出生率和死亡率计算。
  2016年以后按照预测出生率和死亡率计算。

In [1051]:
#import numpy as np
#np.set_printoptions(suppress=True) np不以科学计数法显示
x=['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu=[1332810869]
birthrate_a=[11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d=[12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a=[7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d=[6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]
for i in range(1,len(x)):
    if int(x[i])-2016 <= 0:
        n = (birthrate_a[i] - mortality_a[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
    elif int(x[i])-2016 < 5:
        n = (birthrate_d[i] - mortality_d[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
p=pd.DataFrame(popu)
p.astype('int64') #df不以科学计数法显示
Out[1051]:
0
0 1332810869
1 1339195033
2 1345824048
3 1352445502
4 1359491743
5 1366234822
6 1374240958
7 1382280268
8 1390656886
9 1399376305
10 1408444263

各年增加人口数

In [1052]:
x=['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu=[1332810869]
birthrate_a=[11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d=[12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a=[7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d=[6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]
inc=[]
for i in range(1,len(x)):
    if int(x[i])-2016 <= 0:
        n = (birthrate_a[i] - mortality_a[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
        inc_i = round(popu[-1] * birthrate_a[i]/1000,0)
        inc.append(inc_i)
    elif int(x[i])-2016 < 5:
        n = (birthrate_d[i] - mortality_d[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
        inc_ = round(popu[-1] * birthrate_d[i]/1000,0)
        inc.append(inc_i)
print(inc)
pre5=sum(inc[0:5])
print(pre5)
nex5=sum(inc[5:10])
print(nex5)
new = []
new.append(nex5/2)
new.append(pre5/2)
print(new)
[15976597.0, 16284471.0, 16337542.0, 16816913.0, 16490454.0, 17796420.0, 17796420.0, 17796420.0, 17796420.0, 17796420.0]
81905977.0
88982100.0
[44491050.0, 40952988.5]

各年减少人口数

In [1053]:
x=['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu=[1332810869]
birthrate_a=[11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d=[12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a=[7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d=[6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]
dec=[]
for i in range(1,len(x)):
    if int(x[i])-2016 <= 0:
        n = (birthrate_a[i] - mortality_a[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
        dec_i = round(popu[-1] * mortality_a[i]/1000,0)
        dec.append(dec_i)
    elif int(x[i])-2016 < 5:
        n = (birthrate_d[i] - mortality_d[i])/1000
        popu_d = popu[-1] * (1+n)
        popu.append(round(popu_d,0))
        dec_i = round(popu[-1] * mortality_d[i]/1000,0)
        dec.append(dec_i)
print(dec)
sum5=sum(dec[0:10])
print(sum5)
[9561853.0, 9622642.0, 9683510.0, 9733961.0, 9713930.0, 9743368.0, 9800367.0, 9845851.0, 9893590.0, 9943616.0]
97542688.0

计算2020年分年龄段人口预测数据

按照时间增长调整人口统计分年龄数据

In [1054]:
dmale=d.iloc[5:6, [5,8,11,14,17,20,23,26,29,32,35,38,41,44,47,50,53,56,59,62,65,68]]
dfemale=d.iloc[5:6, [6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,66,69]]

合并 1- 与 1-4 分组数据,并赋值给 0-4 分组

In [1055]:
new_male_1   = dmale.replace(dmale.iloc[0,1], float(dmale.iloc[0,0])+float(dmale.iloc[0,1])).iloc[[0],1:]
new_female_1 = dfemale.replace(dfemale.iloc[0,1], float(dfemale.iloc[0,0])+float(dfemale.iloc[0,1])).iloc[[0],1:]
In [1056]:
new_male_1
Out[1056]:
8 11 14 17 20 23 26 29 32 35 ... 41 44 47 50 53 56 59 62 65 68
5 41062566.0 38464665 40267277 51904830 64008573 50837038 49521822 60391104 63608678 53776418 ... 41082938 29834426 20748471 16403453 11278859 5917502 2199810 530872 117716 8852

1 rows × 21 columns

In [1057]:
new_female_1
Out[1057]:
9 12 15 18 21 24 27 30 33 36 ... 42 45 48 51 54 57 60 63 66 69
5 34470044.0 32416884 34641185 47984284 63403945 50176814 47616381 57634855 61145286 51818135 ... 40229536 28832856 20364811 16568944 12573274 7455696 3432118 1047435 252263 27082.0

1 rows × 21 columns

合并100+,95-99,90-94 分组并赋值给 100+ 年龄段

In [1058]:
new_male_2= new_male_1.replace(new_male_1.iloc[0,-3], float(new_male_1.iloc[0,-1])+float(new_male_1.iloc[0,-2])+float(new_male_1.iloc[0,-3])).iloc[[0],0:19]
new_female_2= new_female_1.replace(new_female_1.iloc[0,-3], float(new_female_1.iloc[0,-1])+float(new_female_1.iloc[0,-2])+float(new_female_1.iloc[0,-3])).iloc[[0],0:19]
In [1059]:
new_male_2
Out[1059]:
8 11 14 17 20 23 26 29 32 35 38 41 44 47 50 53 56 59 62
5 41062566.0 38464665 40267277 51904830 64008573 50837038 49521822 60391104 63608678 53776418 40363234 41082938 29834426 20748471 16403453 11278859 5917502 2199810 657440.0
In [1060]:
new_female_2
Out[1060]:
9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57 60 63
5 34470044.0 32416884 34641185 47984284 63403945 50176814 47616381 57634855 61145286 51818135 38389937 40229536 28832856 20364811 16568944 12573274 7455696 3432118 1326780.0

将2010~2015出生合计人口赋值给5-9分组

将2016~2020出生合计人口赋值给0-4分组

In [1061]:
df=pd.DataFrame(new)
df1=df.T.reset_index(drop=True)
new_male_3=new_male_2.reset_index(drop=True)
result_m = pd.concat([df1, new_male_3],axis=1)
result_m
Out[1061]:
0 1 8 11 14 17 20 23 26 29 ... 35 38 41 44 47 50 53 56 59 62
0 44491050.0 40952988.5 41062566.0 38464665 40267277 51904830 64008573 50837038 49521822 60391104 ... 53776418 40363234 41082938 29834426 20748471 16403453 11278859 5917502 2199810 657440.0

1 rows × 21 columns

In [1062]:
df=pd.DataFrame(new)
df1=df.T.reset_index(drop=True)
new_female_3=new_female_2.reset_index(drop=True)
result_f = pd.concat([df1, new_female_3],axis=1)
result_f
Out[1062]:
0 1 9 12 15 18 21 24 27 30 ... 36 39 42 45 48 51 54 57 60 63
0 44491050.0 40952988.5 34470044.0 32416884 34641185 47984284 63403945 50176814 47616381 57634855 ... 51818135 38389937 40229536 28832856 20364811 16568944 12573274 7455696 3432118 1326780.0

1 rows × 21 columns

按照认为设定比例相应减少各年龄段人口数

  如前,10年间共减少人口数97542688人。
  分布在个年龄段的权重,根据经验从0-4 ~ 100+设置为:

In [1063]:
result_m
result_f
weight=[0.015,0.04,0.04,0.04,0.04,0.03,0.03,0.04,0.04,0.04,0.04,0.04,0.04,0.05,0.05,0.05,0.11,0.11,0.11,0.035,0.01]
po=97542688/2
weight_po = [po*i for i in weight]
weight_po_de = pd.DataFrame(weight_po)
result_m.columns = [i for i in range(0,21)]
result_f.columns = [i for i in range(0,21)]
weight_po_de.T.columns = [i for i in range(0,21)]
m_p = pd.concat([result_m, weight_po_de.T],axis=0).reset_index(drop=True)
f_p = pd.concat([result_f, weight_po_de.T],axis=0).reset_index(drop=True)
m_p_1=[]
f_p_1=[]
for i in range(len(m_p.T)):
    m_p_1.append(round(float(m_p[i][0])-float(m_p[i][1]),0))
    f_p_1.append(round(float(f_p[i][0])-float(f_p[i][1]),0))
m_p_2=pd.DataFrame(m_p_1).reset_index(drop=True)
f_p_2=pd.DataFrame(f_p_1).reset_index(drop=True)
print(m_p_2.T)
print(f_p_2.T)
           0           1           2           3           4           5   \
0  43759480.0  39002135.0  39111712.0  36513811.0  38316423.0  50441690.0   

           6           7           8           9     ...             11  \
0  62545433.0  48886184.0  47570968.0  58440250.0    ...     51825564.0   

           12          13          14          15          16         17  \
0  38412380.0  38644371.0  27395859.0  18309904.0  11038605.0  5914011.0   

         18        19        20  
0  552654.0  492813.0  169727.0  

[1 rows x 21 columns]
           0           1           2           3           4           5   \
0  43759480.0  39002135.0  32519190.0  30466030.0  32690331.0  46521144.0   

           6           7           8           9     ...             11  \
0  61940805.0  48225960.0  45665527.0  55684001.0    ...     49867281.0   

           12          13          14          15          16         17  \
0  36439083.0  37790969.0  26394289.0  17926244.0  11204096.0  7208426.0   

          18         19        20  
0  2090848.0  1725121.0  839067.0  

[1 rows x 21 columns]

2020年人口统计预测

图表

In [1064]:
idx = ['0-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+']
r=[]
for i in np.arange(len(idx)):
    j = round(float(m_p_2.T.iloc[0,i])/float(f_p_2.T.iloc[0,i]),2)
    r.append(j)
data_bar_m = [-1*float(item) for item in list(m_p_2.T.iloc[0])]
data_bar_f = [float(item) for item in list(f_p_2.T.iloc[0])]
fig = plt.figure(figsize=(14,6))
l1=plt.barh(idx,data_bar_m , color='b',alpha=0.5,label='男', edgecolor='white')
l2=plt.barh(idx,data_bar_f , color='g',alpha=0.5,label='女', edgecolor='white')
plt.grid(True)
plt.legend(handles=[l1, l2 ], loc = 2, frameon=False, prop=forcn)
plt.xlabel('2020 YEAR')
plt.title('2020 POPULATION'+' 全国 '+'\n来源:www.jasper.wang', fontproperties=forcn)
for i, xy in enumerate(zip(data_bar_m, idx)):
    plt.annotate(r[i], xy=xy, xytext=(-30, -5), textcoords='offset points')    
plt.show()

几个假设

  • 因为缺乏出生率与死亡率的性别数据,故按照5:5进行划分
  • 死亡率在不同年龄段的权数设定带有主观性,主要是80岁以上人权较重相对较高
  • 由于2016年出生率相对于2015年有一个较大的升幅,因此预测值呈现逐年增长态势,故0-4岁年龄段比5-9岁年龄段人口有较明显的增长

几点杞人忧天

  • 至2020年,人口较多的年龄段分别是50岁到54岁,以及30岁到34岁
  • 20-24岁年龄段的人口中,男女比率为1.17 :1。理论上,将有17条汉子难以脱单......

◎ 欢迎参与讨论,请在这里发表您的看法、交流您的观点。