import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import matplotlib
import matplotlib.pyplot as plt
from pylab import *
from matplotlib.font_manager import *
from io import StringIO  # used only by this setup step to hand HTML text to pandas

# Font handed to matplotlib wherever a title or legend contains Chinese text.
forcn = FontProperties(fname='/jupyterfile/simsun.ttf', size=14)
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus.
matplotlib.rcParams['axes.unicode_minus'] = False

url = 'http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/html/A0107a.htm'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]

# Parse inside a `with` so the HTTP response is closed once it has been read,
# and give the request a timeout so a stalled server cannot hang the script.
with opener.open(url, timeout=30) as html:
    soup = BeautifulSoup(html, 'lxml')
pretdata = soup.prettify()

# read_html returns a list of DataFrames, one per <table>. The markup is
# wrapped in StringIO because passing literal HTML text is deprecated in
# recent pandas releases.
data = pd.read_html(StringIO(pretdata))
# The first table is taken straight from the list; wrapping the list in
# another DataFrame only to read cell [0, 0] back out is not needed.
d = data[0]
def popu(a):
    """Draw a back-to-back male/female bar chart for row `a` of table `d`.

    The label in column 0 of the row (spaces removed) goes into the title.
    Male counts are read from columns 5, 8, ..., 68 and female counts from
    columns 6, 9, ..., 69. Male bars are drawn with negated values so they
    extend to the left, and each age band is annotated with the male/female
    ratio rounded to two decimals.

    Args:
        a: positional row index into the module-level DataFrame `d`.
    """
    region = d.iloc[a, 0].replace(' ', '')
    male_row = d.iloc[a:a+1, list(range(5, 69, 3))]
    female_row = d.iloc[a:a+1, list(range(6, 70, 3))]
    age_bands = ['1-','1-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+']

    male_counts = [float(value) for value in male_row.iloc[0]]
    female_counts = [float(value) for value in female_row.iloc[0]]
    ratios = [round(m / f, 2) for m, f in zip(male_counts, female_counts)]

    # Negate the male series so its bars grow leftwards from zero.
    left_bars = [-m for m in male_counts]
    right_bars = female_counts

    plt.figure(figsize=(14,6))
    male_bars = plt.barh(age_bands, left_bars, color='b', alpha=0.5, label='男', edgecolor='white')
    female_bars = plt.barh(age_bands, right_bars, color='g', alpha=0.5, label='女', edgecolor='white')
    plt.grid(True)
    plt.legend(handles=[male_bars, female_bars], loc=2, frameon=False, prop=forcn)
    plt.xlabel('2010 YEAR')
    plt.title('2010 POPULATION'+' '+ region +'\n来源:www.jasper.wang', fontproperties=forcn)

    # Put each ratio just left of the tip of the corresponding male bar.
    for ratio, tip in zip(ratios, zip(left_bars, age_bands)):
        plt.annotate(ratio, xy=tip, xytext=(-30, -5), textcoords='offset points')
    plt.show()
# Draw the 2010 chart for four rows of the scraped table, in this order.
for table_row in (5, 14, 19, 36):
    popu(table_row)
# Year labels '1997' .. '2016' in chronological order.
x = [str(year) for year in range(1997, 2017)]
# The two series are listed newest-first and flipped so they line up with x.
y1 = [12.95,12.07,12.37,12.08,12.10,11.93,11.90,11.95,12.14,12.10,12.09,12.40,12.29,12.41,12.86,13.38,14.03,14.64,15.64,16.57][::-1]
y2 = [7.09,7.11,7.16,7.16,7.15,7.14,7.11,7.08,7.06,6.93,6.81,6.51,6.42,6.40,6.41,6.43,6.45,6.46,6.50,6.51][::-1]

fig = plt.figure(figsize=(14,6))
# plt.plot returns a one-element list per call; unpack it to get the line.
l1, = plt.plot(x, y1, 'r-', label="Birth rate")
l2, = plt.plot(x, y2, 'g--', label="mortality rate")
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
def exSm(a,origin,originx,t):
    """Plot and print a double-exponential-smoothing forecast of `origin`.

    A first smoothing pass over `origin` and a second pass over the result
    are combined into a linear forecast for four further periods, labelled
    '2017' to '2020'. The observed series (padded with its last value) and
    the second smoothed series (extended with the forecasts) are plotted,
    then printed as a table. Nothing is returned.

    Args:
        a: smoothing factor; `a / (1 - a)` is evaluated, so it must not be 1.
        origin: list of observed values, oldest first (needs at least 3).
        originx: list of x labels matching `origin`.
        t: text appended to the plot title.
    """
    n_points = len(originx)

    # First pass: single smoothing, each value rounded to two decimals.
    es = []
    for k in range(n_points):
        if k == 0:
            # No earlier smoothed value exists, so seed from the first three
            # observations.
            smoothed = a * origin[k] + (1-a)*(origin[0]+origin[1]+origin[2])/3
        else:
            smoothed = a * origin[k] + (1-a)*es[k-1]
        es.append(round(smoothed, 2))

    # Second pass: smooth the first-pass series, starting from its first value.
    es2 = [es[0]]
    for k in range(1, n_points):
        es2.append(round(a * es[k] + (1-a) * es2[k-1], 2))

    # Intercept and slope of the forecast line at the last observation.
    at = 2*es[-1]-es2[-1]
    bt = a/(1-a) * (es[-1]-es2[-1])
    forecasts = [round((at+bt*step), 2) for step in (1, 2, 3, 4)]

    # Extend all three series by the four forecast periods.
    x = originx + ['2017','2018','2019','2020']
    y1 = origin + [origin[-1]] * 4
    es2.extend(forecasts)
    y2 = es2

    # Vertical marker at the last observed label, spanning the plotted range.
    xmark = [originx[-1]] * 2
    ymark = [min(es2)*0.95, max(es2)*1.05]

    plt.figure(figsize=(14,6))
    observed_line, = plt.plot(x, y1, 'r--', label="origin", marker ='o')
    smoothed_line, = plt.plot(x, y2, 'b-', label="$ES^{(2)}$", marker ='o')
    plt.plot(xmark, ymark, 'g-')
    plt.legend(handles=[observed_line, smoothed_line], loc = 0, frameon=False, fontsize=16)
    plt.grid(False)
    plt.title('2020 PREDICTION'+' '+ t +'\n来源:www.jasper.wang', fontproperties=forcn)
    plt.show()

    # Print labels, padded observations and smoothed/forecast values together.
    mm = pd.concat(
        [
            pd.DataFrame(x, columns=['x']),
            pd.DataFrame(y1, columns=['y1']),
            pd.DataFrame(y2, columns=['y2']),
        ],
        axis=1,
    )
    print(mm)
# Forecast the first rate series; the title suffix is passed through as-is.
a = 0.5
originx = [str(year) for year in range(1997, 2017)]
# Values are listed newest-first and flipped to chronological order.
origin = [12.95,12.07,12.37,12.08,12.10,11.93,11.90,11.95,12.14,12.10,12.09,12.40,12.29,12.41,12.86,13.38,14.03,14.64,15.64,16.57][::-1]
exSm(a,origin,originx,'出生率')

# Same procedure for the second rate series.
a = 0.5
originx = [str(year) for year in range(1997, 2017)]
origin = [7.09,7.11,7.16,7.16,7.15,7.14,7.11,7.08,7.06,6.93,6.81,6.51,6.42,6.40,6.41,6.43,6.45,6.46,6.50,6.51][::-1]
exSm(a,origin,originx,'死亡率')
人口出生率 =(年内出生人数/年内平均总人口数)×1000‰
人口死亡率 =(年内死亡人数/年内平均总人口数)×1000‰
| 年份 | 人口普查数据 | 实际出生率 | 预测出生率 | 实际死亡率 | 预测死亡率 |
|---|---|---|---|---|---|
| 2010 | 1332810869 | 11.9 | 12.07 | 7.11 | 6.97 |
| 2011 | - | 11.93 | 12.02 | 7.14 | 7.03 |
| 2012 | - | 12.1 | 12.02 | 7.15 | 7.08 |
| 2013 | - | 12.08 | 12.04 | 7.16 | 7.11 |
| 2014 | - | 12.37 | 12.12 | 7.16 | 7.13 |
| 2015 | - | 12.07 | 12.13 | 7.11 | 7.13 |
| 2016 | - | 12.95 | 12.34 | 7.09 | 7.12 |
| 2017 | - | - | 12.94 | - | 7.09 |
| 2018 | - | - | 13.14 | - | 7.08 |
| 2019 | - | - | 13.34 | - | 7.07 |
| 2020 | - | - | 13.54 | - | 7.06 |
年内平均总人口数以上年数为准,即2011年平均总人口数以1332810869为基数计算。
2016年以前(含)按照实际出生率和死亡率计算。
2016年以后按照预测出生率和死亡率计算。
# Alternative for plain numpy arrays: np.set_printoptions(suppress=True)
# turns off scientific notation when printing.
x = ['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu = [1332810869]
birthrate_a = [11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d = [12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a = [7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d = [6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]

# Grow the population year by year from the 2010 figure: the *_a rates are
# used up to and including 2016, the *_d rates for the four years after.
for i in range(1, len(x)):
    years_after_2016 = int(x[i]) - 2016
    if years_after_2016 <= 0:
        birth_rate, death_rate = birthrate_a[i], mortality_a[i]
    elif years_after_2016 < 5:
        birth_rate, death_rate = birthrate_d[i], mortality_d[i]
    else:
        continue
    # Rates are per thousand, so the net growth fraction is the difference / 1000.
    n = (birth_rate - death_rate)/1000
    popu_d = popu[-1] * (1+n)
    popu.append(round(popu_d, 0))

p = pd.DataFrame(popu)
p.astype('int64')  # integer view, so the frame is not shown in scientific notation
x = ['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu = [1332810869]
birthrate_a = [11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d = [12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a = [7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d = [6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]

# Births per year for 2011..2020, computed alongside the population
# projection: the *_a rates apply through 2016, the *_d rates afterwards.
inc = []
for i in range(1, len(x)):
    years_after_2016 = int(x[i]) - 2016
    if years_after_2016 <= 0:
        birth_rate, death_rate = birthrate_a[i], mortality_a[i]
    elif years_after_2016 < 5:
        birth_rate, death_rate = birthrate_d[i], mortality_d[i]
    else:
        continue
    n = (birth_rate - death_rate)/1000
    popu.append(round(popu[-1] * (1+n), 0))
    # Append the value computed for THIS year. The forecast branch used to
    # store its result under a different name and re-append the 2016 figure,
    # so every entry for 2017-2020 was a copy of 2016.
    inc.append(round(popu[-1] * birth_rate/1000, 0))
print(inc)

# Births in the first five projected years (2011-2015) ...
pre5 = sum(inc[0:5])
print(pre5)
# ... and in the last five (2016-2020).
nex5 = sum(inc[5:10])
print(nex5)

# Each five-year total is halved, later period first.
# NOTE(review): presumably an even male/female split - confirm.
new = [nex5/2, pre5/2]
print(new)
x = ['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
popu = [1332810869]
birthrate_a = [11.9,11.93,12.1,12.08,12.37,12.07,12.95,12.95,12.95,12.95,12.95]
birthrate_d = [12.07,12.02,12.02,12.04,12.12,12.13,12.34,12.94,13.14,13.34,13.54]
mortality_a = [7.11,7.14,7.15,7.16,7.16,7.11,7.09,7.09,7.09,7.09,7.09]
mortality_d = [6.97,7.03,7.08,7.11,7.13,7.13,7.12,7.09,7.08,7.07,7.06]

# Deaths per year for 2011..2020, computed alongside the population
# projection: the *_a rates apply through 2016, the *_d rates afterwards.
dec = []
for i in range(1, len(x)):
    years_after_2016 = int(x[i]) - 2016
    if years_after_2016 <= 0:
        birth_rate, death_rate = birthrate_a[i], mortality_a[i]
    elif years_after_2016 < 5:
        birth_rate, death_rate = birthrate_d[i], mortality_d[i]
    else:
        continue
    n = (birth_rate - death_rate)/1000
    popu_d = popu[-1] * (1+n)
    popu.append(round(popu_d, 0))
    # The death count is taken from the population just appended for this year.
    dec_i = round(popu[-1] * death_rate/1000, 0)
    dec.append(dec_i)
print(dec)

# Total deaths over the ten projected years.
sum5 = sum(dec[0:10])
print(sum5)
# Male / female counts of table row 5, one column per age band (22 bands).
dmale = d.iloc[5:6, list(range(5, 69, 3))]
dfemale = d.iloc[5:6, list(range(6, 70, 3))]


def _merge_first_two(row):
    """Return `row` minus its first column, with the first two cells summed.

    The cell is written by position. DataFrame.replace(old, new) would
    overwrite every cell that happens to hold the same count, not only the
    intended one.
    """
    values = row.astype(float)
    merged = values.iloc[[0], 1:].copy()
    merged.iloc[0, 0] = values.iloc[0, 0] + values.iloc[0, 1]
    return merged


def _merge_last_three(row):
    """Return `row` minus its last two columns, with the last three cells summed."""
    values = row.astype(float)
    merged = values.iloc[[0], :-2].copy()
    merged.iloc[0, -1] = values.iloc[0, -1] + values.iloc[0, -2] + values.iloc[0, -3]
    return merged


# Fold the first two age bands into one (22 -> 21 columns).
new_male_1 = _merge_first_two(dmale)
new_female_1 = _merge_first_two(dfemale)
new_male_1
new_female_1

# Fold the last three age bands into one (21 -> 19 columns).
new_male_2 = _merge_last_three(new_male_1)
new_female_2 = _merge_last_three(new_female_1)
new_male_2
new_female_2

# Prepend the two values of `new` as the first two columns of each row, so
# every existing column moves two positions to the right (19 + 2 = 21).
df = pd.DataFrame(new)
df1 = df.T.reset_index(drop=True)

new_male_3 = new_male_2.reset_index(drop=True)
result_m = pd.concat([df1, new_male_3], axis=1)
result_m

new_female_3 = new_female_2.reset_index(drop=True)
result_f = pd.concat([df1, new_female_3], axis=1)
result_f
如前,10年间共减少人口数97542688人。
分布在各年龄段的权重,根据经验从0-4 ~ 100+设置为:
result_m
result_f

# Share of the deaths assigned to each of the 21 age bands.
weight = [0.015,0.04,0.04,0.04,0.04,0.03,0.03,0.04,0.04,0.04,0.04,0.04,0.04,0.05,0.05,0.05,0.11,0.11,0.11,0.035,0.01]
# Half of the hard-coded ten-year death total is applied to each sex.
po = 97542688/2
weight_po = [po * share for share in weight]
weight_po_de = pd.DataFrame(weight_po)

# Relabel both population rows 0..20 so they align with the deaths row, whose
# columns are already 0..20 after transposing.
result_m.columns = list(range(0, 21))
result_f.columns = list(range(0, 21))
deaths_row = weight_po_de.T

# Stack population (row 0) over deaths (row 1) for each sex.
m_p = pd.concat([result_m, deaths_row], axis=0).reset_index(drop=True)
f_p = pd.concat([result_f, deaths_row], axis=0).reset_index(drop=True)

# Population minus deaths for every age band, rounded to whole persons.
band_count = len(m_p.T)
m_p_1 = [round(float(m_p[band][0]) - float(m_p[band][1]), 0) for band in range(band_count)]
f_p_1 = [round(float(f_p[band][0]) - float(f_p[band][1]), 0) for band in range(band_count)]

m_p_2 = pd.DataFrame(m_p_1).reset_index(drop=True)
f_p_2 = pd.DataFrame(f_p_1).reset_index(drop=True)
print(m_p_2.T)
print(f_p_2.T)
idx = ['0-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+']

male_2020 = [float(value) for value in m_p_2.T.iloc[0]]
female_2020 = [float(value) for value in f_p_2.T.iloc[0]]

# Male/female ratio for each age band, rounded to two decimals.
r = [round(m / f, 2) for m, f in zip(male_2020, female_2020)]

# Male values are negated so their bars extend to the left of zero.
data_bar_m = [-m for m in male_2020]
data_bar_f = female_2020

fig = plt.figure(figsize=(14,6))
l1 = plt.barh(idx, data_bar_m, color='b', alpha=0.5, label='男', edgecolor='white')
l2 = plt.barh(idx, data_bar_f, color='g', alpha=0.5, label='女', edgecolor='white')
plt.grid(True)
plt.legend(handles=[l1, l2], loc=2, frameon=False, prop=forcn)
plt.xlabel('2020 YEAR')
plt.title('2020 POPULATION'+' 全国 '+'\n来源:www.jasper.wang', fontproperties=forcn)

# Put each ratio just left of the tip of the corresponding male bar.
for ratio, tip in zip(r, zip(data_bar_m, idx)):
    plt.annotate(ratio, xy=tip, xytext=(-30, -5), textcoords='offset points')
plt.show()