Sorsogon. Step 2.a Dynamic Sampling Model and GREGWT¶

In [1]:

import datetime

# Print the wall-clock time at which this cell was executed.
print(datetime.datetime.now())

2018-03-26 01:28:43.554147

Notebook abstract

This notebook shows the main sampling and reweighting algorithm.

Import libraries¶

In [2]:

from smum.microsim.run import run_calibrated_model
from smum.microsim.table import TableModel

/usr/lib/python3.6/site-packages/h5py-2.7.1-py3.6-linux-x86_64.egg/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Global variables¶

In [3]:

# Run configuration shared by the cells below.
iterations = 1000
benchmark_year = 2016
census_file = 'data/benchmarks_year_bias.csv'

# `typ` is appended to the model name and also passed as `project`
# to run_calibrated_model.
typ = 'resampled'
model_name = 'Sorsogon_Electricity_Water_wbias_projected_dynamic_' + typ

verbose = False
njobs = 4  # number of chains to run in parallel

Define Table model¶

In [4]:

# Table model shared by the Income, Electricity and Water sections below.
tm = TableModel(
    census_file=census_file,
    verbose=verbose,
)

Income model¶

In [5]:

# Register the Income table, then update its dynamic model for the
# Education, FamilySize and Age columns.
tm.add_model('data/table_inc.csv', 'Income')
tm.update_dynamic_model('Income', specific_col='Education')
# NOTE(review): presumably FamilySize is called 'Size' in the census
# benchmark file -- confirm against update_dynamic_model.
tm.update_dynamic_model(
    'Income',
    specific_col='FamilySize',
    specific_col_as='Size',
    val='mu',
    compute_average=0,
)
tm.update_dynamic_model(
    'Income',
    specific_col='Age',
    val='mu',
    compute_average=0,
)

In [6]:

# Show the Income model table stored under key 2020.
tm.models['Income'].loc[2020]

Out[6]:

	co_mu	co_sd	p	mu	sd	dis	ub	lb
i_Intercept	NaN	NaN	1147.66	NaN	NaN	Deterministic	NaN	NaN
i_Sex	919.012059036333	161.50344091572538	0.243795	NaN	NaN	Bernoulli	NaN	NaN
i_Urbanity	7105.2244566329355	127.94148635675795	0.6356	NaN	NaN	Bernoulli	NaN	NaN
i_FamilySize	1666.846395220964	29.03482607534048	NaN	3.70878	1.83794	Poisson	10	1
i_Age	116.57589770606201	4.681393204635	NaN	52.5153	12.2451	Normal	100	18
i_Education	1.0,6023.86254599,11959.091528,18727.4606703,1...	1e-10,140.904404522,217.208790314,282.17614554...	0.2430379746835443,0.21581625995041107,0.25540...	NaN	NaN	Categorical	NaN	NaN

In [7]:

# The intercept enters on its own; every other Income variable contributes
# a "c_<name> * <name>" term, all joined with "+".
formula_inc = "i_Intercept+" + "+".join(
    "c_{0} * {0}".format(name)
    for name in tm.models['Income'][benchmark_year].index
    if name != 'i_Intercept'
)
tm.add_formula(formula_inc, 'Income')

In [8]:

# Print the formula registered under 'Income', one term per line.
tm.print_formula('Income')

Income =
         i_Intercept +
         c_i_Sex * i_Sex +
         c_i_Urbanity * i_Urbanity +
         c_i_FamilySize * i_FamilySize +
         c_i_Age * i_Age +
         c_i_Education * i_Education +

Electricity model¶

In [9]:

# Register the Electricity table, then update its dynamic model for the
# Income column.
tm.add_model('data/table_elec.csv', 'Electricity', reference_cat=['yes'])
tm.update_dynamic_model(
    'Electricity',
    specific_col='Income',
    val='mu',
    compute_average=False,
)

In [10]:

# Show the Electricity model table stored under key 2016
# (the same value as benchmark_year).
tm.models['Electricity'].loc[2016]

Out[10]:

	co_mu	co_sd	p	mu	sd	dis	ub	lb
e_Intercept	NaN	NaN	3.29998	NaN	NaN	Deterministic	NaN	NaN
e_Lighting	0.825662	18.6676	0.946022	NaN	NaN	Bernoulli	NaN	NaN
e_TV	18.7899	1.75962	0.964932	NaN	NaN	Bernoulli	NaN	NaN
e_Cooking	28.8862	1.96894	0.0142662	NaN	NaN	Bernoulli	NaN	NaN
e_Refrigeration	59.2432	1.55605	0.602102	NaN	NaN	Bernoulli	NaN	NaN
e_AC	203.323	3.13016	0.256521	NaN	NaN	Bernoulli	NaN	NaN
e_Urban	24.5935	1.39104	1	NaN	NaN	Bernoulli	NaN	NaN
e_Income	0.00142607	4.10201e-05	NaN	190472	1904.72	None	inf	0

In [11]:

# Generic "c_<name> * <name>" terms for every Electricity variable except the
# intercept, e_Income and e_Urban. The last two are appended separately
# because their coefficients multiply i_Urbanity and Income rather than
# variables carrying the e_ prefix.
formula_elec = "e_Intercept+" + "+".join(
    "c_{0} * {0}".format(name)
    for name in tm.models['Electricity'][benchmark_year].index
    if name not in ('e_Intercept', 'e_Income', 'e_Urban')
)
formula_elec += '+c_e_Urban * i_Urbanity'
formula_elec += '+c_e_Income * Income'

In [12]:

# Attach the formula string held in formula_elec to the 'Electricity' model.
tm.add_formula(formula_elec, 'Electricity')

In [13]:

# Print the formula registered under 'Electricity', one term per line.
tm.print_formula('Electricity')

Electricity =
         e_Intercept +
         c_e_Lighting * e_Lighting +
         c_e_TV * e_TV +
         c_e_Cooking * e_Cooking +
         c_e_Refrigeration * e_Refrigeration +
         c_e_AC * e_AC +
         c_e_Urban * i_Urbanity +
         c_e_Income * Income +

Water model¶

In [14]:

# Register the Water table, then update its dynamic model for the
# Education, FamilySize and Age columns (same calls as for Income).
tm.add_model('data/table_water.csv', 'Water')
tm.update_dynamic_model('Water', specific_col='Education')
# NOTE(review): presumably FamilySize is called 'Size' in the census
# benchmark file -- confirm against update_dynamic_model.
tm.update_dynamic_model(
    'Water',
    specific_col='FamilySize',
    specific_col_as='Size',
    val='mu',
    compute_average=0,
)
tm.update_dynamic_model(
    'Water',
    specific_col='Age',
    val='mu',
    compute_average=0,
)

In [15]:

# Show the Water model table stored under key 2020.
tm.models['Water'].loc[2020]

Out[15]:

	co_mu	co_sd	p	dis	mu	sd	ub	lb
w_Intercept	NaN	NaN	-601.592	Deterministic	NaN	NaN	NaN	NaN
w_Sex	98.49504620801835	29.44380722589748	0.243795	None	NaN	NaN	NaN	NaN
w_Urbanity	1000.9789077676428	25.415910606032206	0.6356	None	NaN	NaN	NaN	NaN
w_Total_Family_Income	0.05318701200857999	0.0009823058551951082	NaN	None	NaN	NaN	NaN	NaN
w_FamilySize	49.73935151831777	5.897790558149098	NaN	None	3.70878	1.83794	NaN	NaN
w_Age	6.088941881654669	0.9127405886772298	NaN	None	52.5153	12.2451	NaN	NaN
w_Education	1.0,214.4011453125436,260.32727427717964,101.7...	1e-10,28.815802440470176,40.0574490885231,49.9...	0.2430379746835443,0.21581625995041107,0.25540...	None;i;Categorical	NaN	NaN	NaN	NaN

In [16]:

# Each generic Water coefficient c_w_<X> multiplies the variable i_<X>:
# the text after the first underscore is kept and prefixed with "i_".
# The intercept, total family income and education terms are excluded here;
# the latter two are appended explicitly below.
formula_water = "w_Intercept+" + "+".join(
    "c_{0} * {1}".format(name, "i_" + name.partition('_')[2])
    for name in tm.models['Water'][benchmark_year].index
    if name not in ('w_Intercept', 'w_Total_Family_Income', 'w_Education')
)
formula_water += '+c_w_Total_Family_Income*Income'
formula_water += '+c_w_Education*i_Education'

In [17]:

# Attach the formula string held in formula_water to the 'Water' model.
tm.add_formula(formula_water, 'Water')

In [18]:

# Print the formula registered under 'Water', one term per line.
tm.print_formula('Water')

Water =
         w_Intercept +
         c_w_Sex * i_Sex +
         c_w_Urbanity * i_Urbanity +
         c_w_FamilySize * i_FamilySize +
         c_w_Age * i_Age +
         c_w_Total_Family_Income*Income +
         c_w_Education*i_Education +

Make model and save it to Excel¶

In [19]:

# Keep the result of make_model(); it is the first argument passed to
# run_calibrated_model at the end of the notebook.
table_model = tm.make_model()

In [20]:

# Export the models; the recorded output lists one
# data/tableModel_<name>.xlsx file per model.
tm.to_excel()

creating data/tableModel_Income.xlsx
creating data/tableModel_Electricity.xlsx
creating data/tableModel_Water.xlsx

Define model variables¶

In [21]:

# Bin edges used to categorise i_Age.
cut = [0, 19, 26, 36, 46, 56, 66, 76, 86, 101]
# One label per pair of consecutive edges, named
# age_<lower edge>_<next edge minus one>, e.g. 'age_0_18' ... 'age_86_100'.
labels = ['age_{}_{}'.format(lo, hi - 1) for lo, hi in zip(cut, cut[1:])]
to_cat = {'i_Age': [cut, labels]}
# Columns handed to run_calibrated_model as `drop_col_survey`.
drop_col_survey = [
    'e_Income',
    'e_Urban',
    'w_Total_Family_Income',
    'w_Education',
]

In [ ]:

# Run the calibrated model for the benchmark year with the settings defined
# in the configuration cell.
# Disabled options kept for reference:
#   rep = {'FamilySize': ['Size']}
#   rep = {'urb': ['urban', 'urbanity']}
fw = run_calibrated_model(
    table_model,
    project=typ,
    njobs=njobs,
    census_file=census_file,
    year=benchmark_year,
    population_size=False,
    name='{}_{}'.format(model_name, iterations),
    to_cat=to_cat,
    iterations=iterations,
    verbose=verbose,
    drop_col_survey=drop_col_survey,
)