Sorsogon. Step 2.a Dynamic Sampling Model and GREGWT

In [1]:
# Record when the notebook was executed.
import datetime
print(datetime.datetime.now())
2018-03-26 01:28:43.554147

Notebook abstract

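This notebook defines the Income, Electricity and Water table models for Sorsogon and runs the main sampling and reweighting algorithm (`run_calibrated_model`) for the benchmark year.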

Import libraries

In [2]:
from smum.microsim.run import run_calibrated_model
from smum.microsim.table import TableModel
/usr/lib/python3.6/site-packages/h5py-2.7.1-py3.6-linux-x86_64.egg/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Global variables

In [3]:
# --- Sampling setup ---------------------------------------------------------
# Iteration count; passed to run_calibrated_model and appended to the run name.
iterations = 1000
# The number of chains to run in parallel.
njobs = 4

# --- Input data -------------------------------------------------------------
# Year used to index the model tables and passed as `year` to the run.
benchmark_year = 2016
# Benchmarks file given to both TableModel and run_calibrated_model.
census_file = 'data/benchmarks_year_bias.csv'

# --- Naming and logging -----------------------------------------------------
# Run tag: suffix of the model name and the `project` argument of the run.
typ = 'resampled'
model_name = 'Sorsogon_Electricity_Water_wbias_projected_dynamic_{}'.format(typ)
# Forwarded as the `verbose` flag of the library calls.
verbose = False

Define Table model

In [4]:
# Table model that collects the Income, Electricity and Water sub-models.
tm = TableModel(
    census_file=census_file,
    verbose=verbose,
)

Income model

In [5]:
# Register 'data/table_inc.csv' with the table model under the name 'Income'.
tm.add_model('data/table_inc.csv', 'Income')

# Education: dynamic update with the call's default settings.
tm.update_dynamic_model('Income', specific_col='Education')

# FamilySize: update the 'mu' value, addressed as 'Size', without averaging.
tm.update_dynamic_model(
    'Income',
    specific_col='FamilySize',
    specific_col_as='Size',
    val='mu',
    compute_average=0,
)

# Age: update the 'mu' value, without averaging.
tm.update_dynamic_model(
    'Income',
    specific_col='Age',
    val='mu',
    compute_average=0,
)
In [6]:
# Display the 'Income' parameter table for the 2020 entry of the model.
tm.models['Income'].loc[2020]
Out[6]:
co_mu co_sd p mu sd dis ub lb
i_Intercept NaN NaN 1147.66 NaN NaN Deterministic NaN NaN
i_Sex 919.012059036333 161.50344091572538 0.243795 NaN NaN Bernoulli NaN NaN
i_Urbanity 7105.2244566329355 127.94148635675795 0.6356 NaN NaN Bernoulli NaN NaN
i_FamilySize 1666.846395220964 29.03482607534048 NaN 3.70878 1.83794 Poisson 10 1
i_Age 116.57589770606201 4.681393204635 NaN 52.5153 12.2451 Normal 100 18
i_Education 1.0,6023.86254599,11959.091528,18727.4606703,1... 1e-10,140.904404522,217.208790314,282.17614554... 0.2430379746835443,0.21581625995041107,0.25540... NaN NaN Categorical NaN NaN
In [7]:
# Income formula: the intercept plus one "c_<var> * <var>" term for every
# non-intercept row of the benchmark-year Income table.
formula_inc = "i_Intercept+" + "+".join(
    "c_{0} * {0}".format(term)
    for term in tm.models['Income'][benchmark_year].index
    if term != 'i_Intercept'
)
tm.add_formula(formula_inc, 'Income')
In [8]:
# Print the formula registered for 'Income' as a visual check.
tm.print_formula('Income')
Income =
         i_Intercept +
         c_i_Sex * i_Sex +
         c_i_Urbanity * i_Urbanity +
         c_i_FamilySize * i_FamilySize +
         c_i_Age * i_Age +
         c_i_Education * i_Education +

Electricity model

In [9]:
# Register 'data/table_elec.csv' as the 'Electricity' model, with 'yes' as
# the reference category.
tm.add_model(
    'data/table_elec.csv',
    'Electricity',
    reference_cat=['yes'],
)

# Income: update the 'mu' value, without averaging.
tm.update_dynamic_model(
    'Electricity',
    specific_col='Income',
    val='mu',
    compute_average=False,
)
In [10]:
# Display the 'Electricity' parameter table for 2016 (the same value as
# `benchmark_year`).
tm.models['Electricity'].loc[2016]
Out[10]:
co_mu co_sd p mu sd dis ub lb
e_Intercept NaN NaN 3.29998 NaN NaN Deterministic NaN NaN
e_Lighting 0.825662 18.6676 0.946022 NaN NaN Bernoulli NaN NaN
e_TV 18.7899 1.75962 0.964932 NaN NaN Bernoulli NaN NaN
e_Cooking 28.8862 1.96894 0.0142662 NaN NaN Bernoulli NaN NaN
e_Refrigeration 59.2432 1.55605 0.602102 NaN NaN Bernoulli NaN NaN
e_AC 203.323 3.13016 0.256521 NaN NaN Bernoulli NaN NaN
e_Urban 24.5935 1.39104 1 NaN NaN Bernoulli NaN NaN
e_Income 0.00142607 4.10201e-05 NaN 190472 1904.72 None inf 0
In [11]:
# Electricity formula: the intercept plus one "c_<var> * <var>" term for every
# row of the benchmark-year table, except the intercept itself and the two
# variables that are wired to other names below.
formula_elec = "e_Intercept+" + "+".join(
    "c_{0} * {0}".format(term)
    for term in tm.models['Electricity'][benchmark_year].index
    if term not in ('e_Intercept', 'e_Income', 'e_Urban')
)
# The urban coefficient multiplies the Income model's urbanity variable.
formula_elec += '+c_e_Urban * i_Urbanity'
# The income coefficient multiplies the 'Income' term.
formula_elec += '+c_e_{0} * {0}'.format('Income')
In [12]:
# Attach the assembled formula string to the 'Electricity' model.
tm.add_formula(formula_elec, 'Electricity')
In [13]:
# Print the formula registered for 'Electricity' as a visual check.
tm.print_formula('Electricity')
Electricity =
         e_Intercept +
         c_e_Lighting * e_Lighting +
         c_e_TV * e_TV +
         c_e_Cooking * e_Cooking +
         c_e_Refrigeration * e_Refrigeration +
         c_e_AC * e_AC +
         c_e_Urban * i_Urbanity +
         c_e_Income * Income +

Water model

In [14]:
# Register 'data/table_water.csv' with the table model under the name 'Water'.
tm.add_model('data/table_water.csv', 'Water')

# Education: dynamic update with the call's default settings.
tm.update_dynamic_model('Water', specific_col='Education')

# FamilySize: update the 'mu' value, addressed as 'Size', without averaging.
tm.update_dynamic_model(
    'Water',
    specific_col='FamilySize',
    specific_col_as='Size',
    val='mu',
    compute_average=0,
)

# Age: update the 'mu' value, without averaging.
tm.update_dynamic_model(
    'Water',
    specific_col='Age',
    val='mu',
    compute_average=0,
)
In [15]:
# Display the 'Water' parameter table for the 2020 entry of the model.
tm.models['Water'].loc[2020]
Out[15]:
co_mu co_sd p dis mu sd ub lb
w_Intercept NaN NaN -601.592 Deterministic NaN NaN NaN NaN
w_Sex 98.49504620801835 29.44380722589748 0.243795 None NaN NaN NaN NaN
w_Urbanity 1000.9789077676428 25.415910606032206 0.6356 None NaN NaN NaN NaN
w_Total_Family_Income 0.05318701200857999 0.0009823058551951082 NaN None NaN NaN NaN NaN
w_FamilySize 49.73935151831777 5.897790558149098 NaN None 3.70878 1.83794 NaN NaN
w_Age 6.088941881654669 0.9127405886772298 NaN None 52.5153 12.2451 NaN NaN
w_Education 1.0,214.4011453125436,260.32727427717964,101.7... 1e-10,28.815802440470176,40.0574490885231,49.9... 0.2430379746835443,0.21581625995041107,0.25540... None;i;Categorical NaN NaN NaN NaN
In [16]:
# Water formula: the intercept plus one term per remaining row of the
# benchmark-year table. Each coefficient "c_w_<name>" multiplies the variable
# "i_<name>", i.e. the row name with everything up to its first underscore
# replaced by the "i_" prefix.
formula_water = "w_Intercept+" + "+".join(
    "c_{0} * {1}".format(term, "i_" + term.partition('_')[2])
    for term in tm.models['Water'][benchmark_year].index
    if term not in ('w_Intercept', 'w_Total_Family_Income', 'w_Education')
)
# These two terms are added by hand with explicitly chosen variable names.
formula_water += '+c_w_Total_Family_Income*Income'
formula_water += '+c_w_Education*i_Education'
In [17]:
# Attach the assembled formula string to the 'Water' model.
tm.add_formula(formula_water, 'Water')
In [18]:
# Print the formula registered for 'Water' as a visual check.
tm.print_formula('Water')
Water =
         w_Intercept +
         c_w_Sex * i_Sex +
         c_w_Urbanity * i_Urbanity +
         c_w_FamilySize * i_FamilySize +
         c_w_Age * i_Age +
         c_w_Total_Family_Income*Income +
         c_w_Education*i_Education +

Build the model and save it to Excel

In [19]:
# Build the model from the table model; the result is the first argument of
# run_calibrated_model.
table_model = tm.make_model()
In [20]:
# Export the table model to Excel; the recorded output reports one
# data/tableModel_<name>.xlsx workbook per model.
tm.to_excel()
creating data/tableModel_Income.xlsx
creating data/tableModel_Electricity.xlsx
creating data/tableModel_Water.xlsx

Define model variables

In [21]:
# Age-bin edges for `i_Age`.
# NOTE(review): presumably consumed as half-open bins [lower, upper) - confirm
# against how run_calibrated_model applies `to_cat`.
cut = [0, 19, 26, 36, 46, 56, 66, 76, 86, 101]

# One label per bin, derived from consecutive edges so that labels and edges
# cannot drift apart: 'age_<lower edge>_<upper edge - 1>', e.g. the edges
# 19 and 26 give 'age_19_25'.
labels = ['age_{}_{}'.format(lower, upper - 1)
          for lower, upper in zip(cut, cut[1:])]

# Variable name -> [bin edges, bin labels]; passed to the run as `to_cat`.
to_cat = {'i_Age': [cut, labels]}

# Column names passed to the run as `drop_col_survey`.
drop_col_survey = ['e_Income', 'e_Urban', 'w_Total_Family_Income', 'w_Education']
In [ ]:
# Run the calibrated model for the benchmark year. The run name combines the
# model name with the iteration count.
# NOTE(review): an optional `rep` mapping was tried here earlier and is
# currently disabled.
fw = run_calibrated_model(
    table_model,
    project=typ,
    njobs=njobs,
    census_file=census_file,
    year=benchmark_year,
    population_size=False,
    name='{}_{}'.format(model_name, iterations),
    to_cat=to_cat,
    iterations=iterations,
    verbose=verbose,
    drop_col_survey=drop_col_survey,
)