"""Analyze a set of multiple variables with linear models

multiOLS:
    take a model and test it on a series of variables defined over a
    pandas dataset, returning a summary for each variable

multigroup:
    take a boolean vector and the definition of several groups of variables
    and test if the group has a fraction of true values higher than the
    rest. It allows testing whether the variables in the group are
    significantly more often significant than those outside the group.
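
A minimal sketch of how the two functions are typically chained
(not a runnable doctest: ``df`` stands for any numeric DataFrame and the
group definition is purely illustrative):

>>> pvals = multiOLS('GNP + 1', df)['adj_pvals', '_f_test']
>>> multigroup(pvals < 0.05, {'economy': ['GNPDEFL', 'POP']})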
    )dmatrixN)OLS)statsc                    U" X40 UD6R                  5       n[        R                  " UR                  UR                  S.5      n[        R
                  " UR                  UR                  UR                  US.5      n[        R
                  " SUR                  0SUR                  0S.5      n[        R                  " Xg/5      R                  5       nUR                  5       $ )z}return a series containing the summary of a linear model

All the exceding parameters will be redirected to the linear model
)r2adj_r2)paramspvalsstd
statistics_f_test)r	   r
   )fitpdSeriesrsquaredrsquared_adj	DataFramer	   pvaluesbsefvaluef_pvalueconcatunstackdropna)	model_endog
model_exog
model_typekwargsmodel_resultr   	result_df	fisher_df
res_seriess	            rC:\Users\julio\OneDrive\Documentos\Trabajo\Ideas Frescas\venv\Lib\site-packages\statsmodels/sandbox/multilinear.py_model2dataframer$      s     k@@DDFL,"7"7(557 8J (;(;'3';';%1%5%5,68 9I
 L4G4G(H(1<3H3H'IK LI I12::<J    fdr_bh皙?c                 t   Uc;  UR                    Vs/ s H$  nX   R                  [        :w  d  M  X;  d  M"  UPM&     nn[        U[        5      (       a  U/nUb  UR
                  U   n0 n	[        XSS9n
U H  n X   n[        XU40 UD6nXU'   M     [        R                  " U	5      nUR                  R                  S/5      nSUR                  l        [        R                   nU H)  u  nnUS:w  a  M  UUU4   nU" UX4S9S	   nUUS
U-   U4'   M+     U$ s  snf ! [         a    [        US-   US9n Nf = f)a  apply a linear model to several endogenous variables on a dataframe

Take a linear model definition via formula and a dataframe that will be
the environment of the model, and apply the linear model to a subset
(or all) of the columns of the dataframe. It will return a dataframe
with part of the information from the linear model summary.

Parameters
----------
model : str
    formula description of the model
dataframe : pandas.dataframe
    dataframe where the model will be evaluated
column_list : list[str], optional
    Names of the columns to analyze with the model.
    If None (Default) it will perform the function on all the
    eligible columns (numerical type and not in the model definition)
model_type : model class, optional
    The type of model to be used. The default is the linear model.
    Can be any linear model (OLS, WLS, GLS, etc.)
method : str, optional
    the method used to perform the pvalue correction for multiple testing.
    default is the Benjamini/Hochberg, other available methods are:

        `bonferroni` : one-step correction
        `sidak` : one-step correction
        `holm-sidak` :
        `holm` :
        `simes-hochberg` :
        `hommel` :
        `fdr_bh` : Benjamini/Hochberg
        `fdr_by` : Benjamini/Yekutieli

alpha : float, optional
    the significance level used for the pvalue correction (default 0.05)
subset : bool array
    the selected rows to be used in the regression

all the other parameters will be directed to the model creation.

Returns
-------
summary : pandas.DataFrame
    a dataframe containing an extract from the summary of the model
    obtained for each column. It will give the whole-model F test
    result and p-value, and the regression value and standard deviation
    for each of the regressors. The DataFrame has a hierarchical column
    structure, divided as:

        - params: contains the parameters resulting from the models. Has
        an additional column named _f_test containing the result of the
        F test.
        - pvals: the pvalue results of the models. Has the _f_test column
        for the significance of the whole test.
        - adj_pvals: the corrected pvalues via the multitest function.
        - std: uncertainties of the model parameters
        - statistics: contains the r squared statistics and the adjusted
        r squared.

Notes
-----
The main application of this function is in systems biology, to perform
a linear model test on a lot of different parameters, like the
expression levels of several genes.

See Also
--------
statsmodels.stats.multitest
    contains several functions to perform the multiple p-value correction

Examples
--------
Using the longley data as dataframe example

>>> import statsmodels.api as sm
>>> data = sm.datasets.longley.load_pandas()
>>> df = data.exog
>>> df['TOTEMP'] = data.endog

This will perform the specified linear model on all the
other columns of the dataframe
>>> multiOLS('GNP + 1', df)

This selects only a certain subset of the columns
>>> multiOLS('GNP + 0', df, ['GNPDEFL', 'TOTEMP', 'POP'])

It is possible to specify a transformation also on the target column,
conforming to the patsy formula specification
>>> multiOLS('GNP + 0', df, ['I(GNPDEFL**2)', 'center(TOTEMP)'])

It is possible to specify the subset of the dataframe
on which to perform the analysis
>>> multiOLS('GNP + 1', df, subset=df.GNPDEFL > 90)

Even a single column name can be given without enclosing it in a list
>>> multiOLS('GNP + 0', df, 'GNPDEFL')
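
The p-value correction method and the model class can also be changed;
a sketch, assuming ``weights`` is an array with one value per row of ``df``
that will be forwarded to the WLS model
>>> multiOLS('GNP + 0', df, method='bonferroni')
>>> multiOLS('GNP + 0', df, model_type=sm.WLS, weights=weights)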
	dataframe)datareturn_typez + 0)r*   )r
   r   zendogenous varsr
   methodalpha   adj_)columnsdtypeobject
isinstancestrlocr   KeyErrorr$   r   r   Tsort_valuesindexnamer   multipletests)modelr)   column_listr-   r.   subsetr   r   r;   col_resultsr   col_namer   ressummarysmtkey1key2p_values	correcteds                       r#   multiOLSrI   +   sx   L (1(9(9 P(9"..&8 =A=N (9 P +s##"mMM&)	K KHJ	E#-K {
MfM #H   ll;'Gii##%9$:;G*GMM


Ct7?4:&=a@	 (1t#$   NMP$  	E!(V"3)DK	Es!   DDD;DD76D7c                    SSK JnJn  S[        U 5      -  nS[        R
                  " U 5      -  nU Vs/ s H  oU R                  ;   d  M  UPM     n	nU Vs/ s H  oU R                  ;  d  M  UPM     n
nU
(       a'  Sn[        R                  " UR                  X5      5        S[        U	5      -  nS[        U	 Vs/ s H  oU   (       d  M  UPM     sn5      -  nSX-
  -  nSX}-
  -  nSXg-
  U-
  -  nU(       a  UOUnUU/X//nU" [        R                  " U5      5      S   nXUU4n[        R                  " Xm-  X|-  -  5      nUUU4$ s  snf s  snf s  snf )ztest if the objects in the group are different from the general set.

The test is performed on the pvalues set (ad a pandas series) over
the group specified via a fisher exact test.
r   )fisher_exactchi2_contingencyg      ?zthe test is not well defined if the group has elements not presents in the significativity array. group name: {}, missing elements: {}r/   )scipy.statsrK   rL   lennpsumr:   loggingwarningformatarraylog)r   
group_namegroupexactrK   rL   totalstotal_significantccross_indexmissingsgroup_total
group_signgroup_nonsignextern_signextern_nonsigntesttablepvaluepartincreases                         r#   _test_groupri      sY    ;3w<FbffWo-#:eGMM'91eK::%QGMM#9q%G:; 	56K((Ks{A{!ajA{ABBJ;34M*78KF6FGN <&6Dk*],GHE"((5/"1%Fk>ADvvv**8: ;H8T!!/ ;: Bs#   EEE.E<E
E
c                    [         R                  " U 5      n [        U R                  5       5      SS1::  d  [	        S5      e[        U R                  S5      (       a&  U R                  R                  (       d  [	        S5      e0 0 0 0 0 0 S.nUR                  5        H_  u  pg[        XXr5      nUS   US   U'   US	   US
   U'   US   S   US   U'   US   S	   US   U'   US   S   US   U'   US   S   US   U'   Ma     [         R                  " U5      R                  S5      n	U(       d  XR                     n	[        R                  n
U
" U	S   SUS9S	   nXS'   U	$ )aG  Test if the given groups are different from the total partition.

Given a boolean array, test if each group has a proportion of positives
different from the overall proportion.
The test can be done as an exact Fisher test or approximated as a
Chi squared test for more speed.

Parameters
----------
pvals : pandas series of boolean
    the significance of the variables under analysis
groups : dict of list
    the name of each category of variables under examination.
    Each one maps to a list of the included variables.
exact : bool, optional
    If True (default) use the Fisher exact test, otherwise
    use the chi squared test for contingency tables.
    For a high number of elements in the array the Fisher test can
    be significantly slower than the chi squared.
keep_all : bool, optional
    if False it will drop those groups where the fraction
    of positives is below the expected result. If True (default)
    it will keep all the significant results.
alpha : float, optional
    the significance level for the pvalue correction
    on the whole set of groups (not inside the groups themselves).

Returns
-------
result_df: pandas dataframe
    for each group returns:

        pvals - the Fisher p value of the test
        adj_pvals - the adjusted pvals
        increase - the log of the odds ratio between the
            internal significant ratio and the external one
        _in_sign - significant elements inside the group
        _in_non - non significant elements inside the group
        _out_sign - significant elements outside the group
        _out_non - non significant elements outside the group

Notes
-----
This test allows seeing whether a category of variables is generally
better suited to be described by the model. For example, to see whether
a predictor gives more information on demographic or economic
parameters, create two groups containing the endogenous variables of
each category.

This function is conceived for medical datasets with a lot of variables
that can be easily grouped into functional groups. This is because
the significance of a group requires a rather large number of
composing elements.

Examples
--------
A toy example on a real dataset, the Guerry dataset from R
>>> url = "https://raw.githubusercontent.com/vincentarelbundock/"
>>> url = url + "Rdatasets/csv/HistData/Guerry.csv"
>>> df = pd.read_csv(url, index_col='dept')

evaluate the relationship between the various parameters and Wealth
>>> pvals = multiOLS('Wealth', df)['adj_pvals', '_f_test']

define the groups
>>> groups = {}
>>> groups['crime'] = ['Crime_prop', 'Infanticide',
...     'Crime_parents', 'Desertion', 'Crime_pers']
>>> groups['religion'] = ['Donation_clergy', 'Clergy', 'Donations']
>>> groups['wealth'] = ['Commerce', 'Lottery', 'Instruction', 'Literacy']

do the analysis of the significance
>>> multigroup(pvals < 0.05, groups)
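
The chi squared approximation (faster when the number of variables is
large) and the filtering of the under-represented groups can be requested
with the optional flags
>>> multigroup(pvals < 0.05, groups, exact=False, keep_all=False)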
FTzthe series should be binary	is_uniquez,series with duplicated index is not accepted)r
   rh   _in_sign_in_non	_out_sign_out_nonr   r
   r/   rh      rl   rm   rn      ro   r&   r,   	adj_pvals)r   r   setunique
ValueErrorhasattrr:   rk   itemsri   r   r9   rh   r   r<   )r
   groupsrX   keep_allr.   resultsrV   
group_listrB   r    rD   rH   s               r#   
multigroupr|      sx   V IIeEE4=0677u{{K((1F1FGHHG #),,.
%Z?'*1v$*-a&
J'*-a&)
J'),Q	:&+.q6!9Z(*-a&)
J' #1 W%11':I001	


CIg&xuEaHI&kr%   )T)TTr'   )__doc__patsyr   pandasr   statsmodels.apir   r   numpyrO   rQ   r$   rI   ri   r|    r%   r#   <module>r      sG       !  9< . ,0M`!"Hdr%   