
    >hI                         S r SSKJrJr  SSKrSSKrSSKJ	s  J
r  SSKJrJr  SS jrSS jrS rS r " S	 S
5      r " S S\5      rS rS rS r " S S5      rg)a  Tools for working with groups

This provides several functions to work with groups and a Group class that
keeps track of the different representations and has methods to work more
easily with groups.


Author: Josef Perktold,
Author: Nathaniel Smith, recipe for sparse_dummies on scipy user mailing list

Created on Tue Nov 29 15:44:53 2011 : sparse_dummies
Created on Wed Nov 30 14:28:24 2011 : combine_indices
changes: add Group class

Notes
~~~~~

This reverses the class I used before, where the class was for the data and
the group was auxiliary. Here, it is only the group, no data is kept.

sparse_dummies needs checking for corner cases, e.g.
what if a category level has zero elements? This can happen with subset
    selection even if the original groups where defined as arange.

Not all methods and options have been tried out yet after refactoring

need more efficient loop if groups are sorted -> see GroupSorted.group_iter
    )lrangelzipN)Index
MultiIndexc                    [        U [        5      (       a  [        R                  " U 5      n O[        R                  " U 5      n U R
                  nU R                  S:H  nU(       an  U R                  S   nU R                  R                  (       d  [        R                  " U SS9n U R                  SU R
                  4/U R                  S   -  5      nOU n[        R                  " USSS9u  pn
U(       a!  UR                  U5      R                  SW5      nU(       aC  U Vs/ s H2  nXR                  S	/[        US
   5      -  5      -   [        U5      -  PM4     nnXX4$ XU4$ s  snf )zIuse np.unique to get integer group indices for product, intersection
          C)order T)return_indexreturn_inverse%sr   )
isinstancetuplenpcolumn_stackasarraydtypendimshapeflagsc_contiguousarrayviewuniquereshapejoinlen)groupsprefixsepreturn_labelsdtis2dncolsgroups_uniuni_idxuni_inviilabels                oC:\Users\julio\OneDrive\Documentos\Trabajo\Ideas Frescas\venv\Lib\site-packages\statsmodels/tools/grouputils.pycombine_indicesr/   %   s:    &%  (F#	BKK1DQ||((XXfC0F++FLL12V\\!_DEIIgD59;C' hhrl""2u-  R 4&SV"455rB 	  ++$$	 s   !9E$c                 2   [         R                  " U 5      n [         R                  " U5      R                  5       nU R                  S:X  a
  U SS2S4   n O"U R                  S:  a  U(       a  [	        S5      eU(       a  [         R
                  " U5      SU R                  S   -  :  a  [        R                  " U5      S   n[         R                  " [        U R                  S   5       Vs/ s H  n[         R                  " XSS2U4   S9PM      sn5      $ [         R                  " U5      n[         R                  " [        U5      /[        U R                  SS 5      -   5      n[!        U5       H  u  pgXU:H     R#                  S5      XV'   M     U$ s  snf )zsimple bincount version, again

group : ndarray, integer
    assumed to be consecutive integers

no dtype checking because I want to raise in that case

uses loop over columns of x

for comparison, simple python loop
r	   Nr   znot implemented yetr   )weights)r   r   squeezer   
ValueErrormaxr   pd	factorizer   rangebincountr   zerosr    list	enumeratesum)xgroupuse_bincountcoluniquesresultr,   cats           r.   
group_sumsrD   P   sD    	

1AJJu%%'Evv{agJ	
!.// 66%=1qwwqz>)LL'*Exx !,,C EQV95,
 	
 ))E"3w<.4+<<= )GBC<,,Q/FJ *s   +%Fc                     [         R                  " US5      (       a!  [        R                  " U R                  U5      $ U R                  U-  $ )z]sum by groups given group dummy variable

group_dummy can be either ndarray or sparse matrix
N)	data_util_is_using_ndarray_typer   dotT)r=   group_dummys     r.   group_sums_dummyrK   w   s;    
 ''T::vvacc;''ss[      c                     SSK Jn  [        R                  " [	        U 5      S-   5      n[        R
                  " [	        U 5      [        R                  S9nUR                  X0U45      nU$ )a  create a sparse indicator from a group array with integer labels

Parameters
----------
groups : ndarray, int, 1d (nobs,)
    an array of group indicators for each observation. Group levels are
    assumed to be defined as consecutive integers, i.e. range(n_groups)
    where n_groups is the number of group levels. A group level with no
    observations for it will still produce a column of zeros.

Returns
-------
indi : ndarray, int8, 2d (nobs, n_groups)
    an indicator array with one row per observation, that has 1 in the
    column of the group level for that observation

Examples
--------

>>> g = np.array([0, 0, 2, 1, 1, 2, 0])
>>> indi = dummy_sparse(g)
>>> indi
<7x3 sparse matrix of type '<type 'numpy.int8'>'
    with 7 stored elements in Compressed Sparse Row format>
>>> indi.todense()
matrix([[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [1, 0, 0]], dtype=int8)


current behavior with missing groups
>>> g = np.array([0, 0, 2, 0, 2, 0])
>>> indi = dummy_sparse(g)
>>> indi.todense()
matrix([[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0]], dtype=int8)
r   )sparser	   )r   )scipyrN   r   aranger    onesint8
csr_matrix)r!   rN   indptrdataindis        r.   dummy_sparserW      sP    \ YYs6{1}%F773v;bgg.DdF34DKrL   c                   T    \ rS rSrSS jrS rS rSS\4S jrS r	SS	 jr
SS
 jrSrg)Group   c                    X l         [        U5      u  p4nX4UsU l        U l        U l        [        U R                  5      U l        SU l        U R                   U l        U R                  (       a  U R                  S-   U l        g g )N.=)	namer/   	group_intr*   r)   r    n_groups	separatorr"   )selfr>   r^   r)   r*   r+   s         r.   __init__Group.__init__   sm     	 / 6g 25w.dhDHH ii;;+++DK rL   c                 B    [         R                  " U R                  5      $ N)r   r8   r_   rb   s    r.   countsGroup.counts   s    {{4>>**rL   c                 2   U R                   nU R                  nU R                  nUR                  S:  aA  U Vs/ s H2  nXR	                  S/[        US   5      -  5      -   [        U5      -  PM4     nnU$ U Vs/ s H
  oASU-  -   PM     nnU$ s  snf s  snf )Nr	   r   r   )r"   r)   ra   r   r   r    r   )rb   r"   r)   r#   r,   r-   s         r.   labelsGroup.labels   s    hhnn88a<"$" XXtfSQ[&899U2YF"  $  366#BdRi'#E6	$ 7s   9B:BNFc                     U R                   nUb  [        [        U5      5      nXQ	 XE   nU R                  nU(       d"  USS2S4   USSS24   :H  R	                  U5      $ [        U R                  5      $ )zT
drop_idx is only available if sparse=False

drop_idx is supposed to index into uni
N)r)   r   r    r>   astyperW   r_   )rb   drop_idxrN   r   r)   idxr>   s          r.   dummyGroup.dummy   sn     hhS"C(C

!T'Nc$'l2::5AA//rL   c                 r    [        XR                  5      (       a  UR                  nU R                  X45      $ rf   )r   	__class__r>   )rb   others     r.   interactionGroup.interaction   s+    e^^,,KKE~~tm,,rL   c                 *    [        XR                  US9$ N)r?   )rD   r_   )rb   r=   r?   s      r.   rD   Group.group_sums   s    !^^,GGrL   c                     [        [        U5      5      n[        X-  U R                  US9nXU R                     -
  nXT4$ ry   )floatr    rD   r_   )rb   r=   r?   nobsmeans_g
x_demeaneds         r.   group_demeanGroup.group_demean   s@    SV}QXt~~*6800
""rL   )r_   r`   r^   r"   ra   r)   r*   r   T)__name__
__module____qualname____firstlineno__rc   rh   rk   intrq   rv   rD   r   __static_attributes__ rL   r.   rY   rY      s/    ,$+ "%s 0&-
H#rL   rY   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )GroupSortedi  c                    > [         U R                  U ]  XS9  [        R                  " [        R
                  " U5      5      S   S-   R                  5       n[        S/U-   U[        U5      /-   5      U l	        g )Nr^   r   r	   )
superrt   rc   r   nonzerodifftolistr   r    groupidx)rb   r>   r^   rp   rt   s       r.   rc   GroupSorted.__init__  sa    dnnd,U,>zz"''%.)!,Q.668aS3Ys5zl(:;rL   c              #   P   #    U R                    H  u  p[        X5      v   M     g 7frf   )r   slice)rb   lowupps      r.   
group_iterGroupSorted.group_iter
  s     HC/! &s   $&c                 f    [         R                  " U R                  5      SS2S4   U-
  nX:*  nX#   $ )a  return the index array for lagged values

Warning: if k is larger then the number of observations for an
individual, then no values for that individual are returned.

TODO: for the unbalanced case, I should get the same truncation for
the array with lag=0. From the return of lag_idx we would not know
which individual is missing.

TODO: do I want the full equivalent of lagmat in tsa?
maxlag or lag or lags.

not tested yet
Nr	   )r   r   r   )rb   laglag_idxmask_oks       r.   lag_indicesGroupSorted.lag_indices  s6     **T]]+AqD1C7> rL   )r   r   )	r   r   r   r   rc   r   r   r   __classcell__)rt   s   @r.   r   r     s    <"   rL   r   c                     U S   n[        U[        [        [        R                  [
        R                  [
        R                  45      (       a  gg)z
Checks if the first item of an array-like object is also array-like
If so, we have a MultiIndex and returns True. Else returns False.
r   TF)r   r:   r   r   ndarrayr5   Series	DataFrame)r=   items     r.   _is_hierarchicalr   $  s7    
 Q4D$ubjj"))R\\JKKrL   c                 0    [         R                  " U /SU06$ )Nnames)r   from_tuplesindexr   s     r.   _make_hierarchical_indexr   1  s    !!E78%88rL   c                     [        U R                  5      n[        [        [        U5      5      5      n[        U5       Vs/ s H  nSU-   S-   R	                  U5      PM     sn$ s  snf )Nz	group{0:0})r    r   strr7   format)r   n_namespadis       r.   _make_generic_namesr   5  sS    %++G
c#g,
 C5:7^D^[_S ((+^DDDs    !A$c                       \ rS rSrSS jr\S 5       r\S 5       r\S 5       r\S 5       r	SS jr
SS	 jrSS
 jrSS jrSS jrSS jrSS jrSS jrS rSS jrSS jrSrg)Groupingi;  Nc                 B   [        U[        [        45      (       a2  Ub(  [        US5      (       a  UR	                  USS9  OX!l        Xl        O[        U5      (       a  [        X5      U l        O[        XS9U l        Uc[  [        U R                  5      n[        U R                  S5      (       a  U R                  R	                  USS9  OX R                  l        [        U R                  5      U l        [        U R                  R
                  5      U l        SU l        g)a  
index : index-like
    Can be pandas MultiIndex or Index or array-like. If array-like
    and is a MultipleIndex (more than one grouping variable),
    groups are expected to be in each row. E.g., [('red', 1),
    ('red', 2), ('green', 1), ('green', 2)]
names : list or str, optional
    The names to use for the groups. Should be a str if only
    one grouping variable is used.

Notes
-----
If index is already a pandas Index then there is no copy.
N	set_namesTinplacer   )r   r   r   hasattrr   r   r   r   r   r   r    r}   nlevelsslicesrb   r   r   s      r.   rc   Grouping.__init__<  s     eeZ011 5+..OOE4O8"'KJ&&5eC
"55
}+DJJ74::{33JJ(((=',JJ$

O	4::++,rL   c                     [        U R                  S5      (       a  U R                  R                  $ U R                  R                  $ )Nlevshape)r   r   r   r   rg   s    r.   index_shapeGrouping.index_shapeb  s3    4::z**::&&&::###rL   c                     [        U R                  S5      (       a  U R                  R                  $ [        R                  " U R                  5      R                  $ )Nlevels)r   r   r   r5   Categoricalrg   s    r.   r   Grouping.levelsi  s<    4::x((::$$$>>$**-444rL   c                     [        U R                  SS 5      nUc`  [        U R                  S5      (       a  U R                  R                  nU$ [        R
                  " U R                  5      R                  S    nU$ )Ncodesrk   )getattrr   r   rk   r5   r   r   )rb   r   s     r.   rk   Grouping.labelsp  sf     

GT2=tzz8,,

))  tzz288>rL   c                 .    U R                   R                  $ rf   r   rg   s    r.   group_namesGrouping.group_names{  s    zzrL   c                 8    Uc  U R                   n[        X5      n g)z
Resets the index in-place.
N)r   r   r   s      r.   reindexGrouping.reindex  s     =$$E%rL   c                    U R                   R                  U5      R                  5       n[        R                  " U5      nUR                  5         [        U R                   [        5      (       a3  U Vs/ s H  nU R                   R                  X1S9S   PM!     snU l	        gU Vs/ s H  o0R                   R                  U5      PM     snU l	        gs  snf s  snf )z
Sets the slices attribute to be a list of indices of the sorted
groups for the first index level. I.e., self.slices[0] is the
index where each observation is in the first (sorted) group.
levelr   N)r   get_level_valuesr   r   r   sortr   r   get_loc_levelr   get_loc)rb   r   r!   r=   s       r.   
get_slicesGrouping.get_slices  s     ,,U3::<&!djj*--$*,$*q  ::33A3CAF$*,DK ;AA&Q::--a0&ADK, Bs   3&C&$Cc                 T    [         R                  " U R                  U   5      U l        g)zQ
Sets the attribute counts to equal the bincount of the (integer-valued)
labels.
N)r   r8   rk   rh   rb   r   s     r.   count_categoriesGrouping.count_categories  s     kk$++e"45rL   c                    U(       d  U R                   nU(       al  [        R                  " [        [	        U5      5      US9nUR                  5       nUR                   R                  UR                   5      (       d  [        S5      eU(       a2  [	        U5      [	        UR                  5       5      :w  a  [        S5      egg)zSanity checksr   zData is not be sortedzDuplicate index entriesN)	r   r5   r   r   r    r   equals	Exceptionr   )rb   	is_sortedr   r   testtest_sorteds         r.   check_indexGrouping.check_index  s    JJE<<s5z 2%@D))+K::$$[%6%677 7885zS00 9:: 1 rL   c                     Uc  U R                   n[        R                  " US5      (       a{  UR                  S:X  a&  [        R
                  " XSS9nUR                  5       nO#[        R                  " XS9nUR                  SS9n[        R                  " U5      UR                   4$ [        R                  " US5      (       a0  UnUR                  U5      nUR                  5       nX3R                   4$ Sn[        U5      e)	zApplies a (potentially hierarchical) sort operation on a numpy array
or pandas series/dataframe based on the grouping index or a
user-supplied index.  Returns an object of the same type as the
original data as well as the matching (sorted) Pandas index.
Nr	   T)r   copyr   Fr   z7data must be a Numpy array or a Pandas Series/DataFrame)r   rF   rG   r   r5   r   
sort_indexr   r   r   _is_using_pandasr   r3   )rb   rU   r   outmsgs        r.   r   Grouping.sort  s     =JJE++D$77yyA~ii=nn&ll45nnUn388C=#))++''d33C++e$C.."C		>!KCS/!rL   c                 
   UR                   S   U R                  :w  a  [        S5      eUR                  US9R                  " U40 UD6nSUR                   ;   a  [
        R                  " U5      $ [
        R                  " U5      $ )z]Apply function to each column, by group
Assumes that the dataframe already has a proper indexr   z/dataframe does not have the same shape as indexr   r	   )r   r}   r   groupbyapplyr   ravelr   )rb   	dataframefunctionr   kwargsr   s         r.   transform_dataframeGrouping.transform_dataframe  so     ??1*MNNe,228FvF		>88C= 88C= rL   c                     UR                   S   U R                  :w  a  [        S5      e[        R                  " XR
                  S9nU R                  " XR4SU0UD6$ )z0Apply function to each column, by group
        r   +array does not have the same shape as indexr   r   )r   r}   r   r5   r   r   r   )rb   r   r   r   r   r   s         r.   transform_arrayGrouping.transform_array  sZ     ;;q>TYY&IJJLLjj9	''	 25 2*02 	2rL   c                    [         R                  " U5      nUR                  S   U R                  :w  a  [	        S5      eU R                  US9  / nU R                   HI  nUR                  S:X  a	  XSS24   nOUR                  S:X  a  X   nUR                  U" WU40 UD65        MK     [         R                  " U5      nUR                  SUR                  S   5      $ )zApply function to each group. Similar to transform_array but does
not coerce array to a DataFrame and back and only works on a 1D or 2D
numpy array. function is called function(group, group_idx, **kwargs).
r   r   r   r   Nr	   r   )r   r   r   r}   r   r   r   r   appendr   r   )rb   r   r   r   r   	processedssubsets           r.   transform_slicesGrouping.transform_slices  s    
 

5!;;q>TYY&IJJe$	AzzQ!tqXfa:6:;  HHY'	  Y__R%899rL   c                 8    U R                  SS9  U R                  $ )Nr	   r   rW   _dummiesrg   s    r.   dummies_timeGrouping.dummies_time  s    "}}rL   c                 8    U R                  US9  U R                  $ )Nr   r  r   s     r.   dummies_groupsGrouping.dummies_groups  s    &}}rL   c                 @    [        U R                  U   5      nX l        g)a  create a sparse indicator from a group array with integer labels

Parameters
----------
groups : ndarray, int, 1d (nobs,)
    An array of group indicators for each observation. Group levels
    are assumed to be defined as consecutive integers, i.e.
    range(n_groups) where n_groups is the number of group levels.
    A group level with no observations for it will still produce a
    column of zeros.

Returns
-------
indi : ndarray, int8, 2d (nobs, n_groups)
    an indicator array with one row per observation, that has 1 in the
    column of the group level for that observation

Examples
--------

>>> g = np.array([0, 0, 2, 1, 1, 2, 0])
>>> indi = dummy_sparse(g)
>>> indi
<7x3 sparse matrix of type '<type 'numpy.int8'>'
    with 7 stored elements in Compressed Sparse Row format>
>>> indi.todense()
matrix([[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [1, 0, 0]], dtype=int8)


current behavior with missing groups
>>> g = np.array([0, 0, 2, 0, 2, 0])
>>> indi = dummy_sparse(g)
>>> indi.todense()
matrix([[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0]], dtype=int8)
N)rW   rk   r  )rb   r   rV   s      r.   rW   Grouping.dummy_sparse  s    ^ DKK./rL   )r  rh   r   r   r}   r   rf   )NN)r   )TTN)r   r   r   r   rc   propertyr   r   rk   r   r   r   r   r   r   r   r   r  r  r
  rW   r   r   rL   r.   r   r   ;  s    $L $ $ 5 5      &B 6;"4	!2:*0rL   r   )r   r\   Fr   )__doc__statsmodels.compat.pythonr   r   numpyr   pandasr5   statsmodels.tools.datatoolsrU   rF   r   r   r/   rD   rK   rW   rY   r   r   r   r   r   r   rL   r.   <module>r     sk   8 3   * * $'%V$N!4nE# E#P %  B
9En nrL   