
    >h"L                         S r SSKrSSKrSSKrSSKJr  SS jrS r	S r
S rS rS	 rS
 rS rS rSSSS\R"                  \R$                  S4S jrg)aS  
Implementation of Regression on Order Statistics for imputing left-
censored (non-detect data)

Method described in *Nondetects and Data Analysis* by Dennis R.
Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
values of a dataset.

Author: Paul M. Hobson
Company: Geosyntec Consultants (Portland, OR)
Date: 2016-06-14

    N)statsc                 h   X U      R                  USS9nX U   )    R                  USS9nXA   R                  5       XQ   R                  5       :  a9  XDU   XQ   R                  5       :*     nU(       a  Sn[        R                  " U5        [        R
                  " XE/SS9nXqU/   R                  SS9$ )a  
This function prepares a dataframe for ROS.

It sorts ascending with
left-censored observations first. Censored observations larger than
the maximum uncensored observations are removed from the dataframe.

Parameters
----------
df : DataFrame

observations : str
    Name of the column in the dataframe that contains observed
    values. Censored values should be set to the detection (upper)
    limit.

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

Returns
------
sorted_df : DataFrame
    The sorted dataframe with all columns dropped except the
    observation and censorship columns.
r   axiszKDropping censored observations greater than the max uncensored observation.T)drop)sort_valuesmaxwarningswarnpdconcatreset_index)dfobservations
censorshipr   censored
uncensoredmsgcombineds           mC:\Users\julio\OneDrive\Documentos\Trabajo\Ideas Frescas\venv\Lib\site-packages\statsmodels/imputation/ros.py	_ros_sortr      s    < Z.!--l-CH
^O$00A0FJ!!#j&>&B&B&DD\2j6N6R6R6TTU5CMM#yy(/a8H:./;;;FF    c                   ^ ^^ UU U4S jnUU U4S jnUU U4S jnS nS nT T   n[         R                  " T R                  UT4   5      n	U	R                  5         U	R                  S   S:  Ga,  T T   R                  5       U	R                  5       :  a)  [        R                  " T T   R                  5       U	/5      n	[         R                  " U	S/S9n
U" U
5      U
R                  S	S	2S
4'   U
R                  USS9U
R                  S	S	2S4'   U
R                  USS9U
R                  S	S	2S4'   U
R                  USS9U
R                  S	S	2S4'   U
R                  [        U	R                  S   S-   5      5      n
U" U
S   U
S   5      U
R                  S	S	2S4'   U
$ / SQn[         R                  " [        R                  " S[        U5      45      US9n
U
$ )a  
Computes the Cohn numbers for the detection limits in the dataset.

The Cohn Numbers are:

    - :math:`A_j =` the number of uncensored obs above the jth
      threshold.
    - :math:`B_j =` the number of observations (cen & uncen) below
      the jth threshold.
    - :math:`C_j =` the number of censored observations at the jth
      threshold.
    - :math:`\mathrm{PE}_j =` the probability of exceeding the jth
      threshold
    - :math:`\mathrm{DL}_j =` the unique, sorted detection limits
    - :math:`\mathrm{DL}_{j+1} = \mathrm{DL}_j` shifted down a
      single index (row)

Parameters
----------
dataframe : DataFrame

observations : str
    Name of the column in the dataframe that contains observed
    values. Censored values should be set to the detection (upper)
    limit.

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

Returns
-------
cohn : DataFrame
c                 j   > TT   U S   :  nTT   U S   :  nTT   ) nTX-  U-     R                   S   $ )zCA, the number of uncensored obs above the given threshold.
        lower_dlupper_dlr   shape)rowabovebelowdetectr   r   r   s       r   nuncen_above"cohn_numbers.<locals>.nuncen_aboveh   sY    
 < C
O3 < 3z?2 Z. %-&()//22r   c                    > TT	   U S   :  nTT	   U S   :*  nTT   ) nTT   nTX$-     R                   S   nTX-     R                   S   nXV-   $ )zFB, the number of observations (cen & uncen) below the given
threshold
r   r   r   )
r   	less_thanless_thanequalr   r   LTE_censoredLT_uncensoredr   r   r   s
          r   
nobs_below cohn_numbers.<locals>.nobs_belowx   s     |$s:6	 L)S_< n_
j> .34::1= 91288; ++r   c                 L   > TT   nTT   U   nX S   :H  nUR                  5       $ )z?C, the number of censored observations at the given
threshold.
r   )sum)r   censored_indexcensored_datacensored_belowr   r   r   s       r   
ncen_equal cohn_numbers.<locals>.ncen_equal   s8    
 J<(8&j/9!!##r   c                     U R                   S   S:  a/  U S   R                  S5      R                  [        R                  S9$ [        R                  /$ )z9Sets the upper_dl DL for each row of the Cohn dataframe. r      r   )value)r   shiftfillnanpinf)cohns    r   set_upper_limit%cohn_numbers.<locals>.set_upper_limit   sE    ::a=1
#))"-442664BBFF8Or   c                     [        U 5      n[        R                  " USS9nSUS'   [        US-
  SS5       H&  nX4S-      SX4S-      -
  X   -  X   X   -   -  -   X4'   M(     U$ )zJComputes the probability of excedance for each row of the
Cohn dataframe. float64)dtypeg        r5      r4   )lenr9   emptyrange)ABNPEjs        r   
compute_PE cohn_numbers.<locals>.compute_PE   st     FXXay)2qsB#AsGq2c7{ad2adQTkBBBE $ 	r   r   r   )columnsNr   r4   r   r#   r*   r1   prob_exceedance)r   r   r#   r*   r1   rM   )r   uniquelocsortr   minr9   hstack	DataFrameapplyreindexrD   rC   rB   )r   r   r   r#   r*   r1   r<   rJ   r/   DLsr;   dl_colss   ```         r   cohn_numbersrX   C   s   J3 ,0$	 zNM
))BFF=,67
8CHHJ yy|al!CGGI-))R-113S9:C
 ||C*6"1$"7J&*jjAj&FN"#$(JJzJ$BL!$(JJzJ$BL!||E#))A,"234)3D4H$|J\)]%%& K	B||BHHaW%67IKr   c                 z    UR                   S   S:  a%  [        R                  " US   U :*  5      u  nUS   nU$ SnU$ )a  
Locates the corresponding detection limit for each observation.

Basically, creates an array of indices for the detection limits
(Cohn numbers) corresponding to each data point.

Parameters
----------
obs : float
    A single observation from the larger dataset.

cohn : DataFrame
    DataFrame of Cohn numbers.

Returns
-------
det_limit_index : int
    The index of the corresponding detection limit in `cohn`

See Also
--------
cohn_numbers
r   r   r5   )r   r9   where)obsr;   indexdet_limit_indexs       r   _detection_limit_indexr^      sL    2 zz!}q$z*c12)  r   c                     U R                  5       nSUR                  SS2S4'   UR                  X/S9S   R                  S 5      nU$ )a6  
Ranks each observation within the data groups.

In this case, the groups are defined by the record's detection
limit index and censorship status.

Parameters
----------
df : DataFrame

dl_idx : str
    Name of the column in the dataframe the index of the
    observations' corresponding detection limit in the `cohn`
    dataframe.

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

Returns
-------
ranks : ndarray
    Array of ranks for the dataset.
r4   Nrank)byc                 "    U R                  5       $ N)cumsum)gs    r   <lambda>!_ros_group_rank.<locals>.<lambda>  s
    !((*r   )copyrO   groupby	transform)r   dl_idxr   rankss       r   _ros_group_rankrm      sN    : GGIEEIIai&-.v6i,- 
 Lr   c                     U S   nU S   nX   nUR                   U   nUR                   US-      nU(       a  SUS   -
  U-  US   S-   -  $ SUS   -
  US   US   -
  U-  US   S-   -  -   $ )aY  
ROS-specific plotting positions.

Computes the plotting position for an observation based on its rank,
censorship status, and detection limit index.

Parameters
----------
row : {Series, dict}
    Full observation (row) from a censored dataset. Requires a
    'rank', 'detection_limit', and `censorship` column.

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

cohn : DataFrame
    DataFrame of Cohn numbers.

Returns
-------
plotting_position : float

See Also
--------
cohn_numbers
r]   r`   r4   rM   r1   r#   )iloc)r   r   r;   DL_indexr`   r   dl_1dl_2s           r   _ros_plot_posrs     s    < $%Hv;DH99XD99X\"DD*++t3tL7I!7KLLD*++5F0G$O`Ja0a0^,Q.00 0 	0r   c                 n    [         R                  " U SS9u  p[         R                  R                  U5      $ )z
Computes standard normal (Gaussian) plotting positions using scipy.

Parameters
----------
observations : array_like
    Sequence of observed quantities.

Returns
-------
plotting_position : array of floats
F)fit)r   probplotnormcdf)r   ppos
sorted_ress      r   _norm_plot_posr{   =  s*     ~~l>D::>>$r   c                    ^^ U R                  UU4S jSS9nX0T      n[        R                  " USS9nUR                  5         XSR                  U T   R
                  U T      '   U$ )a  
Compute the plotting positions for the observations.

The ROS-specific plotting postions are based on the observations'
rank, censorship status, and corresponding detection limit.

Parameters
----------
df : DataFrame

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

cohn : DataFrame
    DataFrame of Cohn numbers.

Returns
-------
plotting_position : array of float

See Also
--------
cohn_numbers
c                    > [        U TT5      $ rc   )rs   )rr   r;   s    r   rf   $plotting_positions.<locals>.<lambda>j  s    -:t"Dr   r4   r   W)requirements)rT   r9   requirerP   rO   r\   )r   r   r;   plot_pos
ND_plotposND_plotpos_arrs    ``   r   plotting_positionsr   N  sf    8 xxD1xMH Z.)JZZ
=N9GLLJ%%bn56Or   c                    X   ) nX   n[         R                  " U S   U   U" X   U   5      5      nUSS u  pU" XS   U   -  U	-   5      U R                  SS2S4'   [        R                  " X   U S   X   5      U R                  SS2S4'   U $ )aC  
Executes the basic regression on order stat (ROS) proceedure.

Uses ROS to impute censored from the best-fit line of a
probability plot of the uncensored values.

Parameters
----------
df : DataFrame
observations : str
    Name of the column in the dataframe that contains observed
    values. Censored values should be set to the detection (upper)
    limit.
censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)
transform_in, transform_out : callable
    Transformations to be applied to the data prior to fitting
    the line and after estimated values from that line. Typically,
    `np.log` and `np.exp` are used, respectively.

Returns
-------
estimated : DataFrame
    A new dataframe with two new columns: "estimated" and "final".
    The "estimated" column contains of the values inferred from the
    best-fit line. The "final" column contains the estimated values
    only where the original observations were censored, and the original
    observations everwhere else.
ZprelimNrA   	estimatedfinal)r   
linregressrO   r9   rZ   )
r   r   r   transform_intransform_outuncensored_maskcensored_mask
fit_paramsslope	intercepts
             r   _imputer   u  s    D ~oONM !!
9o&R%o67J ""1~E
 +5i=3O+OR[+[\BFF1k>"."[/2CSTBFF1g:Ir   c                 p   [        XUS9n[        XUS9nXa   R                  [        U4S9UR                  SS2S4'   [        USU5      UR                  SS2S4'   [        XbU5      UR                  SS2S4'   [        R                  R                  US   5      UR                  SS2S4'   [        XaX#U5      $ )aV  
DataFrame-centric function to impute censored valies with ROS.

Prepares a dataframe for, and then esimates the values of a censored
dataset using Regression on Order Statistics

Parameters
----------
df : DataFrame

observations : str
    Name of the column in the dataframe that contains observed
    values. Censored values should be set to the detection (upper)
    limit.

censorship : str
    Name of the column in the dataframe that indicates that a
    observation is left-censored. (i.e., True -> censored,
    False -> uncensored)

transform_in, transform_out : callable
    Transformations to be applied to the data prior to fitting
    the line and after estimated values from that line. Typically,
    `np.log` and `np.exp` are used, respectively.

Returns
-------
estimated : DataFrame
    A new dataframe with two new columns: "estimated" and "final".
    The "estimated" column contains of the values inferred from the
    best-fit line. The "final" column contains the estimated values
    only where the original observations were censored, and the original
    observations everwhere else.
)r   r   )argsNr]   r`   r   r   )rX   r   rT   r^   rO   rm   r   r   rw   ppfr   )r   r   r   r   r   r;   modeleds          r   _do_rosr     s    J *MD *MG(/(=(C(CDZbfah(C(iGKK$$%,W6GTGKK6	!3G!NGKK: %

wz/B CGKK97*MRRr   rA   g?g      ?Tc	                     Uc  [         R                  " XS.5      nSn SnUR                  S   n	X!   R                  [        5      R                  5       n
X-
  nX-  nU
S:X  a*  X U/   R                  5       nX    UR                  SS2S4'   O\X:  d  X:  aE  X U/   R                  5       nX    UR                  SS2S4'   UR                  X!   S4==   U-  ss'   O[        X XU5      nU(       a  US   R                  nU$ )ai  
Impute censored dataset using Regression on Order Statistics (ROS).

Method described in *Nondetects and Data Analysis* by Dennis R.
Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
values of a dataset. When there is insufficient non-censorded data,
simple substitution is used.

Parameters
----------
observations : str or array-like
    Label of the column or the float array of censored observations

censorship : str
    Label of the column or the bool array of the censorship
    status of the observations.

      * True if censored,
      * False if uncensored

df : DataFrame, optional
    If `observations` and `censorship` are labels, this is the
    DataFrame that contains those columns.

min_uncensored : int (default is 2)
    The minimum number of uncensored values required before ROS
    can be used to impute the censored observations. When this
    criterion is not met, simple substituion is used instead.

max_fraction_censored : float (default is 0.8)
    The maximum fraction of censored data below which ROS can be
    used to impute the censored observations. When this fraction is
    exceeded, simple substituion is used instead.

substitution_fraction : float (default is 0.5)
    The fraction of the detection limit to be used during simple
    substitution of the censored values.

transform_in : callable (default is np.log)
    Transformation to be applied to the values prior to fitting a
    line to the plotting positions vs. uncensored values.

transform_out : callable (default is np.exp)
    Transformation to be applied to the imputed censored values
    estimated from the previously computed best-fit line.

as_array : bool (default is True)
    When True, a numpy array of the imputed observations is
    returned. Otherwise, a modified copy of the original dataframe
    with all of the intermediate calculations is returned.

Returns
-------
imputed : {ndarray, DataFrame}
    The final observations where the censored values have either been
    imputed through ROS or substituted as a fraction of the
    detection limit.

Notes
-----
This function requires pandas 0.14 or more recent.
N)r[   cenr[   r   r   r   )
r   rS   r   astypeintr-   rh   rO   r   values)r   r   r   min_uncensoredmax_fraction_censoredsubstitution_fractionr   r   as_arrayN_observations
N_censoredN_uncensoredfraction_censoredoutputs                 r   
impute_rosr     s   H 
z\\,BC
 XXa[N&&s+//1J!.L"3
 Q:./446!#!1

1g: 
'->-V:./446!#!1

1g:

2>7*+/DD+
 :]S ''Mr   )F)__doc__r
   numpyr9   pandasr   scipyr   r   rX   r^   rm   rs   r{   r   r   r   logexpr    r   r   <module>r      sr       *GZDND#L(0V "$N4n/Sd -1%(FF"&&hr   