
    >h                         S r SSKrSSKJr  SSKJr   " S S5      rSS jrS r	SS	 jr
S
 rS r " S S5      r    SS jr SS jrSS jr SS jr      SS jrSS jrg)a/  Tests and descriptive statistics with weights


Created on 2010-09-18

Author: josef-pktd
License: BSD (3-clause)


References
----------
SPSS manual
SAS manual

This follows in large parts the SPSS manual, which is largely the same as
the SAS manual with different, simpler notation.

Freq, Weight in SAS seems redundant since they always show up as product, SPSS
has only weights.

Notes
-----

This has potential problems with ddof, I started to follow numpy with ddof=0
by default and users can change it, but this might still mess up the t-tests,
since the estimates for the standard deviation will be based on the ddof that
the user chooses.
- fixed ddof for the meandiff ttest, now matches scipy.stats.ttest_ind

Note: scipy has now a separate, pooled variance option in ttest, but I have not
compared yet.

    N)stats)cache_readonlyc                   J   \ rS rSrSrSS jr\S 5       r\S 5       r\S 5       r	\S 5       r
\S	 5       r\S
 5       rSS jrSS jr\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rS S jrS rS!S jrS!S jrS"S jrS rS"S jrS rS#S jrS rSrg)$DescrStatsW)   a  
Descriptive statistics and tests with weights for case weights

Assumes that the data is 1d or 2d with (nobs, nvars) observations in rows,
variables in columns, and that the same weight applies to each column.

If degrees of freedom correction is used, then weights should add up to the
number of observations. ttest also assumes that the sum of weights
corresponds to the sample size.

This is essentially the same as replicating each observations by its
weight, if the weights are integers, often called case or frequency weights.

Parameters
----------
data : array_like, 1-D or 2-D
    dataset
weights : None or 1-D ndarray
    weights for each observation, with same length as zero axis of data
ddof : int
    default ddof=0, degrees of freedom correction used for second moments,
    var, std, cov, corrcoef.
    However, statistical tests are independent of `ddof`, based on the
    standard formulas.

Examples
--------

>>> import numpy as np
>>> np.random.seed(0)
>>> x1_2d = 1.0 + np.random.randn(20, 3)
>>> w1 = np.random.randint(1, 4, 20)
>>> d1 = DescrStatsW(x1_2d, weights=w1)
>>> d1.mean
array([ 1.42739844,  1.23174284,  1.083753  ])
>>> d1.var
array([ 0.94855633,  0.52074626,  1.12309325])
>>> d1.std_mean
array([ 0.14682676,  0.10878944,  0.15976497])

>>> tstat, pval, df = d1.ttest_mean(0)
>>> tstat; pval; df
array([  9.72165021,  11.32226471,   6.78342055])
array([  1.58414212e-12,   1.26536887e-14,   2.37623126e-08])
44.0

>>> tstat, pval, df = d1.ttest_mean([0, 1, 1])
>>> tstat; pval; df
array([ 9.72165021,  2.13019609,  0.52422632])
array([  1.58414212e-12,   3.87842808e-02,   6.02752170e-01])
44.0

# if weights are integers, then asrepeats can be used

>>> x1r = d1.asrepeats()
>>> x1r.shape
...
>>> stats.ttest_1samp(x1r, [0, 1, 1])
...

Nc                    [         R                  " U5      U l        Uc3  [         R                  " U R                  R                  S   5      U l        O[         R                  " U5      R                  [        5      U l        [        U R
                  R                  5      S:  a8  [        U R
                  5      S:  a  U R
                  R                  5       U l        X0l
        g )Nr      )npasarraydataonesshapeweightsastypefloatlensqueezeddof)selfr   r   r   s       pC:\Users\julio\OneDrive\Documentos\Trabajo\Ideas Frescas\venv\Lib\site-packages\statsmodels/stats/weightstats.py__init__DescrStatsW.__init__h   s    JJt$	?77499??1#56DL::g.55e<DL4<<%%&*s4<</@1/D#||335	    c                 8    U R                   R                  S5      $ )zSum of weightsr   )r   sumr   s    r   sum_weightsDescrStatsW.sum_weightst   s     ||""r   c                     U R                   $ )zHalias for number of observations/cases, equal to sum of weights
        )r   r   s    r   nobsDescrStatsW.nobsy   s     r   c                 l    [         R                  " U R                  R                  U R                  5      $ )zweighted sum of data)r
   dotr   Tr   r   s    r   r   DescrStatsW.sum   s!     vvdiikk4<<00r   c                 4    U R                   U R                  -  $ )zweighted mean of data)r   r   r   s    r   meanDescrStatsW.mean   s     xx$****r   c                 4    U R                   U R                  -
  $ )z"data with weighted mean subtracted)r   r'   r   s    r   demeanedDescrStatsW.demeaned   s     yy499$$r   c                 r    [         R                  " U R                  S-  R                  U R                  5      $ )z(weighted sum of squares of demeaned data   )r
   r#   r*   r$   r   r   s    r   
sumsquaresDescrStatsW.sumsquares   s(     vvt}}),,dll;;r   c                 :    U R                   U R                  U-
  -  $ )zvariance of data given ddof

Parameters
----------
ddof : int, float
    degrees of freedom correction, independent of attribute ddof

Returns
-------
var : float, ndarray
    variance with denominator ``sum_weights - ddof``
r.   r   r   r   s     r   var_ddofDescrStatsW.var_ddof   s     $"2"2T"9::r   c                 H    [         R                  " U R                  US95      $ )zstandard deviation of data with given ddof

Parameters
----------
ddof : int, float
    degrees of freedom correction, independent of attribute ddof

Returns
-------
std : float, ndarray
    standard deviation with denominator ``sum_weights - ddof``
)r   )r
   sqrtr3   r2   s     r   std_ddofDescrStatsW.std_ddof   s     wwt}}$}/00r   c                 N    U R                   U R                  U R                  -
  -  $ )z<variance with default degrees of freedom correction
        )r.   r   r   r   s    r   varDescrStatsW.var   s"     $"2"2TYY">??r   c                 4    U R                   U R                  -  $ )z`variance without degrees of freedom correction

used for statistical tests with controlled ddof
r1   r   s    r   _varDescrStatsW._var   s     !1!111r   c                 B    [         R                  " U R                  5      $ )zFstandard deviation with default degrees of freedom correction
        )r
   r6   r:   r   s    r   stdDescrStatsW.std   s     wwtxx  r   c                     [         R                  " U R                  U R                  R                  -  U R                  5      nXR
                  U R                  -
  -  nU$ )z~weighted covariance of data if data is 2 dimensional

assumes variables in columns and observations in rows
uses default ddof
)r
   r#   r   r*   r$   r   r   )r   cov_s     r   covDescrStatsW.cov   sE     vvdllT]]__4dmmD  499,,r   c                 \    U R                   U R                  -  U R                  SS2S4   -  $ )z^weighted correlation with default ddof

assumes variables in columns and observations in rows
N)rD   r@   r   s    r   corrcoefDescrStatsW.corrcoef   s)     xx$(("TXXag%666r   c                     U R                   nU R                  S:w  a=  U[        R                  " U R                  U R                  -
  U R                  -  5      -  nU[        R                  " U R                  S-
  5      -  $ )z,standard deviation of weighted mean
        r   r	   )r@   r   r
   r6   r   )r   r@   s     r   std_meanDescrStatsW.std_mean   sk     hh99>!!DII-1A1AA C RWWT--1222r   c                 n   SSK n[        R                  " U5      n[        R                  " U5      nU R                  R
                  S:X  a3  U R                  U R                  U5      nU(       a  UR                  XAS9nO/ nU R                  R                   H#  nUR                  U R                  XQ5      5        M%     [        R                  " U5      nU(       a>  [        UR                  S   5       Vs/ s H  nSUS-   -  PM     nnUR                  XGUS9nU(       a  SUR                  l        U$ s  snf )a  
Compute quantiles for a weighted sample.

Parameters
----------
probs : array_like
    A vector of probability points at which to calculate the
    quantiles.  Each element of `probs` should fall in [0, 1].
return_pandas : bool
    If True, return value is a Pandas DataFrame or Series.
    Otherwise returns a ndarray.

Returns
-------
quantiles : Series, DataFrame, or ndarray
    If `return_pandas` = True, returns one of the following:
      * data are 1d, `return_pandas` = True: a Series indexed by
        the probability points.
      * data are 2d, `return_pandas` = True: a DataFrame with
        the probability points as row index and the variables
        as column index.

    If `return_pandas` = False, returns an ndarray containing the
    same values as the Series/DataFrame.

Notes
-----
To compute the quantiles, first, the weights are summed over
exact ties yielding distinct data values y_1 < y_2 < ..., and
corresponding weights w_1, w_2, ....  Let s_j denote the sum
of the first j weights, and let W denote the sum of all the
weights.  For a probability point p, if pW falls strictly
between s_j and s_{j+1} then the estimated quantile is
y_{j+1}.  If pW = s_j then the estimated quantile is (y_j +
y_{j+1})/2.  If pW < p_1 then the estimated quantile is y_1.

References
----------
SAS documentation for weighted quantiles:

https://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_univariate_sect028.htm
r   Nr	   indexzcol%d)r   columnsrN   p)pandasr
   r   
atleast_1dr   ndim	_quantileSeriesr$   appendcolumn_stackranger   	DataFramerN   name)r   probsreturn_pandaspdrsltvecjrO   s           r   quantileDescrStatsW.quantile   s    X 	

5!e$99>>Q>>$))U3Dyyy3Dyy{{DNN367 #??4(D6;DJJqM6JK6J7a!e,6JK||e|L!DJJO Ls   4D2c                 v   SS K nUR                  [        R                  " [	        U R
                  5      5      S9nU R
                  US'   XS'   UR                  S5      R                  S5      nUR                  S S 2S4   n[        R                  " UR                  5      n[        R                  " U5      nUS   n	X)-  n
[        R                  " X5      nX{   n[        R                  " [        R                  " XU   -
  5      S:  5      nXU   [	        U5      S-
  :     nX{U      X{U   S-      -   S	-  X'   U$ )
Nr   rM   r   r_   r   g|=r	   r-   )rQ   rY   r
   aranger   r   groupbyaggvaluesr   rN   cumsumsearchsortedflatnonzeroabs)r   r_   r[   r]   dfdfgr   rh   cweightstotwttargetsiir^   jjs                 r   rT   DescrStatsW._quantile)  s   
 	 \\		#dll*; <\=95	jj##E***QT"CII&99W%-__X/z ^^BFF7b\#9:UBC2X**+b6NVrFQJ%771<r   c                 h    U R                   S-
  n[        U R                  U R                  X1U5      nU$ )a  two-sided confidence interval for weighted mean of data

If the data is 2d, then these are separate confidence intervals
for each column.

Parameters
----------
alpha : float
    significance level for the confidence interval, coverage is
    ``1-alpha``
alternative : str
    This specifies the alternative hypothesis for the test that
    corresponds to the confidence interval.
    The alternative hypothesis, H1, has to be one of the following

      'two-sided': H1: mean not equal to value (default)
      'larger' :   H1: mean larger than value
      'smaller' :  H1: mean smaller than value

Returns
-------
lower, upper : floats or ndarrays
    lower and upper bound of confidence interval

Notes
-----
In a previous version, statsmodels 0.4, alpha was the confidence
level, e.g. 0.95
r	   )r   _tconfint_genericr'   rJ   )r   alphaalternativedofcis        r   tconfint_meanDescrStatsW.tconfint_meanF  s6    > "IIt}}c+
 	r   c                 D    [        U R                  U R                  X5      $ )aO  two-sided confidence interval for weighted mean of data

Confidence interval is based on normal distribution.
If the data is 2d, then these are separate confidence intervals
for each column.

Parameters
----------
alpha : float
    significance level for the confidence interval, coverage is
    ``1-alpha``
alternative : str
    This specifies the alternative hypothesis for the test that
    corresponds to the confidence interval.
    The alternative hypothesis, H1, has to be one of the following

      'two-sided': H1: mean not equal to value (default)
      'larger' :   H1: mean larger than value
      'smaller' :  H1: mean smaller than value

Returns
-------
lower, upper : floats or ndarrays
    lower and upper bound of confidence interval

Notes
-----
In a previous version, statsmodels 0.4, alpha was the confidence
level, e.g. 0.95
)_zconfint_genericr'   rJ   )r   rw   rx   s      r   zconfint_meanDescrStatsW.zconfint_meank  s    @ !DMM5NNr   c                    U R                   U-
  U R                  -  nU R                  S-
  nUS:X  a8  [        R                  R                  [        R                  " U5      U5      S-  nOWUS:X  a   [        R                  R                  X45      nO1US:X  a   [        R                  R                  X45      nO[        S5      eX5U4$ )a  ttest of Null hypothesis that mean is equal to value.

The alternative hypothesis H1 is defined by the following

- 'two-sided': H1: mean not equal to value
- 'larger' :   H1: mean larger than value
- 'smaller' :  H1: mean smaller than value

Parameters
----------
value : float or array
    the hypothesized value for the mean
alternative : str
    The alternative hypothesis, H1, has to be one of the following:

      - 'two-sided': H1: mean not equal to value (default)
      - 'larger' :   H1: mean larger than value
      - 'smaller' :  H1: mean smaller than value

Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the t-test
df : int or float

r	   	two-sidedr-   largersmallerzalternative not recognized)
r'   rJ   r   r   tsfr
   rl   cdf
ValueError)r   valuerx   tstatry   pvalues         r   
ttest_meanDescrStatsW.ttest_mean  s    < U"dmm3"+%WWZZus3a7FH$WWZZ+FI%WW[[,F9::c!!r   c                     U R                  USS9u  p4nU R                  USS9u  pgn[        R                  " XG5      X4U4XgU44$ )a
  test of (non-)equivalence of one sample

TOST: two one-sided t tests

null hypothesis:  m < low or m > upp
alternative hypothesis:  low < m < upp

where m is the expected value of the sample (mean of the population).

If the pvalue is smaller than a threshold, say 0.05, then we reject the
hypothesis that the expected value of the sample (mean of the
population) is outside of the interval given by thresholds low and upp.

Parameters
----------
low, upp : float
    equivalence interval low < mean < upp

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1, df1 : tuple
    test statistic, pvalue and degrees of freedom for lower threshold
    test
t2, pv2, df2 : tuple
    test statistic, pvalue and degrees of freedom for upper threshold
    test

r   rx   r   )r   r
   maximum)	r   lowuppt1pv1df1t2pv2df2s	            r   
ttost_meanDescrStatsW.ttost_mean  sO    @ sAs	Bzz##bs^bs^CCr   c                 Z   U R                   U-
  U R                  -  nUS:X  a9  [        R                  R	                  [
        R                  " U5      5      S-  nX44$ US:X  a"  [        R                  R	                  U5      nX44$ US:X  a  [        R                  R                  U5      nUW4$ )a*  z-test of Null hypothesis that mean is equal to value.

The alternative hypothesis H1 is defined by the following
'two-sided': H1: mean not equal to value
'larger' :   H1: mean larger than value
'smaller' :  H1: mean smaller than value

Parameters
----------
value : float or array
    the hypothesized value for the mean
alternative : str
    The alternative hypothesis, H1, has to be one of the following

      'two-sided': H1: mean not equal to value (default)
      'larger' :   H1: mean larger than value
      'smaller' :  H1: mean smaller than value

Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the t-test

Notes
-----
This uses the same degrees of freedom correction as the t-test in the
calculation of the standard error of the mean, i.e it uses
`(sum_weights - 1)` instead of `sum_weights` in the denominator.
See Examples below for the difference.

Examples
--------

z-test on a proportion, with 20 observations, 15 of those are our event

>>> import statsmodels.api as sm
>>> x1 = [0, 1]
>>> w1 = [5, 15]
>>> d1 = sm.stats.DescrStatsW(x1, w1)
>>> d1.ztest_mean(0.5)
(2.5166114784235836, 0.011848940928347452)

This differs from the proportions_ztest because of the degrees of
freedom correction:
>>> sm.stats.proportions_ztest(15, 20.0, value=0.5)
(2.5819888974716112, 0.009823274507519247).

We can replicate the results from ``proportions_ztest`` if we increase
the weights to have artificially one more observation:

>>> sm.stats.DescrStatsW(x1, np.array(w1)*21./20).ztest_mean(0.5)
(2.5819888974716116, 0.0098232745075192366)
r   r-   r   r   )r'   rJ   r   normr   r
   rl   r   )r   r   rx   r   r   s        r   
ztest_meanDescrStatsW.ztest_mean  s    p U"dmm3+%ZZ]]266%=1A5F } H$ZZ]]5)F } I%ZZ^^E*Ff}r   c                     U R                  USS9u  p4U R                  USS9u  pV[        R                  " XF5      X44XV44$ )a  test of (non-)equivalence of one sample, based on z-test

TOST: two one-sided z-tests

null hypothesis:  m < low or m > upp
alternative hypothesis:  low < m < upp

where m is the expected value of the sample (mean of the population).

If the pvalue is smaller than a threshold, say 0.05, then we reject the
hypothesis that the expected value of the sample (mean of the
population) is outside of the interval given by thresholds low and upp.

Parameters
----------
low, upp : float
    equivalence interval low < mean < upp

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1 : tuple
    test statistic and p-value for lower threshold test
t2, pv2 : tuple
    test statistic and p-value for upper threshold test

r   r   r   )r   r
   r   )r   r   r   r   r   r   r   s          r   
ztost_meanDescrStatsW.ztost_mean   sF    < //#8/<//#9/=zz##bY	99r   c                 h    [        XR                  5      (       d  [        X5      nOUn[        X5      $ )a  return an instance of CompareMeans with self and other

Parameters
----------
other : array_like or instance of DescrStatsW
    If array_like then this creates an instance of DescrStatsW with
    the given weights.
weights : None or array
    weights are only used if other is not an instance of DescrStatsW

Returns
-------
cm : instance of CompareMeans
    the instance has self attached as d1 and other as d2.

See Also
--------
CompareMeans

)
isinstance	__class__r   CompareMeans)r   otherr   d2s       r   get_compareDescrStatsW.get_compareB  s-    * %00U,BBD%%r   c                     [         R                  " U R                  5      R                  [        5      n[         R
                  " U R                  USS9$ )z\get array that has repeats given by floor(weights)

observations with weight=0 are dropped

r   )axis)r
   floorr   r   intrepeatr   )r   w_ints     r   	asrepeatsDescrStatsW.asrepeats]  s8     &--c2yyE22r   )r   r   r   )Nr   r   )T)皙?r   )r   r   )N) __name__
__module____qualname____firstlineno____doc__r   r   r   r    r   r'   r*   r.   r3   r7   r:   r=   r@   rD   rG   rJ   ra   rT   r{   r   r   r   r   r   r   r   __static_attributes__ r   r   r   r   )   sF   <|
 # #    
 1 1 + + % % < <
;1 @ @
 2 2 ! !
   7 7 
3 
3AF:#J OD*"X"DHAF :D&63r   r   c                 L   X-
  U-
  U-  nUS;   a:  [         R                  R                  [        R                  " U5      U5      S-  nXg4$ US;   a"  [         R                  R                  Xc5      nXg4$ US;   a"  [         R                  R                  Xc5      nXg4$ [        S5      e)a  generic ttest based on summary statistic

The test statistic is :
    tstat = (value1 - value2 - diff) / std_diff

and is assumed to be t-distributed with ``dof`` degrees of freedom.

Parameters
----------
value1 : float or ndarray
    Value, for example mean, of the first sample.
value2 : float or ndarray
    Value, for example mean, of the second sample.
std_diff : float or ndarray
    Standard error of the difference value1 - value2
dof : int or float
    Degrees of freedom
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

diff : float
    value of difference ``value1 - value2`` under the null hypothesis

Returns
-------
tstat : float or ndarray
    Test statistic.
pvalue : float or ndarray
    P-value of the hypothesis test assuming that the test statistic is
    t-distributed with ``df`` degrees of freedom.
r   z2-sided2sr-   r   lr   sinvalid alternative)r   r   r   r
   rl   r   r   )value1value2std_diffry   rx   diffr   r   s           r   _tstat_genericr   g  s    J _t#x/E44BFF5M3/!3 = 
	'E'
 =	 
(	(U( = .//r   c                    US;   a7  [         R                  R                  SUS-  -
  U5      nXU-  -
  nXU-  -   nXg4$ US;   a9  [         R                  R                  X25      nXU-  -   n[        R                  nXg4$ US;   a>  [         R                  R                  SU-
  U5      n[        R                  * nXU-  -   nXg4$ [        S5      e)ac  generic t-confint based on summary statistic

Parameters
----------
mean : float or ndarray
    Value, for example mean, of the first sample.
std_mean : float or ndarray
    Standard error of the difference value1 - value2
dof : int or float
    Degrees of freedom
alpha : float
    Significance level for the confidence interval, coverage is
    ``1-alpha``.
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

Returns
-------
lower : float or ndarray
    Lower confidence limit. This is -inf for the one-sided alternative
    "smaller".
upper : float or ndarray
    Upper confidence limit. This is inf for the one-sided alternative
    "larger".
r   r	          @r   r   r   )r   r   ppfr
   infr   )r'   rJ   ry   rw   rx   tcritloweruppers           r   rv   rv     s    > 44AOS1x''x'' < 
	'E'x'' < 
(	(AIs+x'' < .//r   c                 J   X-
  U-
  U-  nUS;   a9  [         R                  R                  [        R                  " U5      5      S-  nXV4$ US;   a"  [         R                  R                  U5      nXV4$ US;   a"  [         R                  R                  U5      nXV4$ [        S5      e)a  generic (normal) z-test based on summary statistic

The test statistic is :
    tstat = (value1 - value2 - diff) / std_diff

and is assumed to be normally distributed.

Parameters
----------
value1 : float or ndarray
    Value, for example mean, of the first sample.
value2 : float or ndarray
    Value, for example mean, of the second sample.
std_diff : float or ndarray
    Standard error of the difference value1 - value2
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

diff : float
    value of difference ``value1 - value2`` under the null hypothesis

Returns
-------
tstat : float or ndarray
    Test statistic.
pvalue : float or ndarray
    P-value of the hypothesis test assuming that the test statistic is
    t-distributed with ``df`` degrees of freedom.
r   r-   r   r   r   r   r   r   r
   rl   r   r   )r   r   r   rx   r   zstatr   s          r   _zstat_genericr     s    F _t#x/E44rvve}-1 = 
	'u%
 =	 
(	(& = .//r   c                 >   X-  nUS;   a9  [         R                  R                  [        R                  " U5      5      S-  nX44$ US;   a"  [         R                  R                  U5      nX44$ US;   a"  [         R                  R                  U5      nX44$ [        S5      e)ap  generic (normal) z-test based on summary statistic

The test statistic is :
    zstat = value / std

and is assumed to be normally distributed with standard deviation ``std``.

Parameters
----------
value : float or ndarray
    Value of a sample statistic, for example mean.
value2 : float or ndarray
    Value, for example mean, of the second sample.
std : float or ndarray
    Standard error of the sample statistic value.
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

Returns
-------
zstat : float or ndarray
    Test statistic.
pvalue : float or ndarray
    P-value of the hypothesis test assuming that the test statistic is
    normally distributed.
r   r-   r   r   r   r   )r   r@   rx   r   r   s        r   _zstat_generic2r     s    @ KE44rvve}-1 = 
	'u%
 =	 
(	(& = .//r   c                    US;   a6  [         R                  R                  SUS-  -
  5      nXU-  -
  nXU-  -   nXV4$ US;   a9  [         R                  R                  U5      nXU-  -   n[        R                  nXV4$ US;   a=  [         R                  R                  SU-
  5      n[        R                  * nXU-  -   nXV4$ [        S5      e)a=  generic normal-confint based on summary statistic

Parameters
----------
mean : float or ndarray
    Value, for example mean, of the first sample.
std_mean : float or ndarray
    Standard error of the difference value1 - value2
alpha : float
    Significance level for the confidence interval, coverage is
    ``1-alpha``
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

Returns
-------
lower : float or ndarray
    Lower confidence limit. This is -inf for the one-sided alternative
    "smaller".
upper : float or ndarray
    Upper confidence limit. This is inf for the one-sided alternative
    "larger".
r   r	   r   r   r   r   )r   r   r   r
   r   r   )r'   rJ   rw   rx   zcritr   r   s          r   r~   r~   $  s    : 44

q53;/x''x'' < 
	'

u%x'' < 
(	(

q5y)x'' < .//r   c                       \ rS rSrSrS r\ SS j5       rSS jr\	S 5       r
\	S 5       rS	 rSS
 jrSS jr SS jr SS jrSS jrSS jrSrg)r   iS  a  class for two sample comparison

The tests and the confidence interval work for multi-endpoint comparison:
If d1 and d2 have the same number of rows, then each column of the data
in d1 is compared with the corresponding column in d2.

Parameters
----------
d1, d2 : instances of DescrStatsW

Notes
-----
The result for the statistical tests and the confidence interval are
independent of the user specified ddof.

TODO: Extend to any number of groups or write a version that works in that
case, like in SAS and SPSS.

c                     Xl         X l        g)z4assume d1, d2 hold the relevant attributes

        Nd1r   r   r   r   s      r   r   CompareMeans.__init__h  s     r   Nc           	      4    U " [        XUS9[        X$US95      $ )a  construct a CompareMeans object from data

Parameters
----------
data1, data2 : array_like, 1-D or 2-D
    compared datasets
weights1, weights2 : None or 1-D ndarray
    weights for each observation of data1 and data2 respectively,
    with same length as zero axis of corresponding dataset.
ddof1, ddof2 : int
    default ddof1=0, ddof2=0, degrees of freedom for data1,
    data2 respectively.

Returns
-------
A CompareMeans instance.

r   r   )r   )clsdata1data2weights1weights2ddof1ddof2s          r   	from_dataCompareMeans.from_datat  s%    , e<e<
 	
r   c           	         U R                   nU R                  nSUS-  -
  nU(       a$  U R                  X4S9u  pn
U R                  X#S9u  pO"U R	                  X4S9u  pU R                  X#S9u  pUS:X  a  U R                  nOU R                  n[        R                  " U5      n[        R                  " U5      n[        R                  " U	5      n	[        R                  " U5      n[        R                  " U5      n[        R                  " X45      n[        R                  " UR                  UR                  -
  U-
  5      nSnSn[        UR                  S   5       Vs/ s H  nSUS	-   -  PM     nnSS
KJn  U" SXXU4UUUUUS9$ s  snf )a  summarize the results of the hypothesis test

Parameters
----------
use_t : bool, optional
    if use_t is True, then t test results are returned
    if use_t is False, then z test results are returned
alpha : float
    significance level for the confidence interval, coverage is
    ``1-alpha``
usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is
    assumed to be the same. If ``unequal``, then the variance of
    Welch ttest will be used, and the degrees of freedom are those
    of Satterthwaite if ``use_t`` is True.
value : float
    difference between the means under the Null hypothesis.

Returns
-------
smry : SimpleTable

d   )usevarr   )rw   r   pooledzTest for equality of meansyr   z
subset #%dr	   )summary_paramsN)rw   use_tynamexnametitle)r   r   	ttest_indtconfint_diff	ztest_indzconfint_diffstd_meandiff_pooledvarstd_meandiff_separatevarr
   rR   rW   r'   rX   r   statsmodels.iolib.summaryr   )r   r   rw   r   r   r   r   confint_percentsr   r   _r   r   std_errconf_intparamsr   r   rr   r   r   s                        r   summaryCompareMeans.summary  sv   2 WWWW,#~~V~IE1--E-ILE5 NN&NFME--E-ILEX11G33G--(e$v&e$e$??E>2rww0589,38Q3HI3HRa(3HI<6E8<
 	
	 Js    F	c                     U R                   nU R                  n[        R                  " UR                  UR
                  S-
  -  UR                  UR
                  S-
  -  -   5      $ )Nr	   )r   r   r
   r6   r=   r    r   s      r   r   %CompareMeans.std_meandiff_separatevar  sL     WWWWwwrww"''A+.BGGaK1HHIIr   c                    U R                   nU R                  nUR                  UR                  -   UR                  S-
  UR                  -   S-
  -  n[        R
                  " USUR                  -  SUR                  -  -   -  5      $ )z<variance assuming equal variance in both data sets

        r	         ?)r   r   r.   r    r
   r6   )r   r   r   
var_pooleds       r   r   #CompareMeans.std_meandiff_pooledvar  sx     WWWW ]]R]]* WWq[277"Q&( 	 wwzS277]S277]%BCDDr   c                    U R                   nU R                  nUR                  UR                  S-
  -  nUR                  UR                  S-
  -  nX4-   nX5-  S-  UR                  S-
  -  nXE-  S-  UR                  S-
  -  nSXg-   -  nU$ )zAdegrees of freedom of Satterthwaite for unequal variance
        r	   r-   r  )r   r   r=   r    )	r   r   r   sem1sem2semsumz1z2ry   s	            r   dof_sattCompareMeans.dof_satt  s     WWWW ww"''A+&ww"''A+&m!RWWq[1m!RWWq[1RWo
r   c           	      >   U R                   nU R                  nUS:X  a,  U R                  nUR                  S-
  UR                  -   S-
  nO.US:X  a  U R                  nU R                  5       nO[        S5      e[        UR                  UR                  XgXS9u  pXU4$ )aV  ttest for the null hypothesis of identical means

this should also be the same as onewaygls, except for ddof differences

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples, see notes for 2-D case
x2 : array_like, 1-D or 2-D
    second of the two independent samples, see notes for 2-D case
alternative : str
    The alternative hypothesis, H1, has to be one of the following
    'two-sided': H1: difference in means not equal to value (default)
    'larger' :   H1: difference in means larger than value
    'smaller' :  H1: difference in means smaller than value

usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used
value : float
    difference between the means under the Null hypothesis.


Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the t-test
df : int or float
    degrees of freedom used in the t-test

Notes
-----
The result is independent of the user specified ddof.

r   r	   unequal(usevar can only be "pooled" or "unequal"r   )	r   r   r   r    r   r  r   r   r'   )
r   rx   r   r   r   r   stdmry   r   pvals
             r   r   CompareMeans.ttest_ind  s    N WWWWX..D''A+'!+Cy 00D--/CGHH$GGRWWd
 Cr   c                     U R                   nU R                  nUS:X  a  U R                  nOUS:X  a  U R                  nO[	        S5      e[        UR                  UR                  XaUS9u  pxXx4$ )a  z-test for the null hypothesis of identical means

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples, see notes for 2-D case
x2 : array_like, 1-D or 2-D
    second of the two independent samples, see notes for 2-D case
alternative : str
    The alternative hypothesis, H1, has to be one of the following
    'two-sided': H1: difference in means not equal to value (default)
    'larger' :   H1: difference in means larger than value
    'smaller' :  H1: difference in means smaller than value

usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then the standard deviations of the samples may
    be different.
value : float
    difference between the means under the Null hypothesis.

Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the z-test

r   r  r  r  )r   r   r   r   r   r   r'   )	r   rx   r   r   r   r   r  r   r  s	            r   r   CompareMeans.ztest_ind1  so    < WWWWX..Dy 00DGHH$GGRWWde
 {r   c                 >   U R                   nU R                  nUR                  UR                  -
  nUS:X  a,  U R                  nUR                  S-
  UR                  -   S-
  nO.US:X  a  U R
                  nU R                  5       nO[        S5      e[        XgXUS9n	U	$ )  confidence interval for the difference in means

Parameters
----------
alpha : float
    significance level for the confidence interval, coverage is
    ``1-alpha``
alternative : str
    This specifies the alternative hypothesis for the test that
    corresponds to the confidence interval.
    The alternative hypothesis, H1, has to be one of the following :

    'two-sided': H1: difference in means not equal to value (default)
    'larger' :   H1: difference in means larger than value
    'smaller' :  H1: difference in means smaller than value

usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used

Returns
-------
lower, upper : floats
    lower and upper limits of the confidence interval

Notes
-----
The result is independent of the user specified ddof.

r   r	   r  r  rw   rx   )	r   r   r'   r   r    r   r  r   rv   )
r   rw   rx   r   r   r   r   r   ry   ress
             r   r   CompareMeans.tconfint_diff_  s    D WWWWww X22H''A+'!+Cy 44H--/CGHHC+
 
r   c                     U R                   nU R                  nUR                  UR                  -
  nUS:X  a  U R                  nOUS:X  a  U R                  nO[        S5      e[        XgXS9nU$ )r  r   r  r  r  )r   r   r'   r   r   r   r~   )	r   rw   rx   r   r   r   r   r   r  s	            r   r   CompareMeans.zconfint_diff  sq    D WWWWww X22Hy 44HGHH%
 
r   c                     U R                  SX1S9nU R                  SX2S9n[        R                  " US   US   5      XE44$ )aR  
test of equivalence for two independent samples, base on t-test

Parameters
----------
low, upp : float
    equivalence interval low < m1 - m2 < upp
usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1 : tuple of floats
    test statistic and pvalue for lower threshold test
t2, pv2 : tuple of floats
    test statistic and pvalue for upper threshold test
r   rx   r   r   r   r	   )r   r
   r   r   r   r   r   tt1tt2s         r   	ttost_indCompareMeans.ttost_ind  sH    , nn&nLnn6nMzz#a&#a&)C:55r   c                     U R                  SX1S9nU R                  SX2S9n[        R                  " US   US   5      XE4$ )aS  
test of equivalence for two independent samples, based on z-test

Parameters
----------
low, upp : float
    equivalence interval low < m1 - m2 < upp
usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1 : tuple of floats
    test statistic and pvalue for lower threshold test
t2, pv2 : tuple of floats
    test statistic and pvalue for upper threshold test
r   r  r   r	   )r   r
   r   r   s         r   	ztost_indCompareMeans.ztost_ind  sF    , nn&nLnn6nMzz#a&#a&)333r   r   )NNr   r   )Tr   r   r   )r   r   r   )r   r   r   )r   )r   r   r   r   r   r   classmethodr   r   r   r   r   r  r   r   r   r   r#  r&  r   r   r   r   r   r   S  s    ( HI
 
4?
B J J E E"7 r,^ ;C1h ;C/b664r   r   NNc           	      t    [        [        XS   SS9[        XS   SS95      nUR                  X#US9u  pxn	XxU	4$ )a  ttest independent sample

Convenience function that uses the classes and throws away the intermediate
results,
compared to scipy stats: drops axis option, adds alternative, usevar, and
weights option.

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples, see notes for 2-D case
x2 : array_like, 1-D or 2-D
    second of the two independent samples, see notes for 2-D case
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' (default): H1: difference in means not equal to value
       * 'larger' :   H1: difference in means larger than value
       * 'smaller' :  H1: difference in means smaller than value

usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used
weights : tuple of None or ndarrays
    Case weights for the two samples. For details on weights see
    ``DescrStatsW``
value : float
    difference between the means under the Null hypothesis.


Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the t-test
df : int or float
    degrees of freedom used in the t-test

r   r   r	   r  )r   r   r   )
x1x2rx   r   r   r   cmr   r  ry   s
             r   r   r     sW    b 
B
3B
3
B ||e $ E r   c           	      h   U(       an  U[         R                  L a  U" U 5      n U" U5      nO:U" [         R                  " X4S5      5      nUS[        U 5       n U[        U 5      S nU" U5      nU" U5      n[	        [        XS   SS9[        XS   SS95      nUR                  X#US9u  pXS   U
S   4$ )a  test of (non-)equivalence for two independent samples

TOST: two one-sided t tests

null hypothesis:  m1 - m2 < low or m1 - m2 > upp
alternative hypothesis:  low < m1 - m2 < upp

where m1, m2 are the means, expected values of the two samples.

If the pvalue is smaller than a threshold, say 0.05, then we reject the
hypothesis that the difference between the two samples is larger than the
the thresholds given by low and upp.

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples, see notes for 2-D case
x2 : array_like, 1-D or 2-D
    second of the two independent samples, see notes for 2-D case
low, upp : float
    equivalence interval low < m1 - m2 < upp
usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then Welch ttest with Satterthwait degrees
    of freedom is used
weights : tuple of None or ndarrays
    Case weights for the two samples. For details on weights see
    ``DescrStatsW``
transform : None or function
    If None (default), then the data is not transformed. Given a function,
    sample data and thresholds are transformed. If transform is log, then
    the equivalence interval is in ratio: low < m1 / m2 < upp

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1 : tuple of floats
    test statistic and pvalue for lower threshold test
t2, pv2 : tuple of floats
    test statistic and pvalue for upper threshold test

Notes
-----
The test rejects if the 2*alpha confidence interval for the difference
is contained in the ``(low, upp)`` interval.

This test works also for multi-endpoint comparisons: If d1 and d2
have the same number of columns, then each column of the data in d1 is
compared with the corresponding column in d2. This is the same as
comparing each of the corresponding columns separately. Currently no
multi-comparison correction is used. The raw p-values reported here can
be correction with the functions in ``multitest``.

r   Nr   r	   )r   )r
   logconcatenater   r   r   r#  )r+  r,  r   r   r   r   	transformxxr-  r  r  s              r   r#  r#  C  s    v 2B2B 2>>2(A67BIc"gBCGIBnn	B
3B
3
B Sf5IDQQr   c                    U(       an  U[         R                  L a  U" U 5      n U" U5      nO:U" [         R                  " X4S5      5      nUS[        U 5       n U[        U 5      S nU" U5      nU" U5      n[	        X-
  USS9nUR                  USS9u  pn
UR                  USS9u  pn[         R                  " X5      XU
4XU44$ )a  test of (non-)equivalence for two dependent, paired sample

TOST: two one-sided t tests

null hypothesis:  md < low or md > upp
alternative hypothesis:  low < md < upp

where md is the mean, expected value of the difference x1 - x2

If the pvalue is smaller than a threshold,say 0.05, then we reject the
hypothesis that the difference between the two samples is larger than the
the thresholds given by low and upp.

Parameters
----------
x1 : array_like
    first of the two independent samples
x2 : array_like
    second of the two independent samples
low, upp : float
    equivalence interval low < mean of difference < upp
weights : None or ndarray
    case weights for the two samples. For details on weights see
    ``DescrStatsW``
transform : None or function
    If None (default), then the data is not transformed. Given a function
    sample data and thresholds are transformed. If transform is log the
    the equivalence interval is in ratio: low < x1 / x2 < upp

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1, df1 : tuple
    test statistic, pvalue and degrees of freedom for lower threshold test
t2, pv2, df2 : tuple
    test statistic, pvalue and degrees of freedom for upper threshold test

r   Nr   r   r   r   )r
   r/  r0  r   r   r   r   )r+  r,  r   r   r1  r   r2  ddr   r   r   r   r   r   s                 r   ttost_pairedr5    s    R 2B2B 2>>2(A67BIc"gBCGIBnn	RWgA	6B==(=;LBS==)=<LBS::c"3"3??r   c                    US;  a  [        S5      e[        R                  " U 5      n U R                  S   nU R	                  S5      nU R                  S5      nUb  [        R                  " U5      nUR                  S   n	UR	                  S5      n
UR                  S5      nUS:X  a%  Xh-  X-  -   nXU	-   SU-  -
  -  nUSU-  SU	-  -   -  nOUS:X  a  XU-
  -  XU-
  -  -   nO	XU-
  -  nSn
[        R                  " W5      n[        XzXUS9$ )	a  test for mean based on normal distribution, one or two samples

In the case of two samples, the samples are assumed to be independent.

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples
x2 : array_like, 1-D or 2-D
    second of the two independent samples
value : float
    In the one sample case, value is the mean of x1 under the Null
    hypothesis.
    In the two sample case, value is the difference between mean of x1 and
    mean of x2 under the Null hypothesis. The test statistic is
    `x1_mean - x2_mean - value`.
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       'two-sided': H1: difference in means not equal to value (default)
       'larger' :   H1: difference in means larger than value
       'smaller' :  H1: difference in means smaller than value

usevar : str, 'pooled' or 'unequal'
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. If ``unequal``, then the standard deviation of the sample is
    assumed to be different.
ddof : int
    Degrees of freedom use in the calculation of the variance of the mean
    estimate. In the case of comparing means this is one, however it can
    be adjusted for testing other statistics (proportion, correlation)

Returns
-------
tstat : float
    test statistic
pvalue : float
    pvalue of the t-test

Notes
-----
usevar can be pooled or unequal in two sample case

>   r   r  r  r   r   r-   r  r  r  )NotImplementedErrorr
   r   r   r'   r:   r6   r   )r+  r,  r   rx   r   r   nobs1x1_meanx1_varnobs2x2_meanx2_varr:   r   s                 r   ztestr>    s   h **!"LMM	BBHHQKEggajGVVAYF	~ZZ^''!*X.5>1C5=1t8++C3;u,,Cy DL)Fdl,CCC%wws|H'HNNr   c                    US:w  a  [        S5      e[        R                  " U 5      n U R                  S   nU R	                  S5      nU R                  S5      n	Ubl  [        R                  " U5      nUR                  S   n
UR	                  S5      nUR                  S5      nXy-  X-  -   nXU
-   SU-  -
  -  nUSU-  SU
-  -   -  nO	XU-
  -  nSn[        R                  " U5      n[        X-
  U-
  XU5      nU$ )a  confidence interval based on normal distribution z-test

Parameters
----------
x1 : array_like, 1-D or 2-D
    first of the two independent samples, see notes for 2-D case
x2 : array_like, 1-D or 2-D
    second of the two independent samples, see notes for 2-D case
value : float
    In the one sample case, value is the mean of x1 under the Null
    hypothesis.
    In the two sample case, value is the difference between mean of x1 and
    mean of x2 under the Null hypothesis. The test statistic is
    `x1_mean - x2_mean - value`.
usevar : str, 'pooled'
    Currently, only 'pooled' is implemented.
    If ``pooled``, then the standard deviation of the samples is assumed to be
    the same. see CompareMeans.ztest_ind for different options.
ddof : int
    Degrees of freedom use in the calculation of the variance of the mean
    estimate. In the case of comparing means this is one, however it can
    be adjusted for testing other statistics (proportion, correlation)

Notes
-----
checked only for 1 sample case

usevar not implemented, is always pooled in two sample case

``value`` shifts the confidence interval so it is centered at
`x1_mean - x2_mean - value`

See Also
--------
ztest
CompareMeans

r   z#only usevar="pooled" is implementedr   r-   r  )r7  r
   r   r   r'   r:   r6   r~   )r+  r,  r   rw   rx   r   r   r8  r9  r:  r;  r<  r=  r  r   rz   s                   r   zconfintr@    s    d !"GHH	BBHHQKEggajGVVAYF	~ZZ^''!*^en4
ema$h..
cEkC%K//
t|,
wwz"H	E!8K
B Ir   c           	      r    [        XSXAUS9n[        XSXBUS9n[        R                  " US   US   5      UU4$ )a  Equivalence test based on normal distribution

Parameters
----------
x1 : array_like
    one sample or first sample for 2 independent samples
low, upp : float
    equivalence interval low < m1 - m2 < upp
x1 : array_like or None
    second sample for 2 independent samples test. If None, then a
    one-sample test is performed.
usevar : str, 'pooled'
    If `pooled`, then the standard deviation of the samples is assumed to be
    the same. Only `pooled` is currently implemented.

Returns
-------
pvalue : float
    pvalue of the non-equivalence test
t1, pv1 : tuple of floats
    test statistic and pvalue for lower threshold test
t2, pv2 : tuple of floats
    test statistic and pvalue for upper threshold test

Notes
-----
checked only for 1 sample case

r   )rx   r   r   r   r   r	   )r>  r
   r   )r+  r   r   r,  r   r   r!  r"  s           r   ztostrB  j  sT    < 
HVTC 
IfdC 	

3q63q6" r   r   )r   r   r)  r   )r   r)  N)Nr   r   r   r  )Nr   r   r   r   r  )Nr   r  )r   numpyr
   scipyr   statsmodels.tools.decoratorsr   r   r   rv   r   r   r~   r   r   r#  r5  r>  r@  rB  r   r   r   <module>rF     s    D   7{3 {3|.b.b,^)X,^d4 d4n 
9z HLM `9@z JMMOd 

	HV(r   