
    ӆh\                    v   % S SK Jr  S SKrS SKJrJrJrJr  S SKJ	r	  S SK
rS SKrS SKrS SKr\(       a  S SKJr  SrS\S'   SSS jjrSSS	 jjrSSS
 jjrSSS jjrSSS jjrSSS jjr\SSS jj5       r\SS S jj5       rS!S"S jjrS!S#S jjrS!S#S jjrS$S%S jjrS$S%S jjr SS&S jjr!S'S jr"SS(S jjr#g))    )annotationsN)TYPE_CHECKINGFinalLiteraloverload)urlretrievez-https://github.com/shap/shap/raw/master/data/z
Final[str]github_data_urlc           	     d   [         S-   n[        R                  " [        U U  SU  S35      5      R	                  [        R
                  5      n[        R                  " [        U S35      5      nUb<  [        R                  R                  X1SS9n[        R                  R                  XASS9nX44$ )aG  Return a set of 50 images representative of ImageNet images.

Parameters
----------
resolution : int
    The resolution of the images. At present, the only supported value is 224.
n_points : int, optional
    Number of data points to sample. If None, the entire dataset is used.

Returns
-------
X : np.ndarray
    Represents images from ImageNet of a certain resolution.
y : np.ndarray
    The target variables, that is, the ImageNet classes.

Notes
-----
This dataset was collected by randomly finding a working ImageNet link and then pasting the
original ImageNet image into Google image search restricted to images licensed for reuse. A
similar image (now with rights to reuse) was downloaded as a rough replacement for the original
ImageNet image. The point is to have a random sample of ImageNet for use as a background
distribution for explaining models trained on ImageNet data.

Note that because the images are only rough replacements, the labels might no longer be correct.

Examples
--------
To get the processed images and labels::

    images, labels = shap.datasets.imagenet50()

imagenet50_xz.npyz
labels.csvr   random_state)
r	   nploadcacheastypefloat32loadtxtshaputilssample)
resolutionn_pointsprefixXys        `C:\Users\julio\OneDrive\Documentos\Trabajo\Ideas Frescas\venv\Lib\site-packages\shap/datasets.py
imagenet50r      s    D },FGGEVHZL*T"JKLSSTVT^T^_AJJuxz%:;<AJJa:JJa:4K    c                ,   [         R                  R                  5       n[        R                  " UR
                  UR                  S9nUR                  nU b<  [        R                  R                  X SS9n[        R                  R                  X0SS9nX#4$ )a[  Return the California housing data in a tabular format.

Used in predictive regression tasks.

Parameters
----------
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : pd.DataFrame
    The feature data.
y : np.ndarray
    The target variable.

Notes
-----
The returned feature matrix ``X`` includes the following features:

- ``MedInc`` (float): Median income in block
- ``HouseAge`` (float): Median house age in block
- ``AveRooms`` (float): Average rooms in dwelling
- ``AveBedrms`` (float): Average bedrooms in dwelling
- ``Population`` (float): Block population
- ``AveOccup`` (float): Average house occupancy
- ``Latitude`` (float): House block latitude
- ``Longitude`` (float): House block longitude

The target column represents the median house value for California districts.

References
----------
California housing dataset: :external+scikit-learn:func:`sklearn.datasets.fetch_california_housing`

Examples
--------
To get the processed data and target labels::

    data, target = shap.datasets.california()

datacolumnsr   r   )sklearndatasetsfetch_california_housingpd	DataFramer"   feature_namestargetr   r   r   r   ddfr*   s       r   
californiar.   @   sz    V 	113A	1661??	;BFZZr!<""6!"D:r   c                f   [         R                  R                  5       n[        R                  " UR
                  UR                  S9n[        R                  " UR                  UR                  S9nU b<  [        R                  R                  X SS9n[        R                  R                  X0SS9nX#4$ )a  Return the Linnerud dataset in a convenient package for multi-target regression.

Parameters
----------
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number
    of points.

Returns
-------
X : pd.DataFrame
    The feature data.
y : pd.DataFrame
    The multiclass target variables.

Notes
-----
- The Linnerud dataset contains physiological and exercise data for 20 individuals.
- The feature matrix ``X`` includes three exercise variables: ``Chins``, ``Situps``, ``Jumps``.
- The target variables ``y`` include three physiological measurements: ``Weight``, ``Waist``, ``Pulse``.

More details: :external+scikit-learn:func:`sklearn.datasets.load_linnerud`

Examples
--------
To get the feature matrix and target variables::

    features, targets = shap.datasets.linnerud()

To get a subset of the data::

    subset_features, subset_targets = shap.datasets.linnerud(n_points=100)

)r#   r   r   )r$   r%   load_linnerudr'   r(   r"   r)   r*   target_namesr   r   r   )r   r,   r   r   s       r   linnerudr2   v   s    F 	&&(A
QVVQ__5A
QXXq~~6AJJa:JJa:4Kr   c                N   [        [        [        S-   5      SS9 nUR                  5       nSSS5        [        R
                  " S[        S9nSUSS& U b=  [        R                  R                  WU SS	9n[        R                  R                  X0SS	9nWU4$ ! , (       d  f       Np= f)
a  Return the classic IMDB sentiment analysis training data in a nice package.

Used in binary text classification tasks.

Parameters
----------
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : list of strings
    Text data, where each string is a movie review.
y : np.ndarray
    The target variable. Contains booleans, where True indicates a positive sentiment and False
    indicates a negative sentiment.

Notes
-----
Full data is at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

Paper to cite when using the data is: http://www.aclweb.org/anthology/P11-1015

Examples
--------
To get the processed text data and labels::

    text_data, labels = shap.datasets.imdb()

zimdb_train.txtzutf-8)encodingNia  dtyper   i0  r   )
openr   r	   	readlinesr   onesboolr   r   r   )r   fr"   r   s       r   imdbr<      s    > 
eO&667'	Ja{{} 
K
T"AAfuIzz  xa @JJa:7N 
K	Js   B
B$c           	     f   [         R                  " [        [        S-   5      SS9n[        R
                  " [        R                  " [        R                  " UR                  SS2S4   5      5      5      S   nU b  [        R                  R                  X SS9n[        R                  " UR                  US4   [        S9nUR                  US	S
24   n[        R
                  " [        R                  " UR                  5      R                  S5      S:H  5      S   nUR                  SS2U4   nXC4$ )aa  Predict the total number of violent crimes per 100K population.

This dataset is from the classic UCI Machine Learning repository:
https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized

Used in predictive regression tasks.

Parameters
----------
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : pd.DataFrame
    The feature data.
y : np.ndarray
    The target variable.

Examples
--------
To get the processed data and target labels::

    data, target = shap.datasets.communitiesandcrime()

z CommViolPredUnnormalizedData.txt?)	na_valuesNr   r   r5      i)r'   read_csvr   r	   r   whereinvertisnanilocr   r   r   arrayfloatvaluessum)r   raw_data
valid_indsr   r   
valid_colss         r   communitiesandcrimerN      s    6 {{53U!UVbefH "))BHHX]]1b5-A$BCDQGJZZ&&z!&L

z2~.e<A 	j!C%'(A"((188,003q89!<J	q*}A4Kr   c                ,   [         R                  R                  5       n[        R                  " UR
                  UR                  S9nUR                  nU b<  [        R                  R                  X SS9n[        R                  R                  X0SS9nX#4$ )a  Return the diabetes data in a nice package.

Used in predictive regression tasks.

Parameters
----------
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : pd.DataFrame
    The feature data.
y : np.ndarray
    The target variable.

Notes
-----
Feature Columns in ``X``:

- ``age`` (float): Age in years
- ``sex`` (float): Sex
- ``bmi`` (float): Body mass index
- ``bp`` (float): Average blood pressure
- ``s1`` (float): Total serum cholesterol
- ``s2`` (float): Low-density lipoproteins (LDL cholesterol)
- ``s3`` (float): High-density lipoproteins (HDL cholesterol)
- ``s4`` (float): Total cholesterol / HDL cholesterol ratio
- ``s5`` (float): Log of serum triglycerides level
- ``s6`` (float): Blood sugar level

Target ``y``:

- Progression of diabetes one year after baseline (float)

The diabetes dataset is a subset of the larger diabetes dataset from scikit-learn.
More details: :external+scikit-learn:func:`sklearn.datasets.load_diabetes`

Examples
--------
To get the processed data and target labels::

    data, target = shap.datasets.diabetes()

r!   r   r   )r$   r%   load_diabetesr'   r(   r"   r)   r*   r   r   r   r+   s       r   diabetesrQ      sz    \ 	&&(A	1661??	;BXXFZZr!<""6!"D:r   c                    g N displayr   s     r   irisrW   5  s    hkr   c                    g rS   rT   rU   s     r   rW   rW   7  s    fir   c                   [         R                  R                  5       n[        R                  " UR
                  UR                  S9nUR                  nUb<  [        R                  R                  X1SS9n[        R                  R                  XASS9nU (       a*  X4 Vs/ s H  n[        UR                  U   5      PM     sn4$ X44$ s  snf )as  Return the classic Iris dataset in a convenient package.

Parameters
----------
display : bool
    If True, return the original feature matrix along with class labels (as strings). Default is False.
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : pd.DataFrame
    The feature matrix.
y : np.ndarray or a list of strings
    If ``display`` is False, a numpy array representing the class labels encoded as integers is returned.
    If ``display`` is True, then a list of class labels is returned.

Notes
-----
- The dataset includes measurements of sepal length, sepal width, petal length, and petal width for three
  species of iris flowers.
- Class labels are encoded as integers (0, 1, 2) representing the species (setosa, versicolor, virginica).
- If ``display`` is True, class labels are returned as strings.

Examples
--------
To get the feature matrix and class labels::

    features, labels = shap.datasets.iris()

To get the feature matrix and class labels as strings::

    features, class_labels = shap.datasets.iris(display=True)

r!   r   r   )r$   r%   	load_irisr'   r(   r"   r)   r*   r   r   r   strr1   )rV   r   r,   r-   r*   vs         r   rW   rW   ;  s    H 	""$A	1661??	;BFZZr!<""6!"DF;FqCq)*F;;;: <s   "Cc           	        / SQn[         R                  " [        [        S-   5      U Vs/ s H  o3S   PM	     snS[	        U5      S9nUb  [
        R                  R                  XASS9nUR                  S/SS	9n[        [        S
 U5      5      nUS   S:H  US'   SSSSSSS.nU Hj  u  pU	S:X  d  M  US:X  a=  [        R                  " XX    V
s/ s H  oU
R                  5          PM     sn
5      XX'   MP  XX   R                  R                  XX'   Ml     U (       a!  UR                  / SQSS	9US   R                   4$ UR                  SS/SS	9US   R                   4$ s  snf s  sn
f )af  Return the Adult census data in a structured format.

Used in binary classification tasks.

Parameters
----------
display : bool, optional
    If True, return the raw data without target and redundant columns.
n_points : int, optional
    Number of data points to sample. If provided, randomly samples the specified number of points.

Returns
-------
X : pd.DataFrame
    If ``display`` is True, ``X`` contains the raw data without the 'Education', 'Target', and 'fnlwgt' columns.
    Otherwise, ``X`` contains the processed data without the 'Target' and 'fnlwgt' columns.
y : np.ndarray
    The 'Target' column returned as an array.

Notes
-----
- The original data includes the following columns:

    - ``Age`` (float) : Age in years.
    - ``Workclass`` (category) : Type of employment.
    - ``fnlwgt`` (float) : Final weight; the number of units in the target population that the record represents.
    - ``Education`` (category) : Highest level of education achieved.
    - ``Education-Num`` (float) : Numeric representation of education level.
    - ``Marital Status`` (category) : Marital status of the individual.
    - ``Occupation`` (category) : Type of occupation.
    - ``Relationship`` (category) : Relationship status.
    - ``Race`` (category) : Ethnicity of the individual.
    - ``Sex`` (category) : Gender of the individual.
    - ``Capital Gain`` (float) : Capital gains recorded.
    - ``Capital Loss`` (float) : Capital losses recorded.
    - ``Hours per week`` (float) : Number of hours worked per week.
    - ``Country`` (category) : Country of origin.
    - ``Target`` (category) : Binary target variable indicating whether the individual earns more than 50K.

- The Education' column is redundant with 'Education-Num' and is dropped for simplicity.
- The 'Target' column is converted to binary (True/False) where '>50K' is True and '<=50K' is False.
- Certain categorical columns are encoded for numerical representation.

Examples
--------
To get the processed data and target labels::

    data, target = shap.datasets.adult()

To get the raw data for display::

    raw_data, target = shap.datasets.adult(display=True)

))Ager   )	Workclasscategory)fnlwgtr   )	Educationr`   )zEducation-Numr   )zMarital Statusr`   )
Occupationr`   )Relationshipr`   )Racer`   )Sexr`   )zCapital Gainr   )zCapital Lossr   )zHours per weekr   )Countryr`   )Targetr`   z
adult.datar   r>   )namesr?   r6   r   rb      )axisc                    U S   S;  $ )Nr   )rh   rb   rT   )r   s    r   <lambda>adult.<locals>.<lambda>  s    !4K(Kr   rh   z >50K         rA   )zNot-in-family	UnmarriedzOther-relativez	Own-childHusbandWifer`   rd   )rb   rh   ra   ra   )r'   rB   r   r	   dictr   r   r   droplistfilterr   rG   stripcatcodesrI   )rV   r   dtypesr,   rK   r"   filt_dtypesrcodekr6   r\   s              r   adultr   l  sh   nF" {{o,-F5KFqdF5KWZbfgmbnH ::$$Xa$H==+Q=/DvKVTUK(^w.DNa1STablmnEJN"((dg#Fg!'')$4g#FG'++++   }}>Q}GhI^I^^^99h)92DN4I4III' 6L $Gs   E
E"
c                   [         R                  " [        [        S-   5      SS9n[         R                  " [        [        S-   5      SS9S   nUb<  [        R
                  R                  X!SS9n[        R
                  R                  X1SS9nU (       a(  UR                  5       nU[        R                  " U5      4$ U[        R                  " U5      4$ )a  Return a nicely packaged version of NHANES I data with survival times as labels.

Used in survival analysis tasks.

Parameters
----------
display : bool, optional
    If True, returns the features with a modified display. Default is False.
n_points : int, optional
    Number of data points to sample. Default is None (returns the entire dataset).

Returns
-------
X : pd.DataFrame
    The feature data matrix. If ``display`` is True, a modified version of the features for display
    is returned as ``X`` instead.
y : np.ndarray
    The target variables representing survival times.

Examples
--------
Usage example::

    features, survival_times = shap.datasets.nhanesi(display=True, n_points=100)

zNHANESI_X.csvr   )	index_colzNHANESI_y.csvr   r   )
r'   rB   r   r	   r   r   r   copyr   rG   )rV   r   r   r   	X_displays        r   nhanesir     s    6 	E/O;<JA
E/O;<J3OAJJa:JJa:FFH	"((1+%%bhhqk>r   c                ^  ^ [         R                  R                  5       n[         R                  R                  S5        U Sp2[         R                  " U5      mSTSSS2'   [         R                  " U5      n[        SSS5       H?  nS=XEUS-   4'   XES-   U4'   S=XEUS-   4'   XES-   U4'   S=XES-   US-   4'   XES-   US-   4'   MA     U4S jn[         R                  R                  X#5      nXwR                  S5      -
  n[         R                  " UR                  U5      UR                  S   -  n	[         R                  R                  [         R                  R                  U	5      5      R                  n
[         R                  " XR                  5      n[         R                  R                  [         R                  " [         R                  " XR                  5      R                  5      [         R                  " U5      -
  5      S	:  d   e[         R                  " U[         R                  R                  U5      R                  5      nUnU" U5      [         R                  R                  U5      S
-  -   n[         R                  R                  U5        [         R"                  " U5      U4$ )a  Correlated Groups (60 features)

A synthetic dataset consisting of 60 features with tight correlations among distinct groups of features.

Parameters
----------
n_points : int, optional
    Number of data points to generate. Default is 1,000.

Returns
-------
X : pd.DataFrame
    The feature data matrix
y : np.ndarray
    The target variables

Notes
-----
- The dataset is generated with known correlations among distinct groups of features.
- Each feature is a unit variance Gaussian random variable centred around 0.
- The labels are generated based on a linear function of the features with added random noise.

Examples
--------
.. code-block:: python

    data, target = shap.datasets.corrgroups60()

r   <   rj      rp   gGz?ro   c                2   > [         R                  " U T5      $ rS   r   matmulr   betas    r   r;   corrgroups60.<locals>.f$      yyD!!r   gư>{Gz?)r   randomseedzeroseyerangerandnmeanr   Tshapelinalgcholeskyinvnormcorrcoefr'   r(   )r   old_seedNMCir;   X_start
X_centeredSigmaWX_whiteX_finalr   r   r   s                  @r   corrgroups60r     s   > yy~~HIINN1 Rq 88A;DD2aL 	q	A1b!_$((QU(aAqk$((QU(aAqk,00a%Q,!E1q5L/ 
" iiooa#G<<?*JIIjllJ/*2B2B12EEE
		299==/022Aii
CC(G
		r{{299Z#=#?#?@266!9LMPTTT ii!3!3A!6!8!89GA	!ryyq!D((A IINN8<<?Ar   c                  ^ [         R                  R                  5       n[         R                  R                  S5        U Sp2[         R                  " U5      mSTSSS2'   U4S jn[         R                  R	                  X#5      nXUR                  S5      -
  nU" U5      [         R                  R	                  U5      S-  -   n[         R                  R                  U5        [        R                  " U5      U4$ )a@  Independent Linear (60 features)

A synthetic dataset consisting of 60 features.

Parameters
----------
n_points : int, optional
    Number of data points to generate. Default is 1,000.

Returns
-------
X : pd.DataFrame
    The feature data matrix
y : np.ndarray
    The target variables

Notes
-----
- Each feature is a unit variance Gaussian random variable centred around 0.
- The labels are generated based on a linear function of the features with added random noise.

Examples
--------
.. code-block:: python

    features, labels = shap.datasets.independentlinear60()

r   r   rj   r   rp   c                2   > [         R                  " U T5      $ rS   r   r   s    r   r;   independentlinear60.<locals>.fd  r   r   r   )r   r   r   r   r   r   r'   r(   )	r   r   r   r   r;   r   r   r   r   s	           @r   independentlinear60r   <  s    < yy~~HIINN1 Rq 88A;DD2aL" iiooa#G,,q/!A	!ryyq!D((A IINN8<<?Ar   c                    [         R                  R                  [        [        S-   5      5      u  pU b<  [
        R                  R                  XSS9n[
        R                  R                  X SS9nX4$ )a  
Return a sparse dataset in scipy csr matrix format.

Data Source: :external+scikit-learn:func:`sklearn.datasets.load_svmlight_file`

Parameters
----------
n_points : int, optional
    Number of data points to sample. If None, returns the entire dataset. Default is None.

Returns
-------
X : scipy.sparse.csr_matrix
    Sparse feature matrix.
y : np.ndarray
    Target labels.

Examples
--------
.. code-block:: python

    data, target = shap.datasets.a1a()

za1a.svmlightr   r   )r$   r%   load_svmlight_filer   r	   r   r   r   )r   r"   r*   s      r   a1ar   r  sf    6 ##66u_~=]7^_LDzz  a @""6!"D<r   c                 L   Sn [         R                  R                  [        U S-   5      5      u  p[         R                  R                  [        U S-   5      5      u  p4[        R
                  " [        U S-   5      5      n[        R
                  " [        U S-   5      5      nXX4XV4$ )a  Return ranking datasets from the LightGBM repository.

Used in ranking tasks.

Returns
-------
x_train : scipy.sparse.csr_matrix
    Training feature matrix.
y_train : numpy.ndarray
    Training labels.
x_test : scipy.sparse.csr_matrix
    Testing feature matrix.
y_test : numpy.ndarray
    Testing labels.
q_train : numpy.ndarray
    Training query information.
q_test : numpy.ndarray
    Testing query information.

Notes
-----
Data Source: LightGBM repository https://github.com/microsoft/LightGBM/tree/master/examples/lambdarank

Examples
--------
.. code-block:: python

    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()

zPhttps://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/lambdarank/z
rank.trainz	rank.testzrank.train.queryzrank.test.query)r$   r%   r   r   r   r   )rank_data_urlx_trainy_trainx_testy_testq_trainq_tests          r   rankr     s    > gM''::5Q]A];^_G%%88}{?Z9[\NFjj}/AABCGZZm.??@AFVW<<r   c                   Uc  [         R                  R                  U 5      n[         R                  R                  [         R                  R	                  [
        5      S5      n[         R                  " USS9  [         R                  R                  X!5      n[         R                  R                  U5      (       d  [        X5        U$ )z0Loads a file from the URL and caches it locally.cached_dataT)exist_ok)	ospathbasenamejoindirname__file__makedirsisfiler   )url	file_namedata_dir	file_paths       r   r   r     s~    GG$$S)	ww||BGGOOH5}EHKK4(WW\\(6I77>>)$$C#r   )   N)r   intr   
int | Nonereturnztuple[np.ndarray, np.ndarray]rS   )r   r   r   tuple[pd.DataFrame, np.ndarray])r   r   r   z!tuple[pd.DataFrame, pd.DataFrame])r   r   r   ztuple[list[str], np.ndarray])..)rV   zLiteral[False]r   r   r   r   )rV   zLiteral[True]r   r   r   ztuple[pd.DataFrame, list[str]])FN)rV   r:   r   r   r   z+tuple[pd.DataFrame, np.ndarray | list[str]])rV   r:   r   r   r   r   )i  )r   r   r   r   )r   r   r   z!tuple[ssp.csr_matrix, np.ndarray])r   zUtuple[ssp.csr_matrix, np.ndarray, ssp.csr_matrix, np.ndarray, np.ndarray, np.ndarray])r   r[   r   z
str | Noner   r[   )$
__future__r   r   typingr   r   r   r   urllib.requestr   numpyr   pandasr'   sklearn.datasetsr$   r   scipy.sparsesparsesspr	   __annotations__r   r.   r2   r<   rN   rQ   rW   r   r   r   r   r   r   r   rT   r   r   <module>r      s    " 	 : : &    M M*Z3l+\(V*Z6r 
 k 
 k	 i 
 i.b\J~&REP3l!H%=Pr   