U
    md7                     @   s  d dl mZ d dlmZ d dlZd dlZd dlZd dl	Z
d dlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZmZ eejZd.eeeee
jdddZ ee
jdddZ!e
jdddZ"ee
jdddZ#ee
jdddZ$e
jdddZ%ee
jdddZ&eee
jdd d!Z'eee
jdd"d#Z(d/e)e)ee e*d%d&d'Z+ed0d$d)ed* e*e
jd+d,d-Z,dS )1    )Path)OptionalN)version   )logging_utils)Literal)settings)readread_visium   )check_datasetdir_existsfilter_oldformatwarning            ?  )n_variables	n_centerscluster_stdn_observationsreturnc                 C   s@   ddl }|jj|| ||dd\}}tj|t|td|jdS )a      Gaussian Blobs.

    Parameters
    ----------
    n_variables
        Dimension of feature space.
    n_centers
        Number of cluster centers.
    cluster_std
        Standard deviation of clusters.
    n_observations
        Number of observations. By default, this is the same observation number
        as in :func:`scanpy.datasets.krumsiek11`.

    Returns
    -------
    Annotated data matrix containing a observation annotation 'blobs' that
    indicates cluster identity.
    r   N)Z	n_samplesZ
n_featuresZcentersr   Zrandom_state)blobs)obsdtype)	Zsklearn.datasetsZdatasetsZ
make_blobsadAnnDatadictastypestrr   )r   r   r   r   ZsklearnXy r"   R/home/sam/Atlas/atlas_env/lib/python3.8/site-packages/scanpy/datasets/_datasets.pyr      s    
r   )r   c                  C   s   t jd } d}t| |d}|S )a9      Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    z!burczynski06/GDS1615_full.soft.gzzQftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz
backup_url)r	   
datasetdirr
   )filenameurladatar"   r"   r#   burczynski069   s    
r*   c                  C   s   t d } tj}dt_t| dd}|t_d|jd< ddd	d
dd}||jd< tdd t|jD }d|dd< d	|dd< d
|dd< d|dd< ||j	d< t
| |S )a      Simulated myeloid progenitors [Krumsiek11]_.

    The literature-curated boolean network from [Krumsiek11]_ was used to
    simulate the data. It describes development to four cell fates: 'monocyte',
    'erythrocyte', 'megakaryocyte' and 'neutrophil'.

    See also the discussion of this data in [Wolf19]_.

    Simulate via :func:`~scanpy.tl.sim`.

    Returns
    -------
    Annotated data matrix.
    zkrumsiek11.txterrorTZfirst_column_namesr   irootZStemZMoEryZMkZNeu)r      i?  i  ik  
highlightsc                 S   s   g | ]}d qS )Z
progenitorr"   .0ir"   r"   r#   
<listcomp>g   s     zkrumsiek11.<locals>.<listcomp>P         i@  i  i  i0  r   	cell_type)HEREr	   	verbosityr
   unsnparrayrangeZn_obsr   r   sanitize_anndata)r'   Zverbosity_saver)   Zfate_labelsr8   r"   r"   r#   
krumsiek11O   s     



r@   c                     s   t jd } d}t| d|d}t|jddddg }|d	d	|f  }d
|jd< dddddd  fdd|jD |j	d< t
j|j	d t  d|j	d< t  |jd< |S )zp    Hematopoiesis in early mouse embryos [Moignard15]_.

    Returns
    -------
    Annotated data matrix.
    zmoignard15/nbt.3154-S3.xlsxzrhttps://static-content.springer.com/esm/art%3A10.1038%2Fnbt.3154/MediaObjects/41587_2015_BFnbt3154_MOESM4_ESM.xlsxzdCt_values.txt)Zsheetr%   ZEif2b1ZMrpl19ZPolr2aZUbcNi  r-   z#D7A83Ez#7AAE5Dz#497ABCz#AF353Az#765099)ZHFZNPZPSZ4SGZ4SFGc                    s&   g | ] t  fd d D qS )c                 3   s   | ]}  |r|V  qd S )N)
startswith)r2   gnameZsnamer"   r#   	<genexpr>   s     
 z(moignard15.<locals>.<listcomp>.<genexpr>)nextkeys)r2   groupsrC   r#   r4      s   zmoignard15.<locals>.<listcomp>Z
exp_groups)
categoriesZexp_groups_colors)r	   r&   r
   r<   Zin1d	var_namescopyr;   Z	obs_namesr   pdZCategoricallistrF   values)r'   r%   r)   Zgene_subsetr"   rG   r#   
moignard15q   s*    	


 
rO   c            
   	      s@  t d ddl} tjd }|jjdd d}t|| | 	|d^}|d	 d
 }|d d
 
t}|d d
 
t}|d d
  
t}|d d
 
t}W 5 Q R X tj| tjd}	||	_||	_ddg   d 7   fdd|D |	jd< t|	 dd |	jD |	_t||	j}|	dd|f }	d|	jd< |	S )a2      Development of Myeloid Progenitors [Paul15]_.

    Non-logarithmized raw data.

    The data has been sent out by Email from the Amit Lab. An R version for
    loading the data can be found here
    https://github.com/theislab/scAnalysisTutorial

    Returns
    -------
    Annotated data matrix.
    zWIn Scanpy 0.*, this returned logarithmized data. Now it returns non-logarithmized data.r   Nzpaul15/paul15.h5Texist_okz"http://falexwolf.de/data/paul15.h5rzdata.debatchedr"   zdata.debatched_rownameszdata.debatched_colnamesz
cluster.idzinfo.genes_strings)r      r.   z3MEP Mk GMP GMP DC Baso Baso Mo Mo Neu Neu Eos Lymphc                    s    g | ]}|  |d    qS )r   r"   r1   r8   r"   r#   r4      s     zpaul15.<locals>.<listcomp>Zpaul15_clustersc                 S   s   g | ]}| d d qS );r   )split)r2   Zgnr"   r"   r#   r4      s     iH  r-   )loggwarningh5pyr	   r&   parentmkdirr   check_presence_downloadFiler   r   flattenintr   r   Z	transposer<   Zfloat32rJ   Z	row_namesrV   r   r?   Zintersect1dr;   )
rY   r'   r%   fr    Z
gene_names
cell_namesZclustersZinfogenes_namesr)   r"   rT   r#   paul15   s4    



rb   c                  C   s"   t d } t| dd}d|jd< |S )z    Simulated toggleswitch.

    Data obtained simulating a simple toggleswitch [Gardner00]_

    Simulate via :func:`~scanpy.tl.sim`.

    Returns
    -------
    Annotated data matrix.
    ztoggleswitch.txtTr,   r   r-   )r9   r
   r;   )r'   r)   r"   r"   r#   toggleswitch   s    
rc   c               
   C   s@   t d } t & tjdtdd t| W  5 Q R  S Q R X dS )a      Subsampled and processed 68k PBMCs.

    10x PBMC 68k dataset from
    https://support.10xgenomics.com/single-cell-gene-expression/datasets

    The original PBMC 68k dataset was preprocessed using scanpy and was saved
    keeping only 724 cells and 221 highly variable genes.

    The saved file contains the annotation of cell types (key: `'bulk_labels'`),
    UMAP coordinates, louvain clustering and gene rankings based on the
    `bulk_labels`.

    Returns
    -------
    Annotated data matrix.
    z10x_pbmc68k_reduced.h5adignoreanndatacategorymoduleN)r9   warningscatch_warningsfilterwarningsFutureWarningr
   )r'   r"   r"   r#   pbmc68k_reduced   s    
rm   c                  C   s   d} t tjd | d}|S )a      3k PBMCs from 10x Genomics.

    The data consists in 3k PBMCs from a Healthy Donor and is freely available
    from 10x Genomics (`here
    <http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz>`__
    from this `webpage
    <https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k>`__).

    The exact same data is also used in Seurat's
    `basic clustering tutorial <https://satijalab.org/seurat/pbmc3k_tutorial.html>`__.

    .. note::

        This downloads 5.9 MB of data upon the first call of the function and stores it in `./data/pbmc3k_raw.h5ad`.

    The following code was run to produce the file.

    .. code:: python

        adata = sc.read_10x_mtx(
            # the directory with the `.mtx` file
            './data/filtered_gene_bc_matrices/hg19/',
            # use gene symbols for the variable names (variables-axis index)
            var_names='gene_symbols',
            # write a cache file for faster subsequent reading
            cache=True,
        )

        adata.var_names_make_unique()  # this is unnecessary if using 'gene_ids'
        adata.write('write/pbmc3k_raw.h5ad', compression='gzip')

    Returns
    -------
    Annotated data matrix.
    z(http://falexwolf.de/data/pbmc3k_raw.h5adzpbmc3k_raw.h5adr$   )r
   r	   r&   )r(   r)   r"   r"   r#   pbmc3k   s    'rn   c                
   C   sB   t  0 t jdtdd ttjd ddW  5 Q R  S Q R X dS )zProcessed 3k PBMCs from 10x Genomics.

    Processed using the `basic tutorial <https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html>`__.

    Returns
    -------
    Annotated data matrix.
    rd   re   rf   zpbmc3k_processed.h5adz[https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5adr$   N)ri   rj   rk   rl   r
   r	   r&   r"   r"   r"   r#   pbmc3k_processed&  s    
ro   F)	sample_idspaceranger_versionbase_dirdownload_imagec              	   C   s   ddl }|dkrtj}d| d|  d}||  }|jdd |  d}|| }tj||| d ||*}	|	D ]}
||
j  sp|		|
| qpW 5 Q R X tj|d	 ||  d
 d |rtj|d ||  d d dS )z
    Params
    ------
    sample_id
        String name of example visium dataset.
    base_dir
        Where to download the dataset to.
    download_image
        Whether to download the high-resolution tissue section.
    r   Nz/https://cf.10xgenomics.com/samples/spatial-exp//TrP   z_spatial.tar.gz)r'   r%   zfiltered_feature_bc_matrix.h5z_filtered_feature_bc_matrix.h5	image.tifz
_image.tif)
tarfiler	   r&   r[   r   r\   opennameexistsextract)rp   rq   rr   rs   rv   Z
url_prefixZ
sample_dirZtar_filenameZtar_pthr`   elr"   r"   r#   _download_visium_dataset9  s2    
 r|   "V1_Breast_Cancer_Block_A_Section_1)include_hires_tiff)r}   Z"V1_Breast_Cancer_Block_A_Section_2ZV1_Human_HeartZV1_Human_Lymph_NodeZV1_Mouse_KidneyZV1_Adult_Mouse_BrainZ!V1_Mouse_Brain_Sagittal_PosteriorZ+V1_Mouse_Brain_Sagittal_Posterior_Section_2Z V1_Mouse_Brain_Sagittal_AnteriorZ*V1_Mouse_Brain_Sagittal_Anterior_Section_2ZV1_Human_Brain_Section_1ZV1_Human_Brain_Section_2Z&V1_Adult_Mouse_Brain_Coronal_Section_1Z&V1_Adult_Mouse_Brain_Coronal_Section_2Z-Targeted_Visium_Human_Cerebellum_NeuroscienceZParent_Visium_Human_CerebellumZ-Targeted_Visium_Human_SpinalCord_NeuroscienceZParent_Visium_Human_SpinalCordZ-Targeted_Visium_Human_Glioblastoma_Pan_CancerZ Parent_Visium_Human_GlioblastomaZ-Targeted_Visium_Human_BreastCancer_ImmunologyZ Parent_Visium_Human_BreastCancerZ.Targeted_Visium_Human_OvarianCancer_Pan_CancerZ.Targeted_Visium_Human_OvarianCancer_ImmunologyZ!Parent_Visium_Human_OvarianCancerZ4Targeted_Visium_Human_ColorectalCancer_GeneSignatureZ$Parent_Visium_Human_ColorectalCancer)rp   r~   r   c                C   sT   d| krd}nd}t | ||d |rBttj|  tj|  d d}nttj|  }|S )u      Processed Visium Spatial Gene Expression data from 10x Genomics.
    Database: https://support.10xgenomics.com/spatial-gene-expression/datasets

    Parameters
    ----------
    sample_id
        The ID of the data sample in 10x’s spatial database.
    include_hires_tiff
        Download and include the high-resolution tissue image (tiff) in `adata.uns["spatial"][sample_id]["metadata"]["source_image_path"]`.

    Returns
    -------
    Annotated data matrix.
    ZV1_z1.1.0z1.2.0)rs   ru   )Zsource_image_path)r|   r   r	   r&   )rp   r~   rq   r)   r"   r"   r#   
visium_sgel  s    2  r   )r   r   r   r   )NF)r}   )-pathlibr   typingr   ri   numpyr<   ZpandasrL   re   r   	packagingr    r   rW   r   Z_compatr   Z	_settingsr	   Z	readwriter
   r   r   r   __file__rZ   r9   r_   floatr   r   r*   r@   rO   rb   rc   rm   rn   ro   r   boolr|   r   r"   r"   r"   r#   <module>   st   
    &"%6*  3  