U
    mdV                     @   s  d dl Z d dlmZ d dlZd dlZd dlmZ	 d dl
mZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ d	d
lmZ d	dlmZ d	dlmZ d!eee eee eeeeeej  d	ddZ!ddej"dddddfeee ee ee ee ee ee eed ej d
ddZ#dddej"dddddddddfeee ee ee ee ee ee ee eed eeee eeej  ddd Z$dS )"    N)Optional)AnnData   )logging)settings	Verbosity)sanitize_anndatacheck_nonnegative_integers)Literal   )_get_mean_var)materialize_as_ndarray)filter_genes  T333333?F)	adatalayern_top_genes	batch_keycheck_valuesspansubsetinplacereturnc           %      C   s  zddl m} W n tk
r,   tdY nX tj| jd}	|dk	rN| j| n| j}
|rlt|
slt	
dt t|
\|	d< |	d< |dkrttj| jd td	}n| j| j}g }t|D ]}|
||k }t|\}}|dk}tj|
jd
 tjd	}t|| }t|| }||||dd}|  |jj||< td| }|tj }|jd }t|}|| | }t|rt|}|j ||j! k}||j!|  |j |< t"|#dj$dd}t"|j$dd}n>t%||j}t&|||k| t'|j$dd}|j$dd}d
|d
 t'|  |t'| | d| |   }|(|)d
d qtj*|dd}tj+tj+| d
dd
d} | tj,} tj$| |k tdd}!tj-| | |k< tj./| }"tj.j0|"dd1tj-}#|!|	d< |#|	d< tj2|dd|	d< |	ddg j3ddgddgddj4}$d|	d< d|	j5|$dt| df< |sT|rddi| j6d< t78d |	d j| j9d< |	d j| j9d< |	d j| j9d< |	d j| j9d< |	d jjddd| j9d< |dk	r|	d j| j9d< |r| :|	d j n|dkr|	j;dgd
d}	|	S dS )a&      See `highly_variable_genes`.

    For further implementation details see https://www.overleaf.com/read/ckptrbgzzzpg

    Returns
    -------
    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
    updates `.var` with the following fields:

    highly_variable : bool
        boolean indicator of highly-variable genes.
    **means**
        means per gene.
    **variances**
        variance per gene.
    **variances_norm**
        normalized variance per gene, averaged in the case of multiple batches.
    highly_variable_rank : float
        Rank of the gene according to normalized variance, median rank in the case of multiple batches.
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are detected as HVG.
    r   )loesszAPlease install skmisc package via `pip install --user scikit-misc)indexNzI`flavor='seurat_v3'` expects raw count data, but non-integers were found.meansZ	variances)Zdtyper   r   )r   Zdegree
   Zaxishighly_variable_nbatchesZhighly_variable_rankZvariances_normTFlast)	ascendingna_positionhighly_variableflavor	seurat_v3hvgzadded
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)float64copy)<Zskmisc.loessr   ImportErrorpd	DataFrame	var_nameslayersXr	   warningswarnUserWarningr   ZCategoricalnpzerosshapeintobsvaluesuniquer(   log10fitoutputsZfitted_valuessqrtastyper*   	sp_sparseissparseZ
csr_matrixdataindicesarraypowersumZbroadcast_toZputmaskZsquareappendZreshapeconcatenateargsortfloat32nanmaZmasked_invalidmedianZfilledmeansort_valuesr   locunslogghintvar_inplace_subset_varZdrop)%r   r   r   r   r   r   r   r   r   dfr0   Z
batch_infoZnorm_gene_varsbZX_batchrN   rT   Z	not_constZestimat_varyxmodelZreg_stdZbatch_countsNZvmaxZclip_valmaskZsquared_batch_counts_sumZbatch_counts_sumZclip_val_broadZnorm_gene_varZranked_norm_gene_varsZnum_batches_high_varZ	ma_rankedZmedian_rankedZsorted_index r]   d/home/sam/Atlas/atlas_env/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py _highly_variable_genes_seurat_v3   s    "




 	
 

r_   g      ?g?      seurat)rb   cell_ranger)
r   r   min_dispmax_dispmin_meanmax_meanr   n_binsr%   r   c	                 C   sH  |dk	r| j | n| j}	|dkr`d|  krV| jd d dk	rV|	t| jd d 9 }	t|	}	tt|	\}
}d|
|
dk< ||
 }|dkrtj	||dk< t|}t
|
}
t }|
|d< ||d< |dkrtj|d |d	|d
< |d
d }| }|jdd}| }t||d
 j d  }t|dkrLtd| d ||j j||j< d||j< |d j||d
 j j ||d
 j j |d< n|dkrZddlm} t|d tjtj t|d tdddtjf |d
< |d
d }| }t  J t!d |"|j#}|d j||d
 j j ||d
 j j |d< W 5 Q R X nt$d|d j}|dk	r
|t%|  }|ddd &  || j'krt(d | j'}||j)krt*dt+ |j)}||d  }t,|d j|k}td| d|  n2d|t%|< tj-.|
|k|
|k ||k||k f}||d< |S )z    See `highly_variable_genes`.

    Returns
    -------
    A DataFrame that contains the columns
    `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
    Nrb   log1pbaseg-q=r   r   dispersions)ZbinsZmean_binr   )ZddofzGene indices zy fell into a single bin: their normalized dispersion was set to 1.
    Decreasing `n_bins` will likely avoid this effect.dispersions_normrc   )robustr   i      ignorez.`flavor` needs to be "seurat" or "cell_ranger"r   z3`n_top_genes` > `adata.n_var`, returning all genes.zb`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.zthe z; top genes correspond to a normalized dispersion cutoff of r$   )/r/   r0   Zuns_keysrQ   r4   logexpm1r   r   rK   ri   r,   r-   cutgroupbyrN   ZstdZisnullwherer9   tolistlenrR   debugZstatsmodelsrm   Zr_infZ
percentileZarangerM   r1   catch_warningssimplefilterapplyZmad
ValueErrorisnansortZn_varsinfosizer2   r3   Z
nan_to_numlogical_andreduce)r   r   rd   re   rf   rg   r   rh   r%   r0   rN   rT   Z
dispersionrV   Zdisp_groupedZdisp_mean_binZdisp_std_binZone_gene_per_binZgen_indicesrm   Zdisp_median_binZdisp_mad_bindispersion_normZdisp_cut_offgene_subsetr]   r]   r^   #_highly_variable_genes_single_batch   s    







*





	r   )rb   rc   r&   )r   r   r   rd   re   rf   rg   r   rh   r%   r   r   r   r   r   c                 C   s  |dk	r,t dd ||||fD s,td td}t| tsHtd|	dkrht| ||||||
|dS |dkrt| ||||||||	d		}nZt|  | j	| j
j}g }| j}|D ]}| | j	| |k }tjtj t|d
ddd }W 5 Q R X |dd|f }t|||||||||	d		}tjtt| t|jf|jd}|d t|d< ||  |d< |jj|d< tj||gdd}tt|d t| d f}|j t!| }|"| qtj|dd}|d t#|d< |$d%t&tj'tj'tj'tj(d}|j)t&dddd |d t|k|d< |dk	r|j*ddgdddd t|j+d }d|d|< |t|d< |j | jddf }nV|j | j }|j,j}d|t-|< tj./|j0|k|j0|k |j,|k|j,|k f}||d< tjd|d |s|
rd|	i| j1d< t2d  |d j| j3d< |d! j| j3d!< |d" j| j3d"< |d jjd#dd$| j3d< |dk	r|d j| j3d< |d j| j3d< |
r| 4|d j n|S dS )%uJ      Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_.

    Expects logarithmized data, except when `flavor='seurat_v3'`, in which count
    data is expected.

    Depending on `flavor`, this reproduces the R-implementations of Seurat
    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_.

    For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
    dispersion is obtained by scaling with the mean and standard deviation of
    the dispersions for genes falling into a given bin for mean expression of
    genes. This means that for each bin of mean expression, highly variable
    genes are selected.

    For [Stuart19]_, a normalized variance for each gene is computed. First, the data
    are standardized (i.e., z-score normalization per feature) with a regularized
    standard deviation. Next, the normalized variance is computed as the variance
    of each gene after the transformation. Genes are ranked by the normalized variance.

    See also `scanpy.experimental.pp._highly_variable_genes` for additional flavours
    (e.g. Pearson residuals).

    Parameters
    ----------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    layer
        If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
    n_top_genes
        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'`.
    min_mean
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
    max_mean
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
    min_disp
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
    max_disp
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
    span
        The fraction of the data (cells) used when estimating the variance in the loess
        model fit if `flavor='seurat_v3'`.
    n_bins
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    flavor
        Choose the flavor for identifying highly variable genes. For the dispersion
        based methods in their default workflows, Seurat passes the cutoffs whereas
        Cell Ranger passes `n_top_genes`.
    subset
        Inplace subset to highly-variable genes if `True` otherwise merely indicate
        highly variable genes.
    inplace
        Whether to place calculated metrics in `.var` or return them.
    batch_key
        If specified, highly-variable genes are selected within each batch separately and merged.
        This simple process avoids the selection of batch-specific genes and acts as a
        lightweight batch correction method. For all flavors, genes are first sorted
        by how many batches they are a HVG. For dispersion-based flavors ties are broken
        by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
        (across batches) rank based on within-batch normalized variance.
    check_values
        Check if counts in selected layer are integers. A Warning is returned if set to True.
        Only used if `flavor='seurat_v3'`.

    Returns
    -------
    Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
    updates `.var` with the following fields

    highly_variable : bool
        boolean indicator of highly-variable genes
    **means**
        means per gene
    **dispersions**
        For dispersion-based flavors, dispersions per gene
    **dispersions_norm**
        For dispersion-based flavors, normalized dispersions per gene
    **variances**
        For `flavor='seurat_v3'`, variance per gene
    **variances_norm**
        For `flavor='seurat_v3'`, normalized variance per gene, averaged in
        the case of multiple batches
    highly_variable_rank : float
        For `flavor='seurat_v3'`, rank of the gene according to normalized
        variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are detected as HVG
    highly_variable_intersection : bool
        If batch_key is given, this denotes the genes that are highly variable in all batches

    Notes
    -----
    This function replaces :func:`~scanpy.pp.filter_genes_dispersion`.
    Nc                 s   s   | ]}|d kV  qd S )Nr]   ).0mr]   r]   r^   	<genexpr>  s    z(highly_variable_genes.<locals>.<genexpr>z3If you pass `n_top_genes`, all cutoffs are ignored.z extracting highly variable geneszv`pp.highly_variable_genes` expects an `AnnData` argument, pass `inplace=False` if you want to return a `pd.DataFrame`.r&   )r   r   r   r   r   r   r   )r   rd   re   rf   rg   r   rh   r%   r   F)Z	min_cellsr   r   )columnsr$   ZgeneT)Zignore_indexr   )r   rk   rl   r$   r    )r$   )r   r   Zhighly_variable_intersectionrl   r!   )r"   r#   r   z    finished)timer%   r'   zadded
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)r   rk   rJ   r)   )5allrR   r   
isinstancer   r}   r_   r   r   r8   cat
categoriesr.   r   	verbosityoverrider   errorr   r,   r-   r4   r5   rF   rw   r   r?   boolr9   concatrH   ru   rP   rI   rG   r7   rt   ZaggdictZnanmeanZnansumrenamerO   r6   rl   r~   r   r   r   rQ   rS   rT   rU   )r   r   r   rd   re   rf   rg   r   rh   r%   r   r   r   r   startrV   ZbatchesZ	gene_listbatchZadata_subsetZfiltr'   Zmissing_hvgZidxsZhigh_varr   r   r]   r]   r^   highly_variable_genes'  s    w





$
 


 
r   )Nr   NTr   FT)%r1   typingr   numpyr4   Zpandasr,   Zscipy.sparsesparser@   Zanndatar    r   rR   Z	_settingsr   r   _utilsr   r	   Z_compatr
   r   Z_distributedr   _simpler   strr7   r   floatr-   r_   ry   r   r   r]   r]   r]   r^   <module>   s           "x