U
    mdp                     @   sB  d Z ddlmZ ddlmZmZmZ ddlZddl	Z
ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ eed  Zed Zdd Zd(ddZdd ZG dd dZ d)ee!ee" eed ee! f e!ee# e"e"ee! e"eee"ee! ee dddZ$dd  Z%d*edd%d&d'Z&dS )+z1Rank genes according to differential expression.
    )floor)IterableUnionOptionalN)AnnData)issparsevstack   )_utils)logging)_get_mean_var)Literal)_get_obs_rep)check_nonnegative_integers)logregt-testwilcoxont-test_overestim_var)benjamini-hochberg
bonferronic                 C   sX   | j d }tj|td}t| | | d  }t| | d d d }|| | }|S )Nr   )Zdtype)shapenparangeintZargpartitionZargsort)scoresZn_topZn_fromZreference_indices	partitionZpartial_indicesglobal_indices r   X/home/sam/Atlas/atlas_env/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py_select_top_n   s    
r    c                 #   s   d}| j d }t| r(dd dd  ntjdd  d k	oDd k	}|rptt }fdd}n| j d } fd	d}t|| }td||D ]:}	t|	| |}
tj	|| |	|
d
}|
 }||	|
fV  qd S )Ni    c                 S   s   t |  S N)r   toarray)Ztplr   r   r   <lambda>'       z_ranks.<locals>.<lambda>c                 S   s   |   S r"   )r#   Xr   r   r   r$   (   r%   c                 S   s   | S r"   r   r&   r   r   r   r$   +   r%   c                    s$   |  ||f | ||f fS r"   r   r'   leftright)mask	mask_restmerger   r   r$   1   s   r   c                    s    | d d ||f S r"   r   r(   )adaptr   r   r$   6   r%   )data)r   r   r   r   count_nonzeror   rangeminpd	DataFrameZrank)r'   r+   r,   ZCONST_MAX_SIZEn_genesZmaskedn_cellsZ	get_chunkZ	max_chunkr)   r*   Zdfranksr   )r.   r+   r,   r-   r   _ranks!   s&    


r8   c                 C   s   t | jd }|dk r*t | jd dS t j| dd}t j|dd  |d d kd|jd d fddd}t |t |jd d d d f d}t j|dd}t j|dd	t j}d|d | j
dd|d |   S )	Nr   r	   r!         ?Zaxisr   T   )r   float64r   repeatsortinsertwherer   diffZastypesum)r7   sizeZarrtfidxZcntr   r   r   _tiecorrectC   s    4&rF   c                   @   s@   e Zd ZdddZdd Zd	d
 Zdd Zdd ZdddZdS )
_RankGenesrestTNFc                    sv  d   kr. jd d d k	r. fdd| _ntj| _t ||\| _| _t	| jt	 j
|  jdd  j@ }t|dkrtdd| }	|d k	r|rtd	|	j| }
n|r̈ jd k	r̈ j}	|	j}
t|
r|
  |
| _|	j| _d | _|d
krt| j|kd d | _d | _d | _d | _d | _|| _d | _d | _ d | _! j
| "| j| _# j
j| j#|f | _$d S )Nlog1pbasec                    s   t | t  jd d  S NrI   rJ   r   expm1logunsxadatar   r   r$   ^   r%   z%_RankGenes.__init__.<locals>.<lambda>c                 S   s   | dk S )Nr	   r   rP   r   r   r   r$   h   r%   r   zPCould not calculate statistics for groups {} since they only contain one sample.z, z/Cannot specify `layer` and have `use_raw=True`.rH   )%uns_keysrO   
expm1_funcr   rM   r
   Zselect_groupsgroups_ordergroups_maskssetobsZvalue_countslocindexlen
ValueErrorformatjoinZlayersrawr'   r   Zeliminate_zeros	var_names
ireferencer@   meansvars
means_rest	vars_restcomp_ptsptspts_reststatsisingrouping_maskgrouping)selfrS   groupsgroupby	referenceuse_rawlayerrg   Zinvalid_groups_selectedZ
adata_compr'   r   rR   r   __init__R   sT      

z_RankGenes.__init__c           	      C   s  | j jd }| jjd }t||f| _t||f| _| jrLt||fnd | _| j	d krt||f| _
t||f| _| jrt||fnd | _n4| j| j	 }| j | }t|\| j| j	< | j| j	< ~t| j rdd }ndd }t| jD ]\}}| j | }| jr$|||jd  | j|< | j	d k	r>|| j	kr>qt|\| j|< | j|< | j	d kr| }| j | }t|\| j
|< | j|< | jr|||jd  | j|< ~qd S )Nr!   r   c                 S   s   | j ddS Nr   r:   )getnnzr&   r   r   r   r$      r%   z)_RankGenes._basic_stats.<locals>.<lambda>c                 S   s   t j| ddS ru   )r   r0   r&   r   r   r   r$      r%   )r'   r   rW   r   zerosrc   rd   rg   rh   rb   re   rf   ri   r   r   	enumerate)	rn   r5   n_groupsr,   ZX_restZget_nonzerosimaskr+   ZX_maskr   r   r   _basic_stats   s@    






z_RankGenes._basic_statsc                 c   sH  ddl m} |   t| jD ]"\}}| jd k	r>|| jkr>q| j| }| j| }t	|}| jd k	r| j| j }| j| j }	t	| j| j }
n$| j
| }| j| }	| jjd | }
|dkr|
}n|dkr|}ntdtjdd. |j|t|||t|	|dd	\}}W 5 Q R X d|t|< d
|t|< |||fV  qd S )Nr   rj   r   r   zMethod does not exist.ignore)invalidF)Zmean1Zstd1Znobs1Zmean2Zstd2Znobs2Z	equal_varr!   )scipyrj   r{   rx   rW   rb   rc   rd   r   r0   re   rf   r'   r   r]   ZerrstateZttest_ind_from_statssqrtisnan)rn   methodrj   group_indexr+   
mean_groupZ	var_groupZns_group	mean_restZvar_restZns_otherZns_restr   pvalsr   r   r   t_test   sB    





z_RankGenes.t_testc              
   c   s  ddl m} |   | jjd }| jd k	rlt|}|rFt|}nd}t| j	D ]\}}|| jkrjqT| j	| j }t
|}	t
|}
|	dks|
dkrtd t| j||D ]B\}}}t|jd|	d d f |||< |rt||||< qt||	 |
 |	|
 d  d }||	|	|
 d d   | }d|t|< d|jjt| }|||fV  qTnn| j	jd }t||f}| jjd }|rt||f}t| jD ]`\}}}t| j	D ]H\}}t|j|d d f ||||f< |rt|||||f< qĐqt| j	D ]\}}t
|}	|r>|| }nd}t||	 ||	  |d  d }||d d f |	|d  d  | ||d d f< d|t|< d|jjt||d d f  }||| |fV  qd S )	Nr   r|   r!      zQFew observations in a group for normal approximation (<=25). Lower test accuracy.g      (@g       @r	   )r   rj   r{   r'   r   rb   r   rw   rx   rW   r0   logghintr8   rB   ZilocrF   r   r   distributionsZnormZsfabs)rn   tie_correctrj   r5   r   Tr   r+   r,   Zn_activeZm_activer7   r)   r*   Zstd_devr   ry   r6   rz   ZT_ir   r   r   r      sn    



$$

$z_RankGenes.wilcoxonc           	      k   s   ddl m} | j| jjd d f }t| jdkr8td|f |}||| j	j
j |j}t| jD ]F\}}t| jdkr|d }n|| }||d fV  t| jdkrd qqdd S )Nr   )LogisticRegressionr!   z7Cannot perform logistic regression on a single cluster.r	   )Zsklearn.linear_modelr   r'   rl   valuesr\   rV   r]   fitrm   catcodesZcoef_rx   )	rn   kwdsr   r'   ZclfZ
scores_allZigroup_r   r   r   r   r   L  s    

z_RankGenes.logregr   c                 K   s  |dkr|  |}n(|dkr(| |}n|dkr<| jf |}d | _| jjd }|D ]\}	}
}t| j|	 }|d k	r|rt	|
n|
}t
||}d}ntd }d}| jd krtj||fg}tj|d| _|d k	r| j| | j|df< |
| | j|df< |d k	r~|| | j|df< |d	krRd
dlm} d|t|< ||ddd\}}}}n|dkrlt|| d}|| | j|df< | jd k	rR| j|	 }| jd kr| j|	 }n| j| j }| |d | |d  }t|| | j|df< qR|d kr | j| j_d S )N>   r   r   r   r   r!   namesr   )columnsr   r   r   )multipletestsg?Zfdr_bh)alphar   r   r9   	pvals_adj&.>logfoldchanges)r   r   r   rj   r'   r   strrV   r   r   r    slicer3   Z
MultiIndexfrom_tuplesr4   ra   Zstatsmodels.stats.multitestr   r   minimumrc   rb   re   rU   log2r[   )rn   r   corr_methodn_genes_user
rankby_absr   r   Zgenerate_test_resultsr5   r   r   r   
group_nameZscores_sortr   Z	first_colrE   r   r   r   r   r   Zfoldchangesr   r   r   compute_statisticse  s`    




  



z_RankGenes.compute_statistics)rH   TNF)r   NFF)	__name__
__module____qualname__rt   r{   r   r   r   r   r   r   r   r   rG   Q   s       
D/2U    rG   allrH   Fr   )rS   rp   rr   ro   rq   r5   r   rh   	key_addedcopyr   r   r   rs   returnc                 K   sF  |dkr| j dk	}n|dkr.| j dkr.td|
dkrDtd d}
d|krX|d }td}ddd	d
h}|
|krtd| dddh}||krtd| d|	r|  n| } t|  |dkrd}n`t	|t
tfrtdnHt|}t	|d trdd |D }|dkr.|t|kr.||g7 }|dkrt|| j| jjkrt| j| jj }td| d| d|dkrd}i | j|< t|||
|||d| j| d< t| ||||||}t|jr|
d
krtd |}|dks||jjd kr
|jjd }td|d tdtj|jdd   |j|
||||f| |jdk	rd!d |jD }tj |jj!|j"|d"| j| d#< |j#dk	rtj |j#j!|j"|d"| j| d$< |j$j%& |j$_%d%d&d&d'd'd(}|j$j%j'd D ](}|j$| j(d)|| d*| j| |< qtjd+|d,|d-|
d.kr.d/nd0 d1 |	rB| S dS )2a9      Rank genes for characterizing groups.

    Expects logarithmized data.

    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the observations grouping to consider.
    use_raw
        Use `raw` attribute of `adata` if present.
    layer
        Key from `adata.layers` whose value will be used to perform tests on.
    groups
        Subset of groups, e.g. [`'g1'`, `'g2'`, `'g3'`], to which comparison
        shall be restricted, or `'all'` (default), for all groups.
    reference
        If `'rest'`, compare each group to the union of the rest of the group.
        If a group identifier, compare with respect to this group.
    n_genes
        The number of genes that appear in the returned tables.
        Defaults to all genes.
    method
        The default method is `'t-test'`,
        `'t-test_overestim_var'` overestimates variance of each group,
        `'wilcoxon'` uses Wilcoxon rank-sum,
        `'logreg'` uses logistic regression. See [Ntranos18]_,
        `here <https://github.com/scverse/scanpy/issues/95>`__ and `here
        <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__,
        for why this is meaningful.
    corr_method
        p-value correction method.
        Used only for `'t-test'`, `'t-test_overestim_var'`, and `'wilcoxon'`.
    tie_correct
        Use tie correction for `'wilcoxon'` scores.
        Used only for `'wilcoxon'`.
    rankby_abs
        Rank genes by the absolute value of the score, not by the
        score. The returned scores are never the absolute values.
    pts
        Compute the fraction of cells expressing the genes.
    key_added
        The key in `adata.uns` information is saved to.
    **kwds
        Are passed to test methods. Currently this affects only parameters that
        are passed to :class:`sklearn.linear_model.LogisticRegression`.
        For instance, you can pass `penalty='l1'` to try to come up with a
        minimal set of genes that are good predictors (sparse solution meaning
        few non-zero fitted coefficients).

    Returns
    -------
    **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the z-score
        underlying the computation of a p-value for each gene for each
        group. Ordered according to scores.
    **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the log2
        fold change for each gene for each group. Ordered according to
        scores. Only provided if method is 't-test' like.
        Note: this is an approximation calculated from mean-log values.
    **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        p-values.
    **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Corrected p-values.
    **pts** : `pandas.DataFrame` (`.uns['rank_genes_groups']`)
        Fraction of cells expressing the genes for each group.
    **pts_rest** : `pandas.DataFrame` (`.uns['rank_genes_groups']`)
        Only if `reference` is set to `'rest'`.
        Fraction of cells from the union of the rest of each group
        expressing the genes.

    Notes
    -----
    There are slight inconsistencies depending on whether sparse
    or dense data are passed. See `here <https://github.com/scverse/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')
    >>> # to visualize the results
    >>> sc.pl.rank_genes_groups(adata)
    NTz2Received `use_raw=True`, but `adata.raw` is empty.zNDefault of the method has been changed to 't-test' from 't-test_overestim_var'r   Zonly_positivezranking genesr   r   r   zMethod must be one of .r   r   z!Correction method must be one of r   zSpecify a sequence of groupsr   c                 S   s   g | ]}t |qS r   r   ).0nr   r   r   
<listcomp>9  s     z%rank_genes_groups.<locals>.<listcomp>rH   zreference = z needs to be one of groupby = rank_genes_groups)rp   rq   r   rr   rs   r   paramszyIt seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.r!   z	consider z groups:zwith sizes: r:   c                 S   s   g | ]}t |qS r   r   )r   namer   r   r   r   e  s     )r[   r   rh   ri   OZfloat32r<   )r   r   r   r   r   F)r[   Zcolumn_dtypesz    finishedzadded to `.uns[zz]`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
>   r   r   r   z    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids )timedeep))r`   r]   r   warningpopinfor   r
   Zsanitize_anndata
isinstancer   r   listrX   rY   r   
categoriestolistrO   dictrG   r   r'   r   debugr   r0   rW   r   rh   rV   r3   r4   r   ra   ri   rj   r   Z	swaplevelZlevels
to_records)rS   rp   rr   ro   rq   r5   r   rh   r   r   r   r   r   rs   r   startZavail_methodsZ
avail_corrrV   ZcatsZtest_objr   Zgroups_namesZdtypescolr   r   r   r     s    k





	        
 
r   c                 C   s2   t | r| jdd}ntj| dd}|| jd  S ru   )r   rv   r   r0   r   )r'   Z	n_nonzeror   r   r   
_calc_frac  s    r   rank_genes_groups_filtered      ?r!         ?)rS   r   c	                    s  |dkrd}|dkr& j | d d }|dkr@ j | d d } j | d d |ko j | d d dko j | d d |k}	|	od j | k}
|	od	 j | k}t j | d
 }tjt|j|j|jd}tjt|j|j|jd}|
rt j | d }nRtjt|j|j|jd}d  krZ j d d dk	rZ fdd}ntj	}t
d| d| d|  |jD ]>}|| j}|
r|s|r jdd|f jn dd|f j} j| |k}|| }||  }|rB j | d | j| j|jdd|f<  j | d	 | j| j|jdd|f< n,t||jdd|f< t||jdd|f< |
st|d}t|d}t||d ||d  |jdd|f< q|r| }|||k||k @ ||k@  } j |   j |< |jdd j | d
< dS )a      Filters out genes based on log fold change and fraction of genes expressing the
    gene within and outside the `groupby` categories.

    See :func:`~scanpy.tl.rank_genes_groups`.

    Results are stored in `adata.uns[key_added]`
    (default: 'rank_genes_groups_filtered').

    To preserve the original structure of adata.uns['rank_genes_groups'],
    filtered genes are set to `NaN`.

    Parameters
    ----------
    adata
    key
    groupby
    use_raw
    key_added
    min_in_group_fraction
    min_fold_change
    max_out_group_fraction
    compare_abs
        If `True`, compare absolute values of log fold change with `min_fold_change`.

    Returns
    -------
    Same output as :func:`scanpy.tl.rank_genes_groups` but with filtered genes names set to
    `nan`

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')
    >>> sc.tl.filter_rank_genes_groups(adata, min_fold_change=3)
    >>> # visualize results
    >>> sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered')
    >>> # visualize results using dotplot
    >>> sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered')
    Nr   r   rp   rr   rq   rH   r   ri   r   )r   r[   rI   rJ   c                    s   t | t  jd d  S rK   rL   rP   rR   r   r   r$     r%   z*filter_rank_genes_groups.<locals>.<lambda>z.Filtering genes using: min_in_group_fraction: z min_fold_change: z, max_out_group_fraction: rh   r   r   F)r[   )rO   r3   r4   r   rw   r   r   r[   rT   rM   r   r   r   r`   r'   rY   rZ   r   ZravelZmeanr   r   r   r   )rS   keyrp   rr   r   Zmin_in_group_fractionZmin_fold_changeZmax_out_group_fractionZcompare_absZsame_paramsZuse_logfoldsZuse_fractionZ
gene_namesZfraction_in_cluster_matrixZfraction_out_cluster_matrixZfold_change_matrixrU   Zclusterra   Zsub_XZin_groupZX_inZX_outZmean_in_clusterZmean_out_clusterr   rR   r   filter_rank_genes_groups  s    4


"
,


r   )NN)Nr   rH   NFFNFNr   FN)NNNr   r   r!   r   F)'__doc__mathr   typingr   r   r   numpyr   Zpandasr3   Zanndatar   Zscipy.sparser   r   r   r
   r   r   Zpreprocessing._simpler   Z_compatr   getr   r   _MethodZ_CorrMethodr    r8   rF   rG   r   boolr   r   r   r   r   r   r   r   <module>   s~   

"  a             d
        	