U
    md;                     @   sz   d dl mZ d dlmZ d dlmZ d dlZd dlZd dl	Z
dd Zdd Zd	d
dgdddfejeeeeedddZdS )    )norm)check_nonnegative_integers)productNc           2      C   s  dd }d}t | jd df}t | jd }| jd }|dk	rJ|| nd}|| }t | d } t j| dd	}	t j| dd	}
t |
ddd
f }t |
ddd| f }t |t 	| }}t |t 	| }}i }i }t 
|D ]}| dd|f }t |	ddd|f |kd }t |	ddd
f |k}|| }|| }||||}||||}|||< |||< qi }d}tt 
|t 
|D ]\}}|	ddd
f |k}|	dddf |k} || @ }!t|!dkrqt |!d }"dt|t|g}#||t |!d < |#||< |d7 }|| }$|| }%| |! }&t tj|&dd|f f|%dd |%d |%d
 d| }'|| }(t tj|&dd|f |(d |(d
 d| })t tj|&dd|f |$d |$d
 d| }*t tj|&dd|f |$d |$d
 d| }+t j|*|+gdd	},t j|*|'gdd	}-t j|)|'gdd	}.|,|-|.g}/t|/D ]\}0}1|1||"|0f< qZq|||fS )a      Calculate log likelihoods for each hypothesis, negative, singlet, doublet

    Parameters
    ----------
    data : np.ndarray
        cells by hashing counts matrix
    number_of_noise_barcodes : int,
        number of barcodes to used to calculated noise distribution

    Returns
    -------
    log_likelihoods_for_each_hypothesis : np.ndarray
        a 2d np.array log likelihood of each hypothesis
    all_indices
    counter_to_barcode_combo
    c                 S   s   d|d  }t | }t | dkr.dt|  n|}|||  }t | dkrht| | | ||  | n|}|d||d   d fS )a          Update parameters of your gaussian https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf

        Parameters
        ----------
        data : np.array
            1-d array of counts
        mu_o : float,
            global mean for hashing count distribution
        std_o : float,
            global std for hashing count distribution

        Returns
        -------
        float
            mean of gaussian
        float
            std of gaussian
              r   g      ?)lennpvarmean)dataZmu_oZstd_oZlam_onZlamZlam_nZmu_n r   U/home/sam/Atlas/atlas_env/lib/python3.8/site-packages/scanpy/external/pp/_hashsolo.pygaussian_updates.   s    ,z4_calculate_log_likelihoods.<locals>.gaussian_updatesgV瞯<r      r   Nr   Zaxis_)locscale)r   zerosshapeemptylogZargsortsortZravelr
   ZstdZarangewherer   sumjoinstrr   Zpdf	enumerate)2r   number_of_noise_barcodesr   Zeps#log_likelihoods_for_each_hypothesisZall_indicesZnum_of_barcodesZnumber_of_non_noise_barcodesZnum_of_noise_barcodesZdata_argZ	data_sortZglobal_signal_countsZglobal_noise_countsZglobal_mu_signal_oZglobal_sigma_signal_oZglobal_mu_noise_oZglobal_sigma_noise_oZnoise_params_dictZsignal_params_dictxZsample_barcodesZsample_barcodes_noise_idxZsample_barcodes_signal_idxZnoise_countsZsignal_countsZnoise_paramZsignal_paramZcounter_to_barcode_combocounterZnoise_sample_idxZsignal_sample_idxZsignal_subsetZnoise_subsetZsubsetindicesZbarcode_comboZnoise_paramsZsignal_paramsZdata_subsetZlog_signal_signal_probsZsignal_noise_paramsZlog_noise_signal_probsZlog_noise_noise_probsZlog_signal_noise_probsZprobs_of_negativeZprobs_of_singletZprobs_of_doubletZlog_probs_listZprob_idxZlog_probr   r   r   _calculate_log_likelihoods   s    

    
 
			   
r&   c                 C   sj   t |}t| |\}}}t || t jt t ||dddddf  }t j|dd}|||dS )a#      Calculate bayes rule from log likelihoods

    Parameters
    ----------
    data : np.array
        Anndata object filled only with hashing counts
    priors : list,
        a list of your prior for each hypothesis
        first element is your prior for the negative hypothesis
        second element is your prior for the singlet hypothesis
        third element is your prior for the doublet hypothesis
        We use [0.01, 0.8, 0.19] by default because we assume the barcodes
        in your cell hashing matrix are those cells which have passed QC
        in the transcriptome space, e.g. UMI counts, pct mito reads, etc.
    number_of_noise_barcodes : int
        number of barcodes to used to calculated noise distribution

    Returns
    -------
    bayes_dict_results : dict
        "most_likely_hypothesis" key is a 1d np.array of the most likely hypothesis
        "probs_hypotheses" key is a 2d np.array probability of each hypothesis
        "log_likelihoods_for_each_hypothesis" key is a 2d np.array log likelihood of each hypothesis
    r   r   N)most_likely_hypothesisprobs_hypothesesr"   )r   arrayr&   expr   multiplyargmax)r   priorsr!   r"   r   r(   r'   r   r   r   _calculate_bayes_rule   s*    
 

r.   g{Gz?g?gRQ?T)adatacell_hashing_columnsr-   pre_existing_clustersr!   inplacec                 C   s  t d | j| j}t|s$td|dk	r@|t|kr@td| jd }tjt	
|dfddd	d
ddg| jd}|dk	r4|}	t	| j|	 }
|
D ]}| j|	 |k}t|| ||}|d |j|df< ||j|d	f< |d dddf |j|d
f< |d dddf |j|df< |d dddf |j|df< qnt|||}|d |jdddf< d|jddd	f< |d dddf |jddd
f< |d dddf |jdddf< |d dddf |jdddf< |j| jdf | jd< |j| jd	f | jd	< |j| jd
f | jd
< |j| jdf | jd< |j| jdf | jd< d| jd< d| jj| jd dkdf< d| jj| jd dkdf< | jd dk}t	j| jj||f jdd}| j| j| | jj|df< |s| S dS )a7      Probabilistic demultiplexing of cell hashing data using HashSolo [Bernstein20]_.

    .. note::
        More information and bug reports `here <https://github.com/calico/solo>`__.

    Parameters
    ----------
    adata
        Anndata object with cell hashes in .obs columns
    cell_hashing_columns
        list specifying which columns in adata.obs
        are cell hashing counts
    priors
        a list of your prior for each hypothesis
        first element is your prior for the negative hypothesis
        second element is your prior for the singlet hypothesis
        third element is your prior for the doublet hypothesis
        We use [0.01, 0.8, 0.19] by default because we assume the barcodes
        in your cell hashing matrix are those cells which have passed QC
        in the transcriptome space, e.g. UMI counts, pct mito reads, etc.
    pre_existing_clusters
        column in adata.obs for how to break up demultiplexing
        for example leiden or cell types, not batches though
    number_of_noise_barcodes
        Use this if you wish change the number of barcodes used to create the
        noise distribution. The default is number of cell hashes - 2.
    inplace
        To do operation in place

    Returns
    -------
    if inplace is False returns AnnData with demultiplexing results
    in .obs attribute otherwise does is in place

    Examples
    -------
    >>> import anndata
    >>> import scanpy.external as sce
    >>> data = anndata.read("data.h5ad")
    >>> sce.pp.hashsolo(data, ['Hash1', 'Hash2', 'Hash3'])
    >>> data.obs.head()
    z\Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2z(Cell hashing counts must be non-negativeNznumber_of_noise_barcodes must be at least one less         than the number of samples you have as determined by the number of         cell_hashing_columns you've given as input  r      r'   r(   cluster_featureZnegative_hypothesis_probabilityZsinglet_hypothesis_probabilityZdoublet_hypothesis_probability)columnsindexr   r   ZClassificationZDoubletZNegativer   )printZobsvaluesr   
ValueErrorr   r   pdZ	DataFramer   r   Z	obs_namesuniquer.   r   r,   r5   )r/   r0   r-   r1   r!   r2   r   Znum_of_cellsresultsZcluster_featuresZunique_cluster_featuresr4   Zcluster_feature_bool_vectorZposterior_dictZ	all_singsZsinglet_sample_indexr   r   r   hashsolo  s    3







 r=   )Zscipy.statsr   Zscanpy._utilsr   	itertoolsr   Zanndatanumpyr   Zpandasr:   r&   r.   ZAnnDatalistr   intboolr=   r   r   r   r   <module>   s(    ;1