U
    md]-                     @   s  d dl mZmZmZmZ d dlZd dlZd dlm	Z
 d dlmZ d dlmZ ddlmZ ddlmZ ejeee ejd	d
dZejejeeejejejejf dddZdeeeee  eeeejdf dddZdejejejeeeeeeejejf d	ddZdd Zdd ZdS )    )
CollectionTupleOptionalUnionN)linalg)issparse)AnnData   )logging)sanitize_anndata)model	batch_keybatch_levelsreturnc           	         s0  ddl }|jd|| dd}| j|gdd} | djj td	|j	d  d
  fdd| jjD }|rd
dd |D }|jd|| | dd}tj||fdd}td	t| d tdd
| d   dk	r,td	t  d tdd
  d   D ]}| | ||< q|S )a(      Computes a simple design matrix.

    Parameters
    --------
    model
        Contains the batch annotation
    batch_key
        Name of the batch column
    batch_levels
        Levels of the batch annotation

    Returns
    --------
    The design matrix for the regression problem
    r   Nz%~ 0 + C(Q('{}'), levels=batch_levels)Z	dataframe)return_type   ZaxisnumberFound z	 batches
c                    s   g | ]}| kr|qS  r   ).0cZnumerical_covariatesr   U/home/sam/Atlas/atlas_env/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py
<listcomp>+   s      z"_design_matrix.<locals>.<listcomp>z + c                 s   s   | ]}d  |V  qdS )zQ('{}')N)format)r   xr   r   r   	<genexpr>.   s     z!_design_matrix.<locals>.<genexpr>z~ 0 + {}z categorical variables:	z, 
z numerical variables:)patsyZdmatrixr   ZdropZselect_dtypescolumnsvalueslogginfoshapejoinpdconcatlen)	r   r   r   r    designZ
other_colsZcol_reprZfactor_matrixZnCr   r   r   _design_matrix   s4      
r+   )r   datar   r   c                 C   s  |  |j }t| \}}t|}tdd |D }tt|}t	| ||}	t
t
tt
|	j|	|	j|j}
t
|| j|
d|ddf }|t
|	|
j d }t
|tt|dft| }t|dkdkrtdt|dk d t
|jt|dftdt|f}t|	 }d|ddd|f< |t
||
j7 }t|dkd|| t
t|tdt|f }tj||j|jd	}||	||fS )
a      Standardizes the data per gene.

    The aim here is to make mean and variance be comparable across batches.

    Parameters
    --------
    model
        Contains the batch annotation
    data
        Contains the Data
    batch_key
        Name of the batch column in the model matrix

    Returns
    --------
    s_data
        Standardized Data
    design
        Batch assignment as one-hot encodings
    var_pooled
        Pooled variance per gene
    stand_mean
        Gene-wise mean
    c                 S   s   g | ]}t |qS r   r)   r   vr   r   r   r   b   s     z%_standardize_data.<locals>.<listcomp>Nr	   r   r   r   z genes with zero variance.)indexr!   )groupbygroupsitemszipr)   nparrayfloatsumr+   dotlainvTonesintprintreshapecopywheresqrtr'   	DataFramer0   r!   )r   r,   r   Zbatch_itemsr   
batch_infon_batch	n_batchesn_arrayr*   ZB_hatZ
grand_mean
var_pooled
stand_meantmps_datar   r   r   _standardize_dataA   s4    ("" &rM   batchT)adatakey
covariatesinplacer   c           $   
   C   sn  ||   krtd||dk	rt||   }t| r`t||   }td|||krptdt|tt	|krtdt
| jr| jjj}n| jj}tj|| j| jd}t|  | j|g|r|ng   }||j }	t|	}
tdd |	D }tt|}td	 t|||\}}}}td
 ||jd|
  }t|j| |j |j j}g }t|	D ]*\}}| |j!dd|f j"dd qz|j#dd}|j"dd}t$t%t&|}t$t%t'|}td g g  }}t|	D ]`\}}t(|j!dd|f j|| || j|| || || || \}}| | | | qtd |}t|}t|}t|	D ]\}}t)||ddf } | *t| df} t+| t,d|| f}!t|j!dd|f t+|j!| |j }"|"|! |j!dd|f< qt)|*t|df}#|t+|#t,dt-|f | }|r`|j. | _n
|j. S dS )a      ComBat function for batch effect correction [Johnson07]_ [Leek12]_
    [Pedersen12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes.
    This uses the implementation `combat.py`_ [Pedersen12]_.

    .. _combat.py: https://github.com/brentp/combat.py

    Parameters
    ----------
    adata
        Annotated data matrix
    key
        Key to a categorical annotation from :attr:`~anndata.AnnData.obs`
        that will be used for batch effect removal.
    covariates
        Additional covariates besides the batch variable such as adjustment
        variables or biological condition. This parameter refers to the design
        matrix `X` in Equation 2.1 in [Johnson07]_ and to the `mod` argument in
        the original combat function in the sva R package.
        Note that not including covariates may introduce bias or lead to the
        removal of biological signal in unbalanced designs.
    inplace
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of `inplace`, either returns the corrected matrix or
    or modifies `adata.X`.
    z(Could not find the key {!r} in adata.obsNz1Could not find the covariate(s) {!r} in adata.obsz'Batch key and covariates cannot overlapzCovariates must be unique)r,   r0   r!   c                 S   s   g | ]}t |qS r   r-   r.   r   r   r   r      s     zcombat.<locals>.<listcomp>z!Standardizing Data across genes.
z%Fitting L/S model and finding priors
r   r   zFinding parametric adjustments
zAdjusting data
)/Zobs_keys
ValueErrorr   r5   isinanyr6   tolistr)   setr   XAr<   r'   rD   Z	var_namesZ	obs_namesr   Zobsr1   indicesr"   r7   r8   r#   r$   rM   r!   r:   r;   	enumerateappendZilocvarmeanlistmap_aprior_bprior_it_solrC   r@   r9   r=   r>   Z	transpose)$rO   rP   rQ   rR   Z	cov_existZmissing_covrX   r,   r   rE   rF   rG   rH   rL   r*   rI   rJ   Zbatch_designZ	gamma_hat	delta_hatiZ
batch_idxsZ	gamma_bart2Za_priorZb_priorZ
gamma_starZ
delta_stargammadeltaZ	bayesdatajZdsqdenomZnumerZvpsqr   r   r   combat   s    (


$






"rk   -C6?)	rL   g_hatd_hatg_barrf   abconvr   c                 C   s   dt |  jdd}| }	| }
d}d}|	}|
}||kr|| | |
|  || |
  }| ||jd dft d| jd f  }|d }|jdd}d| | |d | d  }tt||	 |	  t||
 |
  }|}	|}
|d }q6||fS )a=      Iteratively compute the conditional posterior means for gamma and delta.

    gamma is an estimator for the additive batch effect, deltat is an estimator
    for the multiplicative batch effect. We use an EB framework to estimate these
    two. Analytical expressions exist for both parameters, which however depend on each other.
    We therefore iteratively evalutate these two expressions until convergence is reached.

    Parameters
    --------
    s_data
        Contains the standardized Data
    g_hat
        Initial guess for gamma
    d_hat
        Initial guess for delta
    g_bar, t_2, a, b
        Hyperparameters
    conv: float, optional (default: `0.0001`)
        convergence criterium

    Returns:
    --------
    gamma
        estimated value for gamma
    delta
        estimated value for delta
    r   r   r   r	   g      ?g       @g      ?)	r5   isnanr8   rA   r@   r%   r=   maxabs)rL   rm   rn   ro   rf   rp   rq   rr   nZg_oldZd_oldZchangecountZg_newZd_newZsum2r   r   r   rc     s.    '  
rc   c                 C   s$   |   }|  }d| |d  | S )Nr	   r^   r]   rd   ms2r   r   r   ra   ]  s    ra   c                 C   s$   |   }|  }|| |d  | S )N   rx   ry   r   r   r   rb   c  s    rb   )rN   NT)rl   )typingr   r   r   r   Zpandasr'   numpyr5   r   r:   Zscipy.sparser   Zanndatar    r
   r#   _utilsr   rD   strr+   ZndarrayrM   boolrk   r7   rc   ra   rb   r   r   r   r   <module>   sT     5  D   
  F