U
    |e:                 
   @   s  d dl mZ d dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ej78ej78ej7j9ej7j9ej7j:fZ;e<ej=j>d	 Z?e<ej=j@d	 ZAeBejCjDZEe d	d	ZFd
d ZGejHddddd ZIejHejCej=ej=ddddd ZJejHdejCej=ej=ddddd ZKejHdddd ZLejHddddd ZMejHdddd ZNeH d ejOd!d"dfd#d$ZPeH d ejOd!d"dfd%d&ZQeH d ejOd!d"eFddddf	d'd(ZRejHdd)d:d+d,ZSejHdd)d;d-d.ZTejHdd)d<d0d1ZUd=d2d3ZVd4d5 ZWG d6d7 d7ZXG d8d9 d9eeZYdS )>    )warnN)check_random_statecheck_array)	normalize)BaseEstimatorTransformerMixin)
csr_matrix
coo_matrixisspmatrix_csrvstackissparse)tau_rand_inttau_rand	make_heapdeheap_sortnew_build_candidatestssimple_heap_pushchecked_flagged_heap_pushhas_been_visitedmark_visitedapply_graph_updates_high_memoryapply_graph_updates_low_memory!initalize_heap_from_graph_indices/initalize_heap_from_graph_indices_and_distances(sparse_initalize_heap_from_graph_indices)	make_forestrptree_leaf_arrayconvert_tree_formatFlatTreedenumbaify_treerenumbaify_treeselect_sidesparse_select_sidescore_linked_tree   c                 C   s   t | dd }|d k	o|d S )NflagsC_CONTIGUOUS)getattr)
array_liker&    r*   U/var/www/website-v5/atlas_env/lib/python3.8/site-packages/pynndescent/pynndescent_.pyis_c_contiguousD   s    r,   TF)parallelcachec                 C   s   dd t | jd D }t| jd D ]}t | jd D ]}| ||f }|dk rV q(t |d | jd D ]Z}| ||f }	|	dk r q:||| ||	 }
|
|| k s|
||	 k rj|| ||	|
f qjq:q(|S )Nc                 S   s   g | ]}d d t jfgqS npinf.0ir*   r*   r+   
<listcomp>L   s     z)generate_leaf_updates.<locals>.<listcomp>r   r%   )rangeshapenumbaprangeappend)
leaf_blockdist_thresholdsdatadistupdatesnr6   pjqdr*   r*   r+   generate_leaf_updatesI   s    rG   )rF   rC   rE   )localsr.   c                 C   s   |j d }d}|| }t|d D ]}|| }t||d | }	|||	 }
|d d d df }t|
|| |}tt|D ]}tt|| D ]}|| | \}}}|dks|dkrqt|d | |d | |d | ||td t|d | |d | |d | ||td qqzq"d S )Nr   i   r%   r0      )r9   r8   minrG   lenr   r2   uint8)r?   r@   current_graph
leaf_arrayn_leaves
block_sizen_blocksr6   block_start	block_endr=   r>   rA   rD   krC   rE   rF   r*   r*   r+   init_rp_tree`   s<    






rU   )rF   idxr6   )fastmathrH   r.   c           	      C   s   t |jd D ]}|d |df dk rt | t|d | dk D ]^}tt||jd  }||| || }t|d | |d | |d | ||td qDqd S )Nr           r%   rI   )r8   r9   r2   sumabsr   r   rL   )	n_neighborsr?   heapr@   	rng_stater6   rD   rV   rF   r*   r*   r+   init_random   s    "
 
 
   r^   )r.   c              	   C   sn   t |jd D ]Z}t |jd D ]F}|||f }|||f }t| d | | d | | d | ||d q qd S )Nr   r%   rI   )r8   r9   r   )r\   indices	distancesrC   rT   rE   rF   r*   r*   r+   init_from_neighbor_graph   s    .ra   c                 C   s2  | j d }dd t|D }| j d }t|D ]}t|D ]}	t| ||	f }
|
dk rZq<t|	|D ]\}t| ||f }|dk rqd|||
 || }|||
 ks||| krd|| |
||f qdt|D ]^}t|||f }|dk rq|||
 || }|||
 ks||| kr|| |
||f qq<q0|S )Nr   c                 S   s   g | ]}d d t jfgqS r/   r1   r4   r*   r*   r+   r7      s     z*generate_graph_updates.<locals>.<listcomp>r%   )r9   r8   r:   r;   intr<   )new_candidate_blockold_candidate_blockr>   r?   r@   rP   rA   max_candidatesr6   rD   rC   rT   rE   rF   r*   r*   r+   generate_graph_updates   s.    

rf   c                 C   s   d}|j d }	t|d D ]j}
|
| }t|	|
d | }||| }||| }|d d d df }t|||| |}|t|||7 }q|S Nr   r%   )r9   r8   rJ   rf   r   )r?   r@   rM   new_candidate_neighborsold_candidate_neighborsrQ   rP   	n_threadsc
n_verticesr6   rR   rS   rc   rd   r>   rA   r*   r*   r+   process_candidates   s"    
    rm   2   
   MbP?c	              
   C   s   |j d }	d}
|	|
 }t }t|D ]r}|r@td|d d| t| |||\}}t||| ||||
|}||| |j d  kr&|rtd|d d  d S q&d S )Nr    @  	r%    / (	Stopping threshold met -- exiting after
iterations)r9   r:   get_num_threadsr8   printr   rm   )rM   r?   r[   r]   re   r@   n_itersdeltaverboserl   rP   rQ   rj   rB   rh   ri   rk   r*   r*   r+   'nn_descent_internal_low_memory_parallel   s6    
   r{   c	                    s&  |j d }	d}
|	|
 }t } fddt d j d D }t|D ]}|r`td|d d| t |||\}}d}t|d D ]j}||
 }t|	|d |
 }||| }||| } d d d df }t|||||}|t ||7 }q||| |j d  krF|rtd|d d	  d S qFd S )
Nr   rq   c                    s$   g | ]}t  d  | tjqS r   )setastyper2   int64r4   rM   r*   r+   r7   $  s   z<nn_descent_internal_high_memory_parallel.<locals>.<listcomp>rr   r%   rs   rt   ru   )	r9   r:   rv   r8   rw   r   rJ   rf   r   )rM   r?   r[   r]   re   r@   rx   ry   rz   rl   rP   rQ   rj   Zin_graphrB   rh   ri   rk   r6   rR   rS   rc   rd   r>   rA   r*   r   r+   (nn_descent_internal_high_memory_parallel  sF    

       r   c                 C   s   |d j d dkrFt| j d |}|r4t| |||	 t|| ||| n8|d j d | j d krv|d j d |krv|}ntd|
rt|| |||||||d	 nt|| |||||||d	 t|d |d S )Nr   r%   z Invalid initial graph specified!)re   r@   rx   ry   rz   )r9   r   rU   r^   
ValueErrorr{   r   r   )r?   r[   r]   re   r@   rx   ry   
init_graphrp_tree_initrN   
low_memoryrz   rM   r*   r*   r+   
nn_descentF  sF    r   r-         ?c                 C   sT  t | jd D ]8}| |df g}||df g}td| jd D ]}	| ||	f dk rZ qd}
tt|D ]V}|| }||| ||	f  || }|| tkrj||||	f k rjt||k rjd}
 qqj|
rB|| ||	f  ||||	f  qBt| jd D ]N}	|	t|k r.||	 | ||	f< ||	 |||	f< qd| ||	f< tj	|||	f< qq| |fS )Nr   r%   TFr0   )
r:   r;   r9   r8   rK   FLOAT32_EPSr   r<   r2   r3   )r_   r`   r?   r@   r]   prune_probabilityr6   new_indicesZnew_distancesrD   flagrT   rk   rF   r*   r*   r+   	diversify  s0    r   c                 C   s:  | j d d }t|D ]}|| | | |d   }	|| | | |d   }
t|
}tj|j d tjd}td|j d D ]x}|| }t|D ]b}|| }|| dkr|||	|  ||	|  }|
| tkr||
| k rt	||k rd||<  q~qq~t|j d D ],}|| }|| dkrd|| | | < qqd S )Nr   r%   dtype)
r9   r:   r;   r2   argsortonesint8r8   r   r   )Zgraph_indptrZgraph_indicesZ
graph_dataZsource_datar@   r]   r   n_nodesr6   Zcurrent_indicesZcurrent_dataorderZretainedrV   rD   rT   lrF   r*   r*   r+   diversify_csr  s0    


 
r      c                 C   s   t | jd d D ]h}|| | | |d   }|jd |krt|| }t| | | |d  D ]}|| |krbd||< qbqd S )Nr   r%   rX   )r:   r;   r9   r2   sortr8   )indptrr?   
max_degreer6   row_dataZ	cut_valuerD   r*   r*   r+   degree_prune_internal  s    r   c                 C   s   t | j| j| |   | S )av  Prune the k-neighbors graph back so that nodes have a maximum
    degree of ``max_degree``.

    Parameters
    ----------
    graph: sparse matrix
        The adjacency matrix of the graph

    max_degree: int (optional, default 20)
        The maximum degree of any node in the pruned graph

    Returns
    -------
    result: sparse matrix
        The pruned graph.
    )r   r   r?   eliminate_zeros)graphr   r*   r*   r+   degree_prune  s    r   c                 C   s.   t | j| j| j| j| jtjdd| j}|S )z;Given a new data indexing, resort the tree indices to matchCr   )	r   hyperplanesoffsetschildrenr_   r~   r2   int32	leaf_size)tree
tree_orderZnew_treer*   r*   r+   resort_tree_indices  s    r   c                   @   sr   e Zd ZdZd#ddZdd Zdd Zdd Zdd Zdd Z	e
dd Zdd Zdd Zd$dd Zd%d!d"ZdS )&	NNDescenta  NNDescent for fast approximate nearest neighbor queries. NNDescent is
    very flexible and supports a wide variety of distances, including
    non-metric distances. NNDescent also scales well against high dimensional
    graph_data in many cases. This implementation provides a straightfoward
    interface, with access to some tuning parameters.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The training graph_data set to find nearest neighbors in.

    metric: string or callable (optional, default='euclidean')
        The metric to use for computing nearest neighbors. If a callable is
        used it must be a numba njit compiled function. Supported metrics
        include:
            * euclidean
            * manhattan
            * chebyshev
            * minkowski
            * canberra
            * braycurtis
            * mahalanobis
            * wminkowski
            * seuclidean
            * cosine
            * correlation
            * haversine
            * hamming
            * jaccard
            * dice
            * russelrao
            * kulsinski
            * rogerstanimoto
            * sokalmichener
            * sokalsneath
            * yule
            * hellinger
            * wasserstein-1d
        Metrics that take arguments (such as minkowski, mahalanobis etc.)
        can have arguments passed via the metric_kwds dictionary. At this
        time care must be taken and dictionary elements must be ordered
        appropriately; this will hopefully be fixed in the future.

    metric_kwds: dict (optional, default {})
        Arguments to pass on to the metric, such as the ``p`` value for
        Minkowski distance.

    n_neighbors: int (optional, default=30)
        The number of neighbors to use in k-neighbor graph graph_data structure
        used for fast approximate nearest neighbor search. Larger values
        will result in more accurate search results at the cost of
        computation time.

    n_trees: int (optional, default=None)
        This implementation uses random projection forests for initializing the index
        build process. This parameter controls the number of trees in that forest. A
        larger number will result in more accurate neighbor computation at the cost
        of performance. The default of None means a value will be chosen based on the
        size of the graph_data.

    leaf_size: int (optional, default=None)
        The maximum number of points in a leaf for the random projection trees.
        The default of None means a value will be chosen based on n_neighbors.

    pruning_degree_multiplier: float (optional, default=1.5)
        How aggressively to prune the graph. Since the search graph is undirected
        (and thus includes nearest neighbors and reverse nearest neighbors) vertices
        can have very high degree -- the graph will be pruned such that no
        vertex has degree greater than
        ``pruning_degree_multiplier * n_neighbors``.

    diversify_prob: float (optional, default=1.0)
        The search graph get "diversified" by removing potentially unnecessary
        edges. This controls the volume of edges removed. A value of 0.0 ensures
        that no edges get removed, and larger values result in significantly more
        aggressive edge removal. A value of 1.0 will prune all edges that it can.

    n_search_trees: int (optional, default=1)
        The number of random projection trees to use in initializing searching or
        querying.

        .. deprecated:: 0.5.5

    tree_init: bool (optional, default=True)
        Whether to use random projection trees for initialization.

    init_graph: np.ndarray (optional, default=None)
        2D array of indices of candidate neighbours of the shape
        (data.shape[0], n_neighbours). If the j-th neighbour of the i-th
        instances is unknown, use init_graph[i, j] = -1

    init_dist: np.ndarray (optional, default=None)
        2D array with the same shape as init_graph,
        such that metric(data[i], data[init_graph[i, j]]) equals
        init_dist[i, j]

    random_state: int, RandomState instance or None, optional (default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    algorithm: string (optional, default='standard')
        This implementation provides an alternative algorithm for
        construction of the k-neighbors graph used as a search index. The
        alternative algorithm can be fast for large ``n_neighbors`` values.
        The``'alternative'`` algorithm has been deprecated and is no longer
        available.

    low_memory: boolean (optional, default=True)
        Whether to use a lower memory, but more computationally expensive
        approach to index construction.

    max_candidates: int (optional, default=None)
        Internally each "self-join" keeps a maximum number of candidates (
        nearest neighbors and reverse nearest neighbors) to be considered.
        This value controls this aspect of the algorithm. Larger values will
        provide more accurate search results later, but potentially at
        non-negligible computation cost in building the index. Don't tweak
        this value unless you know what you're doing.

    n_iters: int (optional, default=None)
        The maximum number of NN-descent iterations to perform. The
        NN-descent algorithm can abort early if limited progress is being
        made, so this only controls the worst case. Don't tweak
        this value unless you know what you're doing. The default of None means
        a value will be chosen based on the size of the graph_data.

    delta: float (optional, default=0.001)
        Controls the early abort due to limited progress. Larger values
        will result in earlier aborts, providing less accurate indexes,
        and less accurate searching. Don't tweak this value unless you know
        what you're doing.

    n_jobs: int or None, optional (default=None)
        The number of parallel jobs to run for neighbors index construction.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

    compressed: bool (optional, default=False)
        Whether to prune out data not needed for searching the index. This will
        result in a significantly smaller index, particularly useful for saving,
        but will remove information that might otherwise be useful.

    parallel_batch_queries: bool (optional, default=False)
        Whether to use parallelism of batched queries. This can be useful for large
        batches of queries on multicore machines, but results in performance degradation
        for single queries, so is poor for streaming use.

    verbose: bool (optional, default=False)
        Whether to print status graph_data during the computation.
    	euclideanN         ?r   r%   Trp   Fc                    s  |d kr,dt t|jd d  }td|}|d krRtdt tt|jd }|| _tdt t| jd | _|| _	|| _
|| _|| _|| _|| _|	| _|| _|| _|| _|| _|jd | _|| _|| _|| _|| _t|dd tjkrt|st|rd}nd	}t|tjd
dd}|| _|
r>|dks>|d k	rFd	| _nd| _|pTi }t |! | _"|| _#t$| j#}d | _%t&|r| nL|t'j(kr|t'j)krt'j)| d  t'j)| d | _%n
t'j(|  nt*dt+| j"dkr
| j"t,-  fdd}|| _.n | _.|dkr"d| _/nd	| _/|dkrFt0|d|d}|| _|1t2t3d4tj5| _6|1t2t3d4tj5| _7t8dD ]}t9| j7}q~| jr|rt:t; dt<|d t=||||| j6|| j| j/| _>t?| j>}nd | _>t@dgg}| jd kr
td| j	}n| j}t,A | _B| jdkr>| jd k	r>t,C| j tD| jrd| _E| jjFsd| jG  |tHjIkr|tHjJkrtHjJ| d  tHjJ| d | _%n
tHjI|  nt&|r| nt*dK||tHjLkr| jjd |d< t |! | _"t+| j"dkr$| j"t,-  fdd}|| _.n | _.|d kr:tM}nR|jd | jjd krZt*dtN|jd | j	}tO||| jjP| jjQ| jjR| j.}|rt:t; dt<|d  tSjT| jjQ| jjP| jjR| j	| j6|| j.| j| jd||| j|d!| _Und	| _E|d krtM}nr|jd | jjd krt*dtN|jd | j	}|d krLtV|||| j.}n$|j|jkrdt*d"ntW|||}|rt:t; d#t<|d  tT| j| j	| j6|| j.| j| j| jd|||d$| _UtX| jUd dk rtYd% t,C| jB d S )&N   r   g      ?    r%      r   TFcsrr   r   accept_sparser   r@   
correctionz3Metric is neither callable, nor a recognised stringc                    s    | |f S Nr*   )xy_distance_func	dist_argsr*   r+   _partial_dist_func  s    z.NNDescent.__init__.<locals>._partial_dist_func)cosinedotcorrelationdicejaccard	hellingerhammingr   l2)normcopyro   zBuilding RP forest withZtreesr0   <   z'Metric {} not supported for sparse data
n_featuresc                    s    | |||f S r   r*   )ind1data1ind2data2r   r*   r+   r   T  s    z,Init graph size does not match dataset size!zmetric NN descent forru   )	re   r@   rx   ry   r   rN   r   r   rz   z9The shapes of init graph and init distances do not match!zNN descent for)r   r   r   rN   rz   zFailed to correctly find n_neighbors for some samples. Results may be less than ideal. Try re-running with different parameters.)Zrb   roundr9   rJ   maxr2   log2n_treesn_trees_after_updater[   metricmetric_kwdsr   prune_degree_multiplierdiversify_probn_search_treesre   r   rx   ry   dimn_jobs
compressedparallel_batch_queriesrz   r(   float32r   r,   r   	_raw_data	tree_inittuplevaluesZ
_dist_argsrandom_stater   _distance_correctioncallable
pynnd_distnamed_distancesZfast_distance_alternativesr   rK   r:   njitr   _angular_treesr   randint	INT32_MIN	INT32_MAXr~   r   r]   search_rng_stater8   r   rw   r   strr   
_rp_forestr   arrayrv   _original_num_threadsset_num_threadsr
   
_is_sparsehas_sorted_indicessort_indicessparsesparse_named_distancesZ!sparse_fast_distance_alternativesformatsparse_need_n_featuresEMPTY_GRAPHr   r   r   r_   r?   
sparse_nndr   _neighbor_graphr   r   anyr   )selfr?   r   r   r[   r   r   pruning_degree_multiplierr   r   r   r   Z	init_distr   r   re   rx   ry   r   r   r   rz   Zcopy_on_normalizecurrent_random_stater   r6   _rN   effective_max_candidatesZ_init_graphr*   r   r+   __init__  s   




	
  





	


     zNNDescent.__init__c                 C   sj   t | ds|   t | ds4| jr,|   n|   | j }t | drN|d= tdd | jD |d< |S )N_search_graph_search_functionr   c                 S   s   g | ]}t |qS r*   )r    r5   r   r*   r*   r+   r7     s     z*NNDescent.__getstate__.<locals>.<listcomp>_search_forest)	hasattr_init_search_graphr   _init_sparse_search_function_init_search_function__dict__r   r   r  r   resultr*   r*   r+   __getstate__  s    




zNNDescent.__getstate__c                 C   s:   || _ tdd |d D | _| jr.|   n|   d S )Nc                 S   s   g | ]}t |qS r*   )r!   r  r*   r*   r+   r7     s     z*NNDescent.__setstate__.<locals>.<listcomp>r  )r  r   r  r   r  r  )r   rF   r*   r*   r+   __setstate__  s    
zNNDescent.__setstate__c                    s  t   _ jdkr* jd k	r*t  j t ds2 jd kr jrt j	}t
 j j j j j| j j} fdd|D  _ng  _n fdd jD } jrtt dt| tt dt| tt dt| t|d  j } fd	d|D } ` fd
d|D  _t jd dk} jrֈ jrt  jd  jd  jj! jj" jj# j$ j j%\}}nBt  jd &  jd &  jj! jj" jj# j$ j j%\}}nf jr
t  jd  jd  j j$ j j%\}}n2t  jd &  jd &  j j$ j j%\}}t' jj(d  jj(d ftj)d _*t+||dk< t,tj-|j(d tj.d|j(d  j*_/|0  j*_1|0  j*_# j*2  _*d j*j# j*j!dk<  j*3   jrtt d| j*j4  j*j4}	 j*5 }
 jrDt6|
j"|
j!|
j# jj" jj! jj# j$ j j%	 n"t6|
j"|
j!|
j# j j$ j j% |
3   jrtt d|	|
j4 |
2 }
|
7   j*2  _* j*7   j*8|
2  _* j*9d  j*3   j*j4}t: j*t;t< j= j  _* j*3   j*dk>tj? _* jr@tt d| j*j4 tj@ jj(d d d tj?dd _A jrxtt d  jr> jd j! _B j* jBd d f C }|d d  jBf  _* j*2  _* j*7   jr j jBd d f  _ntD j jBd d f  _t jBtEfdd jd  j D  _nt- jj(d  _B jr jrntt d t dr~ ` `t  j d S )Nr0   r  c                    s(   g | ] }t | jjd   jjd qS r   r%   r   r   r9   r  r   r*   r+   r7     s    
 
z0NNDescent._init_search_graph.<locals>.<listcomp>c                    s   g | ]}t | jd  qS r|   )r$   r   r  r  r*   r+   r7     s   zWorst tree score: {:.8f}zMean tree score: {:.8f}zBest tree score: {:.8f}c                    s   g | ]} j | qS r*   )r   )r5   rV   r  r*   r+   r7     s     c                    s(   g | ] }t | jjd   jjd qS r  r  r  r  r*   r+   r7     s    
 
r   r%   r   rX   z3Forward diversification reduced edges from {} to {}z3Reverse diversification reduced edges from {} to {}z*Degree pruning reduced edges from {} to {}   r   )r   r   z,Resorting data and graph based on tree orderc                 3   s   | ]}t | V  qd S r   )r   r  )r   r*   r+   	<genexpr>  s   z/NNDescent._init_search_graph.<locals>.<genexpr>z1Compressing index by removing unneeded attributesr   )Fr:   rv   r   r   r   r  r   r   r   r   r   r   r[   r   r   r]   r   r  rz   rw   r   r   r2   rJ   meanr   r   rY   r   r   r   r   r   r_   r   r?   r   r   r   r	   r9   r   r   r   repeataranger   rowravelcoltocsrr   nnz	transposer   r   maximumsetdiagr   rb   r   r   r~   rL   zeros_visited_vertex_ordertocscascontiguousarrayr   )r   r   	rp_forestZtree_scoresZbest_tree_indicesZ
best_treesZnnz_pre_diversifyZdiversified_rowsZdiversified_dataZpre_reverse_diversify_nnzZreverse_graphZpre_prune_nnzZrow_ordered_graphr*   )r   r   r+   r    sh   










		 
 
	 


   
zNNDescent._init_search_graphc                    sj  | j rtt d | jr| jd j	| jd j| jd j
| jd jt	j
t	jjt	jjddddt	jjt	jjddddt	jjt	jjddddgt	jjt	jjdd		fd
d| _n&t	
 dd | _tjdtjd
tjtj | j| jj| jj| j| j| jt	j
dt	jjd d d t	jjt	jjt	jjd d d t	jjd d d t	jjt	jjt	jjt	jjt	jjd d d t	jjd d d t	jjd d d t	jjd d d d df t	jjt	jjt	jjt	jjd| jd 
f
dd}|| _ t!t"dr*t	j
| jdt"j#| _$nt"| _$| jd d }|  |dd| j%| j&\}}}| $||}d S )Nz&Building and compiling search functionr   r%   r   TreadonlyFnodesiderH   c                    sX   d} |df dkrNt | | | |}|dkr@ |df }q |df }q |  S rg   )r"   )pointr]   r&  r'  tree_childrentree_hyperplanestree_offsetsr*   r+   tree_search_closure  s    
   z<NNDescent._init_search_function.<locals>.tree_search_closurec                 S   s   dS N)r   r   r*   )r)  r]   r*   r*   r+   r.    s    r   )current_queryr6   rD   heap_prioritiesheap_indices	candidatevertexrF   d_vertexvisitedr_   r   r?   	heap_sizedistance_scaledistance_bound
seed_scalerW   rH   r-   c                    s  t | jd |}d| }t|}t| jd D ]r}rHt|}	n|}	d|	d d < ksh krt| | d  }
|
dkr2| | |
 }qq2n| | }|d | }|d | }dd t	dD }	||}|d |d  }|jd }t
|| }t	|D ]J}|| }t| |}t|||| t|||f t|	| q|dkrt	|D ]p}ttt|jd  }t|	|dkrft| |}t|||| t|||f t|	| qf||d  }t|\}}||k r2t	| |d  D ]p}| }t|	|dkrt|	| t| |}||k rt|||| t|||f ||d  }qt|dkrq2nt|\}}qq2|S )Nr   r   rI   rX   r%   c                 S   s"   g | ]}t t jt d fqS r/   r2   r   r3   r   r5   rD   r*   r*   r+   r7     s     zKNNDescent._init_search_function.<locals>.search_closure.<locals>.<listcomp>)r   r9   r2   r   r:   r;   
zeros_likesqrtrY   r8   rJ   r   r   heapqheappushr   r   rZ   r   r   heappoprK   )Zquery_pointsrT   epsilonr6  r]   r
  r8  internal_rng_stater6   visited_nodesr   r0  r1  r2  seed_setindex_boundscandidate_indicesn_initial_pointsn_random_samplesrD   r3  rF   r9  r5  r4  )
alternative_cosinealternative_dotr?   r@   r_   r   r[   parallel_searchtree_indicesr.  r*   r+   search_closure  s~    



   

   z7NNDescent._init_search_function.<locals>.search_closurepy_funcr   r   rX   )'rz   rw   r   r   r  r   r   r_   r   r:   r   typesArrayr   r   r   uint32boolean_tree_searchr2   r  r   rL  rK  r   r   r   r   r[   r   rL   int16r  r  r   rP  _deheap_functionr  r   r   rO  
query_dataindsdistsr   r*   )rK  rL  r?   r@   r_   r   r[   rM  r+  r,  rN  r-  r.  r+   r    s    	
 Y    
zNNDescent._init_search_functionc                    s  | j rtt d | jr| jd j| jd j| jd j| jd jt	j
t	jjt	jjddddt	jjt	jjddddt	jjt	jjddddt	jjt	jjddddgt	jjt	jjdd	fd
d

| _n&t	
 dd 

| _tjdtjdddlmm  | jj| jj| jj| jj| jj| j| j| j	t	j
dt	jjd d d t	jjt	jjd d d t	jjd d d t	jjt	jjt	jjd d d t	jjd d d t	jjd d d t	jjd d d d df t	jj t	jjt	jjd| jd 	
fdd}|| _!t"t#drBt	j
| jdt#j$| _%nt#| _%| jd d }| !|j|j|jdd| j&| j'\}}}| %||}d S )Nz-Building and compiling sparse search functionr   r%   r   Tr#  Fr%  r(  c                    sZ   d} |df dkrPt | | | ||}|dkrB |df }q |df }q |  S rg   )r#   )
point_inds
point_datar]   r&  r'  r*  r*   r+   sparse_tree_search_closuref  s    zJNNDescent._init_sparse_search_function.<locals>.sparse_tree_search_closurec                 S   s   dS r/  r*   )r\  r]  r]   r*   r*   r+   r^    s    r   )rL  rK  )r0  r6   r1  r2  r3  rF   r6  r_   r   r?   r7  r8  r:  r;  c               	      sp  |j d d }j d d }t||}	d| }
t|}t|D ]&}	rXt|}n|}d|d d < | || ||d   }||| ||d   }ks krt|d  }|dkrB|| }nqB|	d | }|	d | }dd t	dD }t
| 
|||}|d |d  }|j d }t|| }t	|D ]z}|| }| |d   }| |d   }t||||}t|||| t
|||f t|| qB|dkrlt	|D ]}ttt|| }t||dkrЈ| |d   }| |d   }t||||}t|||| t
|||f t|| q|
|d  }t
|\}}||k rBt	| |d  D ]}| }t||dkrt|| | |d   }| |d   }t||||}||k rt|||| t
|||f |
|d  }qt|dkrXqBnt
|\}}qqB|	S )Nr   r%   r   rI   rX   c                 S   s"   g | ]}t t jt d fqS r/   r<  r=  r*   r*   r+   r7     s     zRNNDescent._init_sparse_search_function.<locals>.search_closure.<locals>.<listcomp>)r9   r   r2   r   r:   r;   r>  r?  rY   r8   r@  heapifyrJ   r   r   rA  r   r   rZ   r   r   rB  rK   ) Z
query_indsZquery_indptrrY  rT   rC  r6  r]   Zn_query_pointsZn_index_pointsr
  r8  rD  r6   rE  Zcurrent_query_indsZcurrent_query_datar   r1  r2  rF  rG  rH  rI  rJ  rD   r3  Z	from_inds	from_datarF   r9  r5  r4  )rK  rL  	data_datadata_indptr	data_indsr@   r_   r   r[   rM  r^  rN  r*   r+   rO    s    



  
 
 
   
 
 
	   
 
 
	
   z>NNDescent._init_sparse_search_function.<locals>.search_closurerP  r   r   rX   )(rz   rw   r   r   r  r   r   r_   r   r:   r   rQ  rR  r   r   r   rS  rT  rU  r2   r  pynndescent.distancesrL  rK  r   r   r?   r   r   r[   r   rL   rV  r  r  r   rP  rW  r  r   rX  r*   )rK  rL  ra  rb  rc  r@   r_   r   r[   rM  r^  r+  r,  rN  r-  r+   r  [  s    

$ 

	z&NNDescent._init_sparse_search_functionc                 C   sf   | j rt| dstd d S | jd k	rF| jd  | | jd f}n| jd  | jd  f}|S )Nr   z:Compressed indexes do not have neighbor graph information.r   r%   )r   r  r   r   r   r   r	  r*   r*   r+   neighbor_graphG  s    
zNNDescent.neighbor_graphc                 C   s>   dd l }|   d| _t| dr$| `t| dr2| `|  d S )Nr   Tr   r   )gcpreparer   r  r   r   collect)r   rf  r*   r*   r+   compress_indexV  s    

zNNDescent.compress_indexc                 C   s8   t | ds|   t | ds4| jr,|   n|   d S )Nr   r  )r  r  r   r  r  r  r*   r*   r+   rg  d  s    


zNNDescent.preparero   皙?c              	   C   s   t | ds|   | js^t | ds*|   t|jtjdd}| |||| j	| j
\}}}nlt | dsp|   t|dtjd}t|st|tjd}|js|  | |j|j|j||| j	| j
\}}}| ||\}}| j| }| jdk	r| |}||fS )	a  Query the training graph_data for the k nearest neighbors

        Parameters
        ----------
        query_data: array-like, last dimension self.dim
            An array of points to query

        k: integer (default = 10)
            The number of nearest neighbors to return

        epsilon: float (optional, default=0.1)
            When searching for nearest neighbors of a query point this values
            controls the trade-off between accuracy and search cost. Larger values
            produce more accurate nearest neighbor results at larger computational
            cost for the search. Values should be in the range 0.0 to 0.5, but
            should probably not exceed 0.3 without good reason.

        Returns
        -------
        indices, distances: array (n_query_points, k), array (n_query_points, k)
            The first array, ``indices``, provides the indices of the graph_data
            points in the training set that are the nearest neighbors of
            each query point. Thus ``indices[i, j]`` is the index into the
            training graph_data of the jth nearest neighbor of the ith query points.

            Similarly ``distances`` provides the distances to the neighbors
            of the query points such that ``distances[i, j]`` is the distance
            from the ith query point to its jth nearest neighbor in the
            training graph_data.
        r   r  r   r   r   )r   r   r   N)r  r  r   r  r2   asarrayr~   r   r  r  r   r  r   r
   r   r   r   r_   r   r?   rW  r  r   )r   rY  rT   rC  r_   r[  r   r*   r*   r+   queryn  sD    

    





zNNDescent.queryc                 C   s  t | j}|ttdtj}td}|dk	rt	|tj
ddd}|dkrRtd| jr^|qzttt|}W n  ttfk
r   tdY nX t|}|jd	 }||krtd
| d| dn|dk	rtd d}|dkrg }g }|dkr<| jrtg g g fd	| jjd ftj
d}ntjd	| jjd ftj
d}nt	|tj
ddd}t| drht| j}	ntj| jjd	 tjd}	| jrt| j|g| _|rx|n| j|	ddf | _t||D ]\}
}|
| j|< qtt| j|g| _| j \}}|j\}}t!|}t"|D ]b}||kr8d||< tj#||< qt"|D ]2}|||f |kr@d|||f< tj#|||f< q@q| jr|n0| j$| _%t&| j| j'| j%| j(||| j)| j*| _+t,| j+}t-| jjd	 | j'}t.|| j d	 | j d  t/| j| j0|| | j1dkrt2d| j'}n| j1}t3| j| j'| j4|| j0| j5| j6|| j7dt8dgdgg| j9d| _ t| dst| dst| drt| dr| `:t| dr| `;t| dr| `<| =  dS )a  
        Updates the index with a) fresh data (that is appended to
        the existing data), and b) data that was only updated (but should not be appended
        to the existing data).

        Not applicable to sparse data yet.

        Parameters
        ----------
        xs_fresh: np.ndarray (optional, default=None)
            2D array of the shape (n_fresh, dim) where dim is the dimension
            of the data from which we built self.

        xs_updated: np.ndarray (optional, default=None)
            2D array of the shape (n_updates, dim) where dim is the dimension
            of the data from which we built self.

        updated_indices: array-like of size n_updates (optional, default=None)
            Something that is convertable to list of ints.
            If self is currently built from xs, then xs[update_indices[i]]
            will be replaced by xs_updated[i].

        Returns
        -------
            None
        r   zSparse update not complete yetNr   r   r   zBIf xs_updated are provided, updated_indices must also be provided!z4Could not convert updated indices to list of int(s).r   zNumber of updated indices (z+) must match number of rows of xs_updated (z).zMxs_updated not provided, while update_indices provided. They will be ignored.r%   )r9   r   r   r  r0   r   F)r   r   r   rN   rz   r   r  r  )>r   r   r   r   r   r~   r2   r   NotImplementedErrorr   r   r   r   listmaprb   	TypeErrorrK   r9   r   r   r   r  r  r   r  r   bool_sparse_vstackzipr!  r   r   r}   r8   r3   r   r   r   r[   r   r   r   r   r   r   ra   rU   r   re   rJ   r   r]   rx   ry   r   r   rz   r   r  r  rg  )r   Zxs_freshZ
xs_updatedZupdated_indicesr   r]   Zerror_sparse_to_don1n2Zoriginal_orderZ	x_updatedZi_freshnsdsZ
n_examplesr[   Zindices_setr6   rD   rN   rM   r   r*   r*   r+   update  s    
   


     





  zNNDescent.update)r   Nr   NNr   r   r%   TNNNTNNrp   NFFF)ro   rj  )NNN)__name__
__module____qualname____doc__r   r  r  r  r  r  propertyre  ri  rg  rl  rx  r*   r*   r*   r+   r     sN                        
  
 ] 5 m


Ir   c                   @   s8   e Zd ZdZdddZdddZdddZdddZdS )PyNNDescentTransformeraN  PyNNDescentTransformer for fast approximate nearest neighbor transformer.
    It uses the NNDescent algorithm, and is thus
    very flexible and supports a wide variety of distances, including
    non-metric distances. NNDescent also scales well against high dimensional
    graph_data in many cases.

    Transform X into a (weighted) graph of k nearest neighbors

    The transformed graph_data is a sparse graph as returned by kneighbors_graph.

    Parameters
    ----------
    n_neighbors: int (optional, default=5)
        The number of neighbors to use in k-neighbor graph graph_data structure
        used for fast approximate nearest neighbor search. Larger values
        will result in more accurate search results at the cost of
        computation time.

    metric: string or callable (optional, default='euclidean')
        The metric to use for computing nearest neighbors. If a callable is
        used it must be a numba njit compiled function. Supported metrics
        include:
            * euclidean
            * manhattan
            * chebyshev
            * minkowski
            * canberra
            * braycurtis
            * mahalanobis
            * wminkowski
            * seuclidean
            * cosine
            * correlation
            * haversine
            * hamming
            * jaccard
            * dice
            * russelrao
            * kulsinski
            * rogerstanimoto
            * sokalmichener
            * sokalsneath
            * yule
            * hellinger
            * wasserstein-1d
        Metrics that take arguments (such as minkowski, mahalanobis etc.)
        can have arguments passed via the metric_kwds dictionary. At this
        time care must be taken and dictionary elements must be ordered
        appropriately; this will hopefully be fixed in the future.

    metric_kwds: dict (optional, default {})
        Arguments to pass on to the metric, such as the ``p`` value for
        Minkowski distance.

    n_trees: int (optional, default=None)
        This implementation uses random projection forests for initialization
        of searches. This parameter controls the number of trees in that
        forest. A larger number will result in more accurate neighbor
        computation at the cost of performance. The default of None means
        a value will be chosen based on the size of the graph_data.

    leaf_size: int (optional, default=None)
        The maximum number of points in a leaf for the random projection trees.
        The default of None means a value will be chosen based on n_neighbors.

    pruning_degree_multiplier: float (optional, default=1.5)
        How aggressively to prune the graph. Since the search graph is undirected
        (and thus includes nearest neighbors and reverse nearest neighbors) vertices
        can have very high degree -- the graph will be pruned such that no
        vertex has degree greater than
        ``pruning_degree_multiplier * n_neighbors``.

    diversify_prob: float (optional, default=1.0)
        The search graph get "diversified" by removing potentially unnecessary
        edges. This controls the volume of edges removed. A value of 0.0 ensures
        that no edges get removed, and larger values result in significantly more
        aggressive edge removal. A value of 1.0 will prune all edges that it can.

    n_search_trees: int (optional, default=1)
        The number of random projection trees to use in initializing searching or
        querying.

        .. deprecated:: 0.5.5

    search_epsilon: float (optional, default=0.1)
        When searching for nearest neighbors of a query point this values
        controls the trade-off between accuracy and search cost. Larger values
        produce more accurate nearest neighbor results at larger computational
        cost for the search. Values should be in the range 0.0 to 0.5, but
        should probably not exceed 0.3 without good reason.

    tree_init: bool (optional, default=True)
        Whether to use random projection trees for initialization.

    random_state: int, RandomState instance or None, optional (default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_jobs: int or None (optional, default=None)
        The maximum number of parallel threads to be run at a time. If none
        this will default to using all the cores available. Note that there is
        not perfect parallelism, so at several pints the algorithm will be
        single threaded.

    low_memory: boolean (optional, default=False)
        Whether to use a lower memory, but more computationally expensive
        approach to index construction. This defaults to false as for most
        cases it speeds index construction, but if you are having issues
        with excessive memory use for your dataset consider setting this
        to True.

    max_candidates: int (optional, default=20)
        Internally each "self-join" keeps a maximum number of candidates (
        nearest neighbors and reverse nearest neighbors) to be considered.
        This value controls this aspect of the algorithm. Larger values will
        provide more accurate search results later, but potentially at
        non-negligible computation cost in building the index. Don't tweak
        this value unless you know what you're doing.

    n_iters: int (optional, default=None)
        The maximum number of NN-descent iterations to perform. The
        NN-descent algorithm can abort early if limited progress is being
        made, so this only controls the worst case. Don't tweak
        this value unless you know what you're doing. The default of None means
        a value will be chosen based on the size of the graph_data.

    early_termination_value: float (optional, default=0.001)
        Controls the early abort due to limited progress. Larger values
        will result in earlier aborts, providing less accurate indexes,
        and less accurate searching. Don't tweak this value unless you know
        what you're doing.

    parallel_batch_queries: bool (optional, default=False)
        Whether to use parallelism of batched queries. This can be useful for large
        batches of queries on multicore machines, but results in performance degradation
        for single queries, so is poor for streaming use.

    verbose: bool (optional, default=False)
        Whether to print status graph_data during the computation.

    Examples
    --------
    >>> from sklearn.manifold import Isomap
    >>> from pynndescent import PyNNDescentTransformer
    >>> from sklearn.pipeline import make_pipeline
    >>> estimator = make_pipeline(
    ...     PyNNDescentTransformer(n_neighbors=5),
    ...     Isomap(neighbors_algorithm='precomputed'))
    r   r   Nrj  r   r   r%   Trp   Fc                 C   sp   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _d S r   )r[   r   r   r   r   search_epsilonr   r   r   r   r   r   re   rx   early_termination_valuer   r   rz   )r   r[   r   r   r   r   r  r   r   r   r   r   r   r   re   rx   r  r   rz   r*   r*   r+   r     s$    zPyNNDescentTransformer.__init__c                 C   s   |j d | _| jdkri }n| j}| jr4tt d | jd }t|| j||| j	| j
| j| j| j| j| j| j| j| j| j| j|| j| jd| _| S )aX  Fit the PyNNDescent transformer to build KNN graphs with
        neighbors given by the dataset X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Sample graph_data

        Returns
        -------
        transformer : PyNNDescentTransformer
            The trained transformer
        r   NzCreating indexr%   )r   r   r[   r   r   r   r   r   r   r   r   re   rx   ry   r   r   r   rz   )r9   n_samples_fitr   rz   rw   r   r[   r   r   r   r   r   r   r   r   r   r   re   rx   r  r   r   index_)r   Xri  r   Zeffective_n_neighborsr*   r*   r+   fit#  s:    

zPyNNDescentTransformer.fitc                 C   s   |dkr| j }n
|jd }|dkr0| jj\}}n| jj|| j| jd\}}| jr\tt	 d t
|| j ftjd}ttj|jd tjd|jd |_| |_| |_| S )a  Computes the (weighted) graph of Neighbors for points in X

        Parameters
        ----------
        X : array-like, shape (n_samples_transform, n_features)
            Sample graph_data

        Returns
        -------
        Xt : CSR sparse matrix, shape (n_samples_transform, n_samples_fit)
            Xt[i, j] is assigned the weight of edge that connects i to j.
            Only the neighbors have an explicit value.
        Nr   )rT   rC  zConstructing neighbor matrixr   r%   )r  r9   r  re  rl  r[   r  rz   rw   r   r	   r2   r   r  r  r   r  r  r  r?   r  )r   r  r   n_samples_transformr_   r`   r
  r*   r*   r+   	transformW  s(    
  
 

z PyNNDescentTransformer.transformc                 K   s:   | j |dd | jdd}| jr,tt d | j  |S )aB  Fit to graph_data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            Training set.

        y : ignored

        Returns
        -------
        Xt : CSR sparse matrix, shape (n_samples, n_samples)
            Xt[i, j] is assigned the weight of edge that connects i to j.
            Only the neighbors have an explicit value.
            The diagonal is always explicit.
        F)ri  N)r  zCompressing index)r  r  rz   rw   r   r  ri  )r   r  r   
fit_paramsr
  r*   r*   r+   fit_transform}  s    
z$PyNNDescentTransformer.fit_transform)r   r   NNNrj  r   r   r%   TNNTNNrp   FF)T)N)N)ry  rz  r{  r|  r   r  r  r  r*   r*   r*   r+   r~  a  s0                      
)
4
&r~  )r   )r   )r   )r   )Zwarningsr   r:   numpyr2   sklearn.utilsr   r   sklearn.preprocessingr   sklearn.baser   r   scipy.sparser   r	   r
   r   rr  r   r@  pynndescent.sparser   Zpynndescent.sparse_nndescentZsparse_nndescentr   rd  r`   r   Zpynndescent.utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zpynndescent.rp_treesr   r   r   r   r    r!   r"   r#   r$   rQ  Listr   float64Zupdate_typeiinfor   rJ   r   r   r   finfor   epsr   r   r,   r   rG   rU   r^   ra   rf   rm   r   r{   r   r   r   r   r   r   r   r   r~  r*   r*   r*   r+   <module>   s   D,


(




#

)2:
$
 )

          l