Source code for SuperSCC.clustering.clustering

"""
Clustering module for functions related to data clustering.
"""

import os
import pickle
import warnings
import copy
from datetime import datetime
from collections import Counter
from scipy.spatial.distance import pdist, squareform
from functools import reduce
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import scanpy as sc
import pandas as pd
import numpy as np
import plotly.graph_objects as go


from ..utils import record_time, key_search, retrieve_positive_markers
from ..feature_selection import pairwise_compare, find_signature_genes, find_markers_ovr


def core_consensus_cluster(data, 
                           n_components = 30,
                           resolution = 1,
                           return_hvgs = False):
    """
    A function to do KNN clustering via using Scanpy API.

    Parameters
    ----------
    data: 
        A pandas data frame. Rows are cells; columns are features.
    n_components: 
        A int to decide how many principle components will be used for KNN clustering.
    resolution: 
        A int to control the coarseness of the clustering. Higher values lead to more clusters.
    return_hvgs: 
        A Bool value to decide whether highly variable genes detected by Scanpy will be returned. Default is False. 
    """
    adata = sc.AnnData(data.copy())
    sc.pp.highly_variable_genes(adata, n_top_genes = 3000)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    sc.pp.scale(adata)
    sc.tl.pca(adata)
    sc.pp.neighbors(adata, n_pcs = n_components)
    sc.tl.leiden(adata, n_iterations = 2, resolution = resolution)


    if return_hvgs:
        return list(adata.obs.leiden), list(adata.var.loc[list(adata.var.highly_variable), :].index)
    else:    
        return list(adata.obs.leiden)



[docs]
def global_consensus_cluster(       
                      data, 
                      n_components = 30,
                      resolution = 1,
                      only_positive = True,
                      ratio_of_none_zero_counts = 0.1,
                      n_features_to_select = 0.15,
                      cut_off = 0.1,
                      focus = "expression1",
                      class_weight = "balanced",
                      ep_cut_off = 1,
                      pct_cut_off = 0.5, 
                      robust = True,
                      save = True,
                      file_name = None,
                      n_jobs = -1,
                      logger = None,
                      **kwargs
                     ):
    
    """
    A function to merge global clusters and find markers for each global cluster.

    Parameters
    ----------
    data: 
        A log normalized counts matrix. Rows are cells; Columns are features. 
    n_components: 
        A int to decide how many principle components will be used for KNN clustering. Default is 30.
    resolution: 
        A int to control the coarseness of the clustering. Higher values lead to more clusters. Default is 1.
    only_positive: 
        A Bool value to control whether only positive features are used. Default is True.
    ratio_of_none_zero_counts: 
        A float to determine the cutoff in which a feature will be omited when below specified value. A higher value leads to less features will be kept for calculation. Default is 0.1. 
    n_features_to_select: 
        A int or float to control the number of features to select. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of features to select. Default is 0.15.
    cut_off: 
        A int or float to control the cutoff of combined score (jaccard index * distance correlation) that will used to decide whether a pair of cluster should be merged or not. Default is 0.1.
    focus: 
        A string to decide the value used for distance correlation calculation. "expression1" - the mean expression of feauture will be used. "rank1" - the rank of feature based on expression will be used. Default is 'expression1'.
    class_weight: 
        A string to decide whether class weights will be considered. If None, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Default is 'balanced'.
    ep_cut_off: 
        A int or float value to decide the expression cutoff to select positive informative features. Default is 1.
    pct_cut_off: 
        A int or float value to decide the expression percentage cutoff to select positive informative features. Default is 0.5. 
    robust: 
        A Bool value to decider whether re calculate the markers of clusters after merging. Default is True. 
    save: 
        A Bool value to decide whether write the output into the disk. Default is true.
    filename: 
        A string to control the name of output. Default is None. 
    n_jobs: 
        A int to decide the number of thread used for the program. Default is -1, meaning using all available threads.
    logger: 
        A log_file object. Default is None.
    **kwargs: 
        Other paremeters passed to feature_selection function. 
    """
    
    global_cluster = core_consensus_cluster(data = data.copy(), n_components = n_components, resolution = resolution)
    logger.write("info", "{} clusters are found under resolution {}".format(len(set(global_cluster)), resolution))
    
    # add the 'cluster' column in the data
    global_cluster = [str(i) for i in global_cluster]
    data.loc[:, "cluster"] = global_cluster
    data.loc[:, "cluster"] = data.loc[:, "cluster"].astype(str)
                              
    # find sinataure genes for whole global cluster
    logger.write("info", "finding overall signature genes")

    global_features = find_signature_genes(data = data.copy(), label_column = "cluster", n_features_to_select = n_features_to_select, ratio_of_none_zero_counts = ratio_of_none_zero_counts,logger = logger, save = False, filename = "global", class_weight = class_weight, n_jobs = n_jobs,**kwargs)
    global_features_intersection = global_features["features"]["final_feature_selection_by_intersection"]
    global_features_ensemble = [i[0] for i in global_features["features"]["final_feature_selection_by_ensemble"]]
    # global_features_combined = set(global_features_intersection).intersection(global_features_ensemble)
    if len(global_features_ensemble) > len(global_features_intersection):
        global_features_combined = global_features_ensemble
    else:
        global_features_combined = global_features_intersection
    target = ["cluster"]
    target.extend(global_features_combined)
    
    # remove "sub_high_expression_genes" key
    # global_features.pop("sub_high_expression_genes")

    logger.write("info", "{} overall signature genes are found".format(len(global_features_combined)))

    copy_data = data.copy()
    copy_data = copy_data.loc[:, copy_data.columns.isin(target)]
   

    global_sign_features_pool = find_markers_ovr(copy_data, label_column = "cluster", n_features_to_select = n_features_to_select, ratio_of_none_zero_counts = ratio_of_none_zero_counts, logger = logger, save = False, class_weight = class_weight, n_jobs = n_jobs, **kwargs) 
    global_sign_features_pool_copy = copy.deepcopy(global_sign_features_pool)
    
    # merge similar cluster
    pre_merge_dict = {"global_sign_features": global_sign_features_pool_copy}
    merge_cluster = between_group_merge(pre_merge_dict, data2 = data.copy(), cut_off = cut_off, focus = focus, ep_cut_off = ep_cut_off, pct_cut_off = pct_cut_off, only_positive = only_positive, logger = logger)
    merge_keys = list(merge_cluster["global_sign_features"].keys())
    
    data.loc[:, "cluster"] = data.loc[:, "cluster"].astype(str)
    for i in merge_keys:
        if re.search("&", i):
            merge_candidate = re.split("&", i)
            for j in merge_candidate:
                data.loc[data.cluster == j, "cluster"] = str(i)

    # relabel the cluster label afetr merging
    encoder = LabelEncoder().fit(data.cluster)
    relabel_classes = list(encoder.classes_)
    relabel_cluster_labels = encoder.transform(data.cluster)

    if robust:
        data.loc[:, "cluster"] = [str(i) for i in relabel_cluster_labels]
        data.loc[:, "cluster"] = data.loc[:, "cluster"].astype(str)
        global_sign_features_after_merging = find_markers_ovr(data.copy(), label_column = "cluster", n_features_to_select = n_features_to_select, ratio_of_none_zero_counts = ratio_of_none_zero_counts, logger = logger, save = False, class_weight = class_weight, n_jobs = n_jobs, **kwargs) 

        res = {
                "global_features_before_merging": global_features,
                "global_sign_features_before_merging": global_sign_features_pool,
                "global_cluster_before_merging": global_cluster,
                "global_sign_features_after_merging": global_sign_features_after_merging,
                "global_cluster_after_merging": {"label_classes": relabel_classes, "labels": relabel_cluster_labels, "merge_label": [relabel_classes[i] for i in relabel_cluster_labels]}
              } 
    
    else:
        res = {
                "global_features_before_merging": global_features,
                "global_sign_features_before_merging": global_sign_features_pool,
                "global_cluster_before_merging": global_cluster,
                "global_sign_features_after_merging": merge_cluster["global_sign_features"],
                "global_cluster_after_merging": {"label_classes": relabel_classes, "labels": relabel_cluster_labels, "merge_label": [relabel_classes[i] for i in relabel_cluster_labels]}
              } 
    
    if save == True:
        if file_name != None:
            file_name = f"consensus_cluster_result_{file_name}_{record_time()}.pkl"
        else:
            file_name = f"consensus_cluster_result_{record_time()}.pkl"
        
        with open(file_name, "wb") as file:
            pickle.dump(res, file)
    
    logger.write("info", "finish global consensus clustering")
    return res



def within_group_merge(data, ep_cut_off = 1, pct_cut_off = 0.5, num_pos_genes = 10):
    """
    A function to find the mergeing candidates in sub clustering level. 

    Parameters
    ----------
    data: 
        A dict returned by find_signature_genes function. 
    ep_cut_off: 
        A int or float value to decide the expression cutoff to select positive informative features. Default is 1.
    pct_cut_off: 
        A int or float value to decide the expression percentage cutoff to select positive informative features. Default is 0.5. 
    num_pos_genes: 
        A int to decide the cutoff for positive markers. If the number of positive markers in a cluster is above this number, it will not be merged. Otherwise, it would be merged. Default is 10. 
    """

    keep = dict()
    merge = dict()
    
    key1 = data["sub_sign_genes"].keys()
    
    for i in key1:

        keep_ls = list()
        merge_ls = list()

        key2 = data["sub_sign_genes"][i].keys()
        
        for j in key2:
            candidates = retrieve_positive_markers({"sub_sign_genes_before_merging": data["sub_sign_genes"]}, i, j, pct_cut_off = pct_cut_off, ep_cut_off = ep_cut_off, level = "before")

            score_pool = [z[1] for z in data["sub_sign_genes"][i][j]["features"]["final_feature_selection_by_ensemble"]]
            names = [z[0] for z in data["sub_sign_genes"][i][j]["features"]["final_feature_selection_by_ensemble"]]
            score_df = pd.DataFrame({"name": names, "score": score_pool})
            score_p80 = np.percentile(score_df.score, 80)

            candidates = candidates.loc[candidates.score > score_p80]
 
            if candidates.shape[0] >= num_pos_genes:
                keep_ls.append(j)
            else:
                merge_ls.append(j)
        
        keep[i] = keep_ls
        merge[i] = merge_ls

    return keep, merge



[docs]
def sub_consensus_cluster(data, 
                          data2, 
                          n_components = 30, 
                          resolution = 0.2, 
                          class_weight = "balanced", 
                          n_features_to_select = 0.15,
                          ratio_of_none_zero_counts = 0.1,
                          ep_cut_off = 1,
                          pct_cut_off = 0.5,
                          num_pos_genes = 10,
                          n_neighbors = 100,
                          robust = True,
                          n_jobs = -1,
                          file_name = None,
                          save = True, 
                          logger = None, 
                          **kwargs):

    """
    A function to merge sub clusters in different global clusters and find markers for corresponding sub cluster. 
    
    Parameters
    ----------
    data: 
        A log-normalized expression matrix. Rows are cells; Columns are features.
    data2: 
        A dict returned by running global_consensus_cluster function.
    n_components: 
        A int to decide how many principle components will be used for KNN clustering. Default is 30.
    resolution: 
        A int to control the coarseness of the clustering. Higher values lead to more clusters. Default is 0.2.
    class_weight: 
        A string to decide whether class weights will be considered. If None, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Default is 'balanced'.
    n_features_to_select:
        A int or float to control the number of features to select. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of features to select. Default is 0.15. 
    ep_cut_off: 
        A float value to control the selection criteria of positive markers. Default is 1.
    pct_cut_off: 
        A float value to control the selection criteria of positibe markers. Default is 0.5.
    num_pos_genes: 
        A int to decide the cutoff for positive markers. If the number of positive markers in a cluster is above this number, it will not be merged. Default is 10. 
    n_neighbors:
        A int to control the number of neighborhoods being considered for merging. Default is 100. 
    robust: 
        A Bool value to decide whether re find the markers of each cluster after cluster merging. Default is true.
    filename: 
        A string to control the name of output. Default is None. 
    save: 
        A Bool value to decide whether write the output into the disk. Default is true.
    n_jobs: 
        A int to decide the number of thread used for the program. Default is -1, meaning using all available threads.
    logger: 
        A log_file object. Default is None.
    **kwargs: 
        Other paremeters passed to feature_selection function. 
    """
    
    # add a cluster column in the data
    data.loc[:, "cluster"] = data2["global_cluster_after_merging"]["labels"]
    
    # group data by global cluster
    group_data = data.groupby("cluster")

    # make containers
    sub_sign_features = dict()
    sub_clusters = dict()
    compare = dict()
    discard = list()
    
    # redo the clustering in each sub population
    for key1 in list(set(data.cluster)):
       
        # get the subset data 
        sub_data = group_data.get_group(key1)
        
        # remove cluster column before clustering
        sub_data = sub_data.loc[:, sub_data.columns != "cluster"]
        
        logger.write("info", "doing sub clustering in global cluster {}".format(key1))  

        # when speciifed n_components > min(n_samples, n_features)
        n_components_update = None
        if n_components > np.min(sub_data.shape):
            n_components_update = np.min(sub_data.shape) - 1
    
        if n_components_update == None:
            sub_cluster = core_consensus_cluster(data = sub_data.copy(), n_components = n_components, resolution = resolution, return_hvgs = True)
        else:
            sub_cluster = core_consensus_cluster(data = sub_data.copy(), n_components = n_components_update, resolution = resolution, return_hvgs = True)
            
        sub_clusters["sub_clusters_in_global_cluster_{}".format(key1)] = sub_cluster[0]
        
        # add the 'cluster' column in the sub data
        sub_data.loc[:, "cluster"] = sub_cluster[0]

        # do PCA 
        filter_data = sub_data.loc[:, sub_data.columns.isin(sub_cluster[1])]

        if n_components_update == None:
            pca = PCA(n_components = n_components).fit_transform(filter_data)
        else:
            pca = PCA(n_components = n_components_update).fit_transform(filter_data)
        
        pca = pd.DataFrame(pca, columns = ["PC_{}".format(i) for i in range(pca.shape[1])])
        pca.loc[:, "cluster"] = sub_cluster[0]

        pca_group = pca.groupby("cluster")

        pca_centroids = list()
        for i in pca_group.groups.keys():
            individual = pca_group.get_group(i)
            individual = pd.DataFrame(individual.select_dtypes("number").mean(), columns=[i]).T
            pca_centroids.append(individual)

        # use PCA centroids to calculate the distance between clusters
        pca_centroids = pd.concat(pca_centroids)
        distance = pdist(pca_centroids, metric = 'euclidean')
        square_distances = pd.DataFrame(squareform(distance), columns = pca_centroids.index)
        square_distances.index = pca_centroids.index

        
        for i in range(square_distances.shape[0]):
            index = square_distances.iloc[i, :].sort_values()[1:(n_neighbors+1)].index
            key = square_distances.index[i]
            if i == 0:
                compare[str(key1)] = {str(key): list(index)}
            else:
                compare[str(key1)].update({str(key): list(index)})

        # find sinataure genes for each sub cluster within corresponding global cluster
        sub_sign_features_pool = dict()
        
        for key2 in list(set([str(i) for i in sub_cluster[0]])):
            logger.write("info", "finding signature genes in sub cluster {} of global cluster {}".format(key2, key1))
            intermediate = sub_data.copy()
            # intermediate = intermediate.loc[:, intermediate.columns.isin(target)]
            intermediate.loc[:, "cluster"] = intermediate.loc[:, "cluster"].astype(str)
            selection = compare[str(key1)][key2]
            selection.append(key2)
            intermediate = intermediate.loc[intermediate.cluster.isin(selection), :] # only do One vs Rest for the nearby clusters, aka One vs Neighbors
            intermediate.loc[intermediate.cluster != str(key2), "cluster"] = "OnevsRest" 
            if len(set(intermediate.cluster)) >= 2:
                intermediate_features = find_signature_genes(data = intermediate, label_column = "cluster", HVG = False, logger = logger, save = False, filename = "sub_cluster_{}".format(key), n_features_to_select = n_features_to_select, ratio_of_none_zero_counts = ratio_of_none_zero_counts, class_weight = class_weight, n_jobs = n_jobs, **kwargs)
                sub_sign_features_pool[str(key2)] = intermediate_features
            else:
                sub_sign_features_pool[str(key2)] = f"no more clusters are found in sub cluster {key2} global cluster {key1} before cluster merging"
                discard.append(f"{key1}_{key2}")
        
        sub_sign_features[str(key1)] = sub_sign_features_pool
    
    robust_d = dict()  
    if robust:
        # generate new label 
        logger.write("info", "re find signature genes of clusters after cluster merging")
        copy_data = data.copy()
        copy_data.loc[:, "row"] = range(copy_data.shape[0])
        
        # remove sub cluster with only one member 
        sub_sign_features_copy = copy.deepcopy(sub_sign_features)
        for i in discard:
            key1 = re.split("_", i)[0]
            key2 = re.split("_", i)[1]
            sub_sign_features_copy[key1].pop(key2)

        merge_sub = within_group_merge({"sub_sign_genes": sub_sign_features_copy}, num_pos_genes = num_pos_genes, ep_cut_off = ep_cut_off, pct_cut_off = pct_cut_off)
   

        ls = list()
        for i in sub_clusters.keys():
            key = re.findall("\d+", i)[0]
            sub_data = copy_data.loc[copy_data.cluster == int(key), :]
            sub_data.loc[:, "sub_cluster"] = [f"{key}_{z}" for z in sub_clusters[i]]
            condition1 = list()
            condition2 = list()
            if len(merge_sub[1][key]) >= 2:
                for j in merge_sub[1][key]:
                    condition1.append(f"{key}_{j}")
                    condition2.append(j)
            sub_data.loc[sub_data.sub_cluster.isin(condition1), "sub_cluster"] = f"{key}_merge_{'-'.join(condition2)}"
            ls.append(sub_data)
        
        # label encoding
        new_label = pd.concat(ls).sort_values("row")
        le = LabelEncoder().fit(new_label.loc[:, "sub_cluster"])
        new_label = le.transform(new_label.loc[:, "sub_cluster"])
        
        copy_data.loc[:, "final_cluster"] = new_label

    
        # find signature genes of clusters after cluster merging
        for i in copy_data.cluster.unique():
            wk_data = copy_data.loc[copy_data.cluster == i]
            wk_data = wk_data.loc[:, wk_data.columns != "cluster"]
            wk_data = wk_data.loc[:, wk_data.columns != "row"]
            if (wk_data.shape[1] - 1) != (data.shape[1] - 1):
                raise Exception("Extra none-useful columns in the dataset. Please double check the input data only contains feature columns representating expression in each cell")
            
            wk_data.loc[:, "final_cluster"] = le.inverse_transform(wk_data.loc[:, "final_cluster"])
            if len(set(wk_data.loc[:, "final_cluster"])) >= 2:
                robust_d[str(i)] = find_markers_ovr(wk_data, label_column = "final_cluster", n_features_to_select = n_features_to_select, ratio_of_none_zero_counts = ratio_of_none_zero_counts, logger = logger, save = False, class_weight = class_weight, n_jobs = n_jobs, **kwargs) 
            else:
                robust_d[str(i)] = f"no more sub clusters in global cluster {i} after cluster merging"

    if robust:
        res = {"sub_cluster": sub_clusters, "sub_sign_genes_after_merging": robust_d, "sub_sign_genes_before_merging": sub_sign_features, "neighbors": compare, "merge_labels": new_label, "merge_candidates": merge_sub, "label_encoder": le}
    else:    
        res = {"sub_cluster": sub_clusters, "sub_sign_genes": sub_sign_features, "neighbors": compare}

    if save == True:
        if file_name != None:
            file_name = f"sub_consensus_cluster_result_{file_name}_{record_time()}.pkl"
        else:
            file_name = f"sub_consensus_cluster_result_{record_time()}.pkl"
        
        with open(file_name, "wb") as file:
            pickle.dump(res, file)
    
    logger.write("info", "finish sub consensus cluster")
    
    return res



def between_group_merge(data, data2, cut_off = 0.1, increase = 0, focus = "expression1", n_merge = 100, ep_cut_off = 1, pct_cut_off = 0.5, only_positive = True, logger = None):

   """
   A function to merge clusters based on features' similarity evulated by jaccard index and distance correlation.
   
   Parameters
   ----------
   data: 
        A dict return by running find_signature_genes function.
   data2: 
        A count expression matrix. Rows are cells; Columns are features. 
   cut_off: 
        A int or float to control the cutoff of combined score (jaccard index * distance correlation) that will used to decide whether a pair of cluster should be merged or not. Default is 0.1.
   penalty_increase: 
        A float to control the augment of cut_off in the next round when the function is recursively called. 
   focus: 
        A string to decide the value used for distance correlation calculation. "expression1" - the mean expression of feauture in the interest group will be used. "rank1" - the rank of feature based on expression in the interest group will be used. Default is 'expression1'.
   n_merge:
        A int value to control the number of candidates which should be merged in each round. Default is 100.
   ep_cut_off: 
        A int or float value to decide the expression cutoff to select positive informative features. Default is 1.
   pct_cut_off: 
        A int or float value to decide the expression percentage cutoff to select positive informative features. Default is 0.5. 
   only_positive: 
        A Bool value to control whether only positive features are used. Default is True. 
   logger: 
        A log_file object. Default is None. a
   """
      
   compare_data = pairwise_compare(data, focus = focus, only_positive = only_positive, ep_cut_off = ep_cut_off, pct_cut_off = pct_cut_off)
   try:
      distance_cor = [*zip(compare_data["distance_cor"].keys(), compare_data["distance_cor"].values())]
      jaccard_score = [*zip(compare_data["jaccard_index"].keys(), compare_data["jaccard_index"].values())]
   except:
      return compare_data
      
   # times the distance correlation and jaccard score and only keep the score above the cutoff 
   combined_score = list()
   for i in range(len(distance_cor)):
      score = distance_cor[i][1] * jaccard_score[i][1]
      if score > cut_off:
         print(score)
         if distance_cor[i][0] != jaccard_score[i][0]:
            raise ValueError("The key between distance_cor and jaccard_index is not consistent")
         else:
            combined_score.append((distance_cor[i][0], score))
      else:
         continue

   if len(combined_score) != 0:
      # detect the label tie
      # e.g 0vs1, 0vs10, 1vs10
      combined_score_label = [ i[0] for i in combined_score ]
      
      former_num = set(sorted([*map(detect_former_num, combined_score_label)]))
      label_tie = dict()
      for i in former_num:
         ls = []
         for j in combined_score_label:
            if re.search("global cluster {} vs global cluster \d+".format(i), j) and len(re.findall("\d+", j)[0]) == len(i):
                  ls.append(j)
            elif re.search("global cluster \d+ vs global cluster {}".format(i), j) and len(re.findall("\d+", j)[1]) == len(i):
               ls.append(j)
            else:
               target = find_keys(j)
               target1 = target[0]
               target2 = target[1]
               if target1 == i or target2 == i:
                  ls.append(j)
            
         label_tie[i] = ls
      
      # combine the two groups with label tie together  
      new_d = key_search(label_tie)
      
      # remove individual keys that have been combined in the new_d
      # from label_tie
      discard_keys = list()
      condition = re.compile("((\d+&?){1,100}|(\d+))")
      for i in new_d.keys():
         target = [*condition.finditer(i)]
         for j in range(len(target)):
            target1 = i[target[j].start():target[j].end()]
            discard_keys.append(target1)
      for i in discard_keys:
         label_tie.pop(i)
      
      # combine label_tie and new_d
      label_tie.update(new_d)
      
      # merge the group with the highest score in each pair group 
      merge_candidate = list()
      for key in label_tie.keys():
         label = label_tie[key]
         merge_candidate.append(sorted([*filter(lambda x: x[0] in label, combined_score)], key = lambda x: -x[1]))
      
      pool = list()
      final_candidate = list()
      num = 0
      for i in merge_candidate:
         for j in i:
            labels = find_keys(j[0])
            if labels[0] not in pool and labels[1] not in pool:
               pool.append(labels[0])
               pool.append(labels[1])
               if num < n_merge:
                  final_candidate.append(j[0])
                  num += 1
               else:
                  break
         num = 0
         pool = list()

   
      high_expression = data["global_sign_features"]
      sign_genes = data["global_sign_features"]

      # for i in merge_candidate:
      for i in final_candidate:
         
         cluster_label = find_keys(i)
         compare1 = cluster_label[0]
         compare2 = cluster_label[1]

         print("compare1: {}; compare2: {}".format(compare1, compare2))
         
         if isinstance(sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"][0], tuple) and isinstance(sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"][0], tuple):
            geneset1 = [ j[0] for j in sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"] ]
            geneset2 = [ j[0] for j in sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"] ]
            union_geneset = list(set(geneset1).union(geneset2))

         elif isinstance(sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"][0], tuple) and isinstance(sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"][0], tuple) == False:
            geneset1 = [ j[0] for j in sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"] ]
            geneset2 = sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"] 
            union_geneset = list(set(geneset1).union(geneset2))

         elif isinstance(sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"][0], tuple) == False and isinstance(sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"][0], tuple):
            geneset1 = sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"]
            geneset2 = [ j[0] for j in sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"] ]
            union_geneset = list(set(geneset1).union(geneset2))

         else:
            geneset1 = sign_genes[compare1]["features"]["final_feature_selection_by_ensemble"]
            geneset2 = sign_genes[compare2]["features"]["final_feature_selection_by_ensemble"] 
            union_geneset = list(set(geneset1).union(geneset2))
            
         high_expression_dataset1 = high_expression[compare1]["sub_high_expression_genes"][compare1]
         high_expression_dataset2 = high_expression[compare2]["sub_high_expression_genes"][compare2]
         union_high_exp_dataset = pd.concat([high_expression_dataset1, high_expression_dataset2])
         union_high_exp_dataset = union_high_exp_dataset.sort_values("feature")
         union_high_exp_dataset.index = range(union_high_exp_dataset.shape[0])
         
         duplicate_feature_rows = np.ravel(np.array(np.where(union_high_exp_dataset.duplicated("feature", keep=False) == True)))
         
         # when shared features found, calucate the mean expression and rank between two datasets
         if len(duplicate_feature_rows) != 0:
            uniq_high_exp_dataset = union_high_exp_dataset.loc[union_high_exp_dataset.index.isin(duplicate_feature_rows) == False, :]
            dup_high_exp_dataset = union_high_exp_dataset.loc[union_high_exp_dataset.index.isin(duplicate_feature_rows) == True, :].sort_values("feature")
            dup_features = list(dup_high_exp_dataset.drop_duplicates("feature").feature)
            dup_high_exp_dataset = dup_high_exp_dataset.select_dtypes("number").rolling(window = 2).mean()[1::2]
            dup_high_exp_dataset.loc[:, "feature"] = dup_features # add feature column back
            try:
               dup_high_exp_dataset = dup_high_exp_dataset[["feature", "expression1", "pct1", "rank1", "expression2", "pct2", "rank2"]] # reorder the column
            except:
               dup_high_exp_dataset = dup_high_exp_dataset[["feature", "expression1", "pct1", "rank1"]]
               
            final_high_exp_dataset = pd.concat([uniq_high_exp_dataset, dup_high_exp_dataset])
         else:
            final_high_exp_dataset = union_high_exp_dataset
         
         # add combined keys
         if logger != None: 
            logger.write("info", "{} merge {} and {}".format(record_time(), compare1, compare2))
         print("{} merge {} and {}".format(record_time(), compare1, compare2))
         
         data["global_sign_features"].update({"{}&{}".format(compare1, compare2): {"features": {"final_feature_selection_by_ensemble": union_geneset}}})
         data["global_sign_features"]["{}&{}".format(compare1, compare2)].update({"sub_high_expression_genes": {"{}&{}".format(compare1, compare2): final_high_exp_dataset}})

         # remove keys that has been merged    
         data["global_sign_features"].pop(compare1)
         data["global_sign_features"].pop(compare2)

      # re calculate the markers of each cluster after merging 
      merge_keys = list(data["global_sign_features"].keys())
      data2_copy = data2.copy()
      data2_copy.loc[:, "cluster"] = data2_copy.loc[:, "cluster"].astype(str)
      for i in merge_keys:
         if re.search("&", i):
            merge_candidate = re.split("&", i)
            for j in merge_candidate:
                  data2_copy.loc[data2_copy.cluster == j, "cluster"] = str(i)
         
      new_markers = find_markers_ovr(data2_copy, 
                                       label_column = "cluster", 
                                       n_features_to_select = 0.15, 
                                       ratio_of_none_zero_counts = 0.1, 
                                       logger = logger, 
                                       save = False, 
                                       class_weight = "balanced", 
                                       n_jobs = -1, 
                                       variance_threshold = "mean",
                                       model = "svm",
                                       mutual_info = False,
                                       F_test = True,
                                       normalization_method = "Min-Max")
      
      data = {"global_sign_features": new_markers}

      
      return between_group_merge(data, data2 = data2, cut_off = cut_off + increase, increase = increase, n_merge = n_merge, focus = focus, ep_cut_off = ep_cut_off, pct_cut_off = pct_cut_off, logger = logger)
   
   else:
      return data


def find_keys(data, split_by = "vs"):
    """
    A function to split string by specified symbol.

    Parameters
    ----------
    data: 
        A string object.
    split_by: 
        A string to decide the regular expression to split the data.
    """
    
    res = re.split(split_by, data)
    output = list()
    for i in res:
        try:
            reg = [*re.compile("((\d+&?){1,100}|(\d+))").finditer(i)][0]
        except:
            raise Exception("Can't find numeric key")
        output.append(i[reg.start():reg.end()])
    return output


def detect_former_num(data):
    """
    A function to find the first numeric elements in a string.
    """
    res = re.split("vs", data)
    output = [*re.compile("((\d+&?){1,100}|(\d+))").finditer(res[0])][0]
    return res[0][output.start():output.end()]


def get_sankey_dataframe(data, plot = False):
    """
    A function to generate the dataframe for sankey plotting.
    
    Parameters
    ----------
    data: 
        A pandas dataframe. From first column to last column, it must conver the original source and final target in order. 
    plot: 
        A Bool value to decide whether plotting basic sankey diagram. 
    """

    # encode source and target labels
    colnames = list(data.columns)
    labels_pool = list()
    for i in range(data.shape[1]):
        data.iloc[:, i] = [f"{colnames[i]}_{j}" for j in data.iloc[:, i]]
        labels_pool.extend(list(set(data.iloc[:, i])))
    
    le = LabelEncoder().fit(labels_pool)

    for i in range(data.shape[1]):
        data.iloc[:, i] = le.transform(data.iloc[:, i])


    # for each source, calculate the numer of occasions to different target  
    source_target_num = data.shape[1]
    target = dict()
    for i in range(source_target_num-1):
        start = set(data.iloc[:, i])
        column = colnames[i]
        for j in start:
            sub_data = data.query("{} == @j".format(column))
            target_num = Counter(sub_data.iloc[:, i+1])
            for z in target_num.keys():
                target["{}_{}_{}".format(i, j, z)] = target_num.get(z)
    

    # detect whether one source has different targets
    # if yes, record the number of targets and target names
    replicate = list()
    for j in target.keys():
        new_source_key = re.split("_", j)[1]
        column = re.split("_", j)[0]
        key = f'{column}_{new_source_key}'
        replicate.append(key)
    
    replicate_count = dict()
    for i in replicate:
        count = 0
        member = list()
        for j in target.keys():
            if re.search(f"{i}_\d+", j):
              count += 1
              member.append(re.split("_", j)[2])
        replicate_count[f"{i}"] = count, member

    # generate the output sankey dataframe
    new_source = list()
    new_target = list()
    value = list()

    for i in replicate_count.keys():
        replicate_num = replicate_count[i][0]
        replicate_member = replicate_count[i][1]

        if replicate_num == 1:
            new_target.append(replicate_member[0])
            new_source.append(re.split("_", i)[1])
            value.append(target[f"{i}_{replicate_member[0]}"])
        else:
            new_target.extend(replicate_member)
            new_source.extend([re.split("_", i)[1]] * replicate_num)
            for j in replicate_member:
                value.append(target[f'{i}_{j}'])

    sankey_df = pd.DataFrame({"source": new_source, "target": new_target, "value": value})
 
    sankey_df = sankey_df.astype(int)
    labels = le.inverse_transform(np.unique(sankey_df[["source", "target"]].values))

    # plotting
    if plot:

        fig = go.Figure(data=[go.Sankey(
            node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels
            ),
            link = dict(
            source = sankey_df.source, 
            target = sankey_df.target,
            value =  sankey_df.value
            ))])

        fig.update_layout(title_text="Basic Sankey Diagram", font_size=10,width=1200, height=800)
        fig.show()
    
    return sankey_df, labels