Source code for CytoSimplex._select_top_features

from ._util import _check_cluster_vertices
from ._normalize import row_normalize
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from scipy.sparse import csr_matrix
from scipy import stats
from anndata import AnnData
from typing import Union, Optional


[docs]def select_top_features(
        x: Union[AnnData, np.ndarray, csr_matrix],
        cluster_var: Union[str, list, pd.Series],
        vertices: Union[list, dict],
        n_top: int = 30,
        processed: bool = False,
        lfc_thresh: float = 0.1,
        return_stats: bool = False,
        feature_names: Optional[str] = None
        ) -> Union[list, pd.DataFrame]:
    """
    Select top features for each vertices based on the wilcoxon test.

    Parameters
    ----------
    x
        The matrix of gene expression, where each row is a cell and each column
        is a gene. Recommended to be full size raw counts. (i.e. not
        log-transformed or normalized and not only for highly variable genes)
        When given :class:`anndata.AnnData`, `x.X` will be used.
    cluster_var
        The cluster assignment of each single cell. If `x` is an
        :class:`anndata.AnnData`, `cluster_var` can be a `str` that specifies
        the name of the cluster variable in `x.obs`. `list` or
        :class:`pandas.Series` is accepted in all cases, and the length must
        equal to `x.shape[0]`.
    vertices
        The terminal specifications. Depending on the type of simplex to be
        visualized in downstream, the number of vertices (n) should be
        determined by users. e.g. 3 elements for a 2-simplex (ternary simplex /
        triangle). Acceptable input include:

        - A :class:`list` of n :class:`str` that exist in the categories of
          `cluster_var`.
        - A :class:`dict` of n keys. The keys are presented as customizable
          vertex names. The corresponding value for each key can be either a
          :class:`str` for a single cluster, or a :class:`list` of :class:`str`
          for grouped vertex of multiple clusters.
    n_top
        The number of top features to select for each vertex.
    processed
        Whether the input matrix is already processed. If `False`, the input
        matrix will be log transformed and row normalized. If `True`, the input
        matrix will be directly used to calculate the rank-sum statistics. And
        logFC will be calculated assuming that the input matrix is
        log-transformed.
    lfc_thresh
        The log fold change threshold to select up-regulated genes.
    return_stats
        Whether to return the full statistics of all clusters and all features
        instead of only returning the selected top features by default.
    feature_names
        The names of the features in the matrix. If None, the feature names
        will be the index of the matrix.

    Returns
    -------
    selected : :class:`list`, when `return_stats=False`.
        The list of selected features. Maximum length is
        `n_top * len(vertices)` when enough features can pass the
        threshold.
    stats : :class:`pandas.DataFrame`, when `return_stats=True`.
        The statistics of the wilcoxon test, with `n_groups * n_features` rows.
        Columns are 'group', 'avgExpr', 'logFC', 'ustat', 'auc', 'pval',
        'padj', 'pct_in', 'pct_out' and 'feature'.

    Examples
    --------
        >>> import CytoSimplex as csx
        >>> import scanpy as sc
        >>> adata = sc.read(
        ...     filename="test.h5ad",
        ...     backup_url="https://figshare.com/ndownloader/files/41034857"
        ... )
        >>> vertices = {'OS': ["Osteoblast_1", "Osteoblast_2", "Osteoblast_3"],
        ...             'RE': ['Reticular_1', 'Reticular_2'],
        ...             'CH': ['Chondrocyte_1', 'Chondrocyte_2', 'Chondrocyte_3']}
        >>> gene = csx.select_top_features(adata, "cluster", vertices)
        >>> gene[:8]
        ['Nrk', 'Eps8l2', 'Mfi2', 'Fam101a', 'Scin', 'Sox5', 'Fbln7', 'Edil3']
        >>> stats = csx.select_top_features(adata, "cluster", vertices, return_stats=True)
            group    avgExpr     logFC   ustat     auc          pval          padj  pct_in  pct_out           feature
        0         CH   0.000000  0.000000  5000.0  0.5000           NaN           NaN     0.0      0.0               Rp1
        1         CH   0.000000  0.000000  5000.0  0.5000           NaN           NaN     0.0      0.0             Sox17
        2         CH   8.413918  5.771032  6948.0  0.6948  7.674938e-08  9.017050e-07    64.0     19.0            Mrpl15
        3         CH   4.627888  2.507528  5894.0  0.5894  4.888534e-03  1.529972e-02    36.0     15.5            Lypla1
        4         CH   0.256851  0.256851  5100.0  0.5100  4.999579e-02  1.110755e-01     2.0      0.0           Gm37988
        ...      ...        ...       ...     ...     ...           ...           ...     ...      ...               ...
        141696    RE   2.269271  0.260934  5105.0  0.5105  7.154050e-01  1.000000e+00    16.0     14.5              PISD
        141697    RE   2.593425 -0.267436  4968.0  0.4968  9.268655e-01  1.000000e+00    18.0     22.0             DHRSX
        141698    RE   0.000000 -0.185628  4925.0  0.4925  3.973749e-01  9.619204e-01     0.0      1.5    CAAA01147332.1
        141699    RE  16.347951  3.563943  7373.0  0.7373  2.048452e-07  1.117704e-05   100.0     80.0    tdT-WPRE_trans
        141700    RE   0.000000  0.000000  5000.0  0.5000           NaN           NaN     0.0      0.0  CreER-WPRE_trans
    """
    if isinstance(x, AnnData):
        feature_names = x.var_names
    else:
        if feature_names is None:
            feature_names = np.arange(x.shape[1])
        else:
            assert len(feature_names) == x.shape[1], \
                "feature_names must have the same length as the number of " \
                "columns in X."
    mat, grouping, vertices, cluster_var = \
        _check_cluster_vertices(x, cluster_var, vertices)
    if not processed:
        mat = row_normalize(mat)
        mat = np.log1p(1e10*mat)
    if (grouping.value_counts() > 0).sum() < 2:
        raise ValueError("Must have at least 2 non-empty groups defined.")
    stats = wilcoxon(mat, grouping)
    stats['feature'] = np.tile(feature_names, len(grouping.cat.categories))
    if return_stats:
        return stats
    stats = stats[stats['group'].isin(vertices)]
    stats = stats[stats['logFC'] > lfc_thresh]
    selected = []
    for i, sub in stats.groupby('group', sort=False):
        sub = sub.sort_values(['padj', 'logFC'], ascending=[True, False])
        selected.extend(sub['feature'].values[:n_top])
    return selected


def wilcoxon(X, y):
    """
    X - n cells by m features sparse matrix
    y - n cells categorical vector
    """
    # Calculate Primary Statistics
    if isinstance(X, csr_matrix):
        rank_mat, ties = _ranking_csr(X)
        # rankdata(X.toarray().T, axis=1)
    else:
        rank_mat = rankdata(X.T, axis=1)
        ties = [_count_ties(rank_mat[i]) for i in range(rank_mat.shape[0])]
    # rank_mat: m features by n cells
    # Get mann-whitney u stats
    group_size = np.array([y.value_counts(sort=False)]).T
    ustats = _compute_ustats(rank_mat, y, group_size)
    # Get p-values

    n1n2 = group_size * (X.shape[0] - group_size)
    auc = ustats / n1n2
    pvals = _compute_pval(ustats, ties, X.shape[0], n1n2)
    fdr = _compute_fdr(pvals)

    # Calculate Auxiliary Statistics
    group_sums = np.zeros((len(y.cat.categories), X.shape[1]))
    for i, cat in enumerate(y.cat.categories):
        group_sums[i, :] = X[y == cat, :].sum(axis=0)
    group_nnz = np.zeros((len(y.cat.categories), X.shape[1]))
    for i, cat in enumerate(y.cat.categories):
        group_nnz[i, :] = (X[y == cat, :] > 0).sum(axis=0)
    group_mean = group_sums / group_size
    group_pct_in = group_nnz / group_size
    group_pct_out = group_nnz.sum(0) - group_nnz
    group_pct_out = group_pct_out / (X.shape[0] - group_size)
    lfc = group_mean - (X.sum(0) - group_sums) / (X.shape[0] - group_size)
    if isinstance(X, csr_matrix):
        lfc_flat = np.array(lfc.flatten())[0]
    else:
        lfc_flat = lfc.flatten()
    final = pd.DataFrame({
        'group': np.repeat(y.cat.categories, X.shape[1]),
        'avgExpr': group_mean.flatten(),
        'logFC': lfc_flat,
        'ustat': ustats.flatten(),
        'auc': auc.flatten(),
        'pval': pvals.flatten(),
        'padj': fdr.flatten(),
        'pct_in': 100*group_pct_in.flatten(),
        'pct_out': 100*group_pct_out.flatten()
    })
    return final


def _count_ties(rank_mat):
    """
    rank_mat - 1D array, length n cells
    """
    _, counts = np.unique(rank_mat, return_counts=True)
    return counts[counts > 1]


def _compute_ustats(Rank, y, group_size):
    """
    Rank       - m features x n cells matrix of ranking
    y          - n cells categorical pd.Series
    group_size - n groups x 1 array of group sizes
    """
    sums = np.zeros((len(y.cat.categories), Rank.shape[0]))
    for i, cat in enumerate(y.cat.categories):
        sums[i, :] = Rank[:, y == cat].sum(axis=1).T
    rhs = group_size * (group_size + 1) / 2
    if isinstance(Rank, np.ndarray):
        ustat = sums - rhs
    else:
        # Count for the rank sums of zeros, because ranking for csr matrix
        # does not count for zeros.
        group_nnz = np.zeros((len(y.cat.categories), Rank.shape[0]))
        for i, cat in enumerate(y.cat.categories):
            group_nnz[i, :] = Rank[:, y == cat].getnnz(axis=1)
        group_n_zero = group_size - group_nnz
        zero_ranks = (Rank.shape[1] - np.diff(Rank.indptr) + 1)/2
        ustat = group_n_zero * zero_ranks + sums - rhs
    return ustat


def _compute_pval(ustats, ties, N, n1n2):
    z = ustats - np.array([n1n2/2]).T
    z = z - np.sign(z)/0.5
    x1 = N ** 3 - N
    x2 = 1 / (12 * (N ** 2 - N))
    rhs = np.array([x1 - np.sum(tvals ** 3 - tvals) for tvals in ties]) * x2
    usigma = np.dot(n1n2, np.array([rhs]))
    usigma = np.sqrt(usigma)
    z = z / usigma
    pvals = 2 * stats.norm.cdf(-np.abs(z))
    return pvals[0]


def _compute_fdr(p_vals):
    ranked_p_values = rankdata(p_vals, axis=1)
    fdr = p_vals * p_vals.shape[1] / ranked_p_values
    fdr[fdr > 1] = 1
    return fdr


def _ranking_csr(X):
    """
    X - n cells by m features sparse matrix of gene expression
    This function only counts for the ranking of nonzero values in the
        csr_matrix.
    """
    rank_csr = X.copy()
    # Create a transposed row major csr matrix of ranks that has the same
    # sparse structure. So each row is a feaeture and diff(indptr) helps easily
    # locate nonzero values in each row.
    rank_csr = csr_matrix(rank_csr.T)
    all_ties = []
    nz_data_idx = 0
    # Vector of number of nonzero values per row
    nnz_per_row = np.diff(rank_csr.indptr)
    for nnz in nnz_per_row:
        # nnz: number of nonzero values in the row
        nz_slice = slice(nz_data_idx, nz_data_idx+nnz)
        rank = rankdata(rank_csr.data[nz_slice], method='average')
        n_zero = rank_csr.shape[1] - nnz
        rank += n_zero
        ties = [n_zero]
        ties.extend(np.unique(rank, return_counts=True)[1])
        all_ties.append(np.array(ties))
        # Inserting the rank values into the data array of the csr matrix
        # bc using [row,col] index creates copies of the matrix and is slow
        rank_csr.data[nz_slice] = rank
        nz_data_idx += nnz
    return rank_csr, all_ties