Source code for CytoSimplex._select_top_features

from ._util import _check_cluster_vertices
from ._normalize import row_normalize
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from scipy.sparse import csr_matrix
from scipy import stats
from anndata import AnnData
from typing import Union, Optional


[docs]def select_top_features( x: Union[AnnData, np.ndarray, csr_matrix], cluster_var: Union[str, list, pd.Series], vertices: Union[list, dict], n_top: int = 30, processed: bool = False, lfc_thresh: float = 0.1, return_stats: bool = False, feature_names: Optional[str] = None ) -> Union[list, pd.DataFrame]: """ Select top features for each vertices based on the wilcoxon test. Parameters ---------- x The matrix of gene expression, where each row is a cell and each column is a gene. Recommended to be full size raw counts. (i.e. not log-transformed or normalized and not only for highly variable genes) When given :class:`anndata.AnnData`, `x.X` will be used. cluster_var The cluster assignment of each single cell. If `x` is an :class:`anndata.AnnData`, `cluster_var` can be a `str` that specifies the name of the cluster variable in `x.obs`. `list` or :class:`pandas.Series` is accepted in all cases, and the length must equal to `x.shape[0]`. vertices The terminal specifications. Depending on the type of simplex to be visualized in downstream, the number of vertices (n) should be determined by users. e.g. 3 elements for a 2-simplex (ternary simplex / triangle). Acceptable input include: - A :class:`list` of n :class:`str` that exist in the categories of `cluster_var`. - A :class:`dict` of n keys. The keys are presented as customizable vertex names. The corresponding value for each key can be either a :class:`str` for a single cluster, or a :class:`list` of :class:`str` for grouped vertex of multiple clusters. n_top The number of top features to select for each vertex. processed Whether the input matrix is already processed. If `False`, the input matrix will be log transformed and row normalized. If `True`, the input matrix will be directly used to calculate the rank-sum statistics. And logFC will be calculated assuming that the input matrix is log-transformed. lfc_thresh The log fold change threshold to select up-regulated genes. return_stats Whether to return the full statistics of all clusters and all features instead of only returning the selected top features by default. feature_names The names of the features in the matrix. If None, the feature names will be the index of the matrix. Returns ------- selected : :class:`list`, when `return_stats=False`. The list of selected features. Maximum length is `n_top * len(vertices)` when enough features can pass the threshold. stats : :class:`pandas.DataFrame`, when `return_stats=True`. The statistics of the wilcoxon test, with `n_groups * n_features` rows. Columns are 'group', 'avgExpr', 'logFC', 'ustat', 'auc', 'pval', 'padj', 'pct_in', 'pct_out' and 'feature'. Examples -------- >>> import CytoSimplex as csx >>> import scanpy as sc >>> adata = sc.read( ... filename="test.h5ad", ... backup_url="https://figshare.com/ndownloader/files/41034857" ... ) >>> vertices = {'OS': ["Osteoblast_1", "Osteoblast_2", "Osteoblast_3"], ... 'RE': ['Reticular_1', 'Reticular_2'], ... 'CH': ['Chondrocyte_1', 'Chondrocyte_2', 'Chondrocyte_3']} >>> gene = csx.select_top_features(adata, "cluster", vertices) >>> gene[:8] ['Nrk', 'Eps8l2', 'Mfi2', 'Fam101a', 'Scin', 'Sox5', 'Fbln7', 'Edil3'] >>> stats = csx.select_top_features(adata, "cluster", vertices, return_stats=True) group avgExpr logFC ustat auc pval padj pct_in pct_out feature 0 CH 0.000000 0.000000 5000.0 0.5000 NaN NaN 0.0 0.0 Rp1 1 CH 0.000000 0.000000 5000.0 0.5000 NaN NaN 0.0 0.0 Sox17 2 CH 8.413918 5.771032 6948.0 0.6948 7.674938e-08 9.017050e-07 64.0 19.0 Mrpl15 3 CH 4.627888 2.507528 5894.0 0.5894 4.888534e-03 1.529972e-02 36.0 15.5 Lypla1 4 CH 0.256851 0.256851 5100.0 0.5100 4.999579e-02 1.110755e-01 2.0 0.0 Gm37988 ... ... ... ... ... ... ... ... ... ... ... 141696 RE 2.269271 0.260934 5105.0 0.5105 7.154050e-01 1.000000e+00 16.0 14.5 PISD 141697 RE 2.593425 -0.267436 4968.0 0.4968 9.268655e-01 1.000000e+00 18.0 22.0 DHRSX 141698 RE 0.000000 -0.185628 4925.0 0.4925 3.973749e-01 9.619204e-01 0.0 1.5 CAAA01147332.1 141699 RE 16.347951 3.563943 7373.0 0.7373 2.048452e-07 1.117704e-05 100.0 80.0 tdT-WPRE_trans 141700 RE 0.000000 0.000000 5000.0 0.5000 NaN NaN 0.0 0.0 CreER-WPRE_trans """ if isinstance(x, AnnData): feature_names = x.var_names else: if feature_names is None: feature_names = np.arange(x.shape[1]) else: assert len(feature_names) == x.shape[1], \ "feature_names must have the same length as the number of " \ "columns in X." mat, grouping, vertices, cluster_var = \ _check_cluster_vertices(x, cluster_var, vertices) if not processed: mat = row_normalize(mat) mat = np.log1p(1e10*mat) if (grouping.value_counts() > 0).sum() < 2: raise ValueError("Must have at least 2 non-empty groups defined.") stats = wilcoxon(mat, grouping) stats['feature'] = np.tile(feature_names, len(grouping.cat.categories)) if return_stats: return stats stats = stats[stats['group'].isin(vertices)] stats = stats[stats['logFC'] > lfc_thresh] selected = [] for i, sub in stats.groupby('group', sort=False): sub = sub.sort_values(['padj', 'logFC'], ascending=[True, False]) selected.extend(sub['feature'].values[:n_top]) return selected
def wilcoxon(X, y): """ X - n cells by m features sparse matrix y - n cells categorical vector """ # Calculate Primary Statistics if isinstance(X, csr_matrix): rank_mat, ties = _ranking_csr(X) # rankdata(X.toarray().T, axis=1) else: rank_mat = rankdata(X.T, axis=1) ties = [_count_ties(rank_mat[i]) for i in range(rank_mat.shape[0])] # rank_mat: m features by n cells # Get mann-whitney u stats group_size = np.array([y.value_counts(sort=False)]).T ustats = _compute_ustats(rank_mat, y, group_size) # Get p-values n1n2 = group_size * (X.shape[0] - group_size) auc = ustats / n1n2 pvals = _compute_pval(ustats, ties, X.shape[0], n1n2) fdr = _compute_fdr(pvals) # Calculate Auxiliary Statistics group_sums = np.zeros((len(y.cat.categories), X.shape[1])) for i, cat in enumerate(y.cat.categories): group_sums[i, :] = X[y == cat, :].sum(axis=0) group_nnz = np.zeros((len(y.cat.categories), X.shape[1])) for i, cat in enumerate(y.cat.categories): group_nnz[i, :] = (X[y == cat, :] > 0).sum(axis=0) group_mean = group_sums / group_size group_pct_in = group_nnz / group_size group_pct_out = group_nnz.sum(0) - group_nnz group_pct_out = group_pct_out / (X.shape[0] - group_size) lfc = group_mean - (X.sum(0) - group_sums) / (X.shape[0] - group_size) if isinstance(X, csr_matrix): lfc_flat = np.array(lfc.flatten())[0] else: lfc_flat = lfc.flatten() final = pd.DataFrame({ 'group': np.repeat(y.cat.categories, X.shape[1]), 'avgExpr': group_mean.flatten(), 'logFC': lfc_flat, 'ustat': ustats.flatten(), 'auc': auc.flatten(), 'pval': pvals.flatten(), 'padj': fdr.flatten(), 'pct_in': 100*group_pct_in.flatten(), 'pct_out': 100*group_pct_out.flatten() }) return final def _count_ties(rank_mat): """ rank_mat - 1D array, length n cells """ _, counts = np.unique(rank_mat, return_counts=True) return counts[counts > 1] def _compute_ustats(Rank, y, group_size): """ Rank - m features x n cells matrix of ranking y - n cells categorical pd.Series group_size - n groups x 1 array of group sizes """ sums = np.zeros((len(y.cat.categories), Rank.shape[0])) for i, cat in enumerate(y.cat.categories): sums[i, :] = Rank[:, y == cat].sum(axis=1).T rhs = group_size * (group_size + 1) / 2 if isinstance(Rank, np.ndarray): ustat = sums - rhs else: # Count for the rank sums of zeros, because ranking for csr matrix # does not count for zeros. group_nnz = np.zeros((len(y.cat.categories), Rank.shape[0])) for i, cat in enumerate(y.cat.categories): group_nnz[i, :] = Rank[:, y == cat].getnnz(axis=1) group_n_zero = group_size - group_nnz zero_ranks = (Rank.shape[1] - np.diff(Rank.indptr) + 1)/2 ustat = group_n_zero * zero_ranks + sums - rhs return ustat def _compute_pval(ustats, ties, N, n1n2): z = ustats - np.array([n1n2/2]).T z = z - np.sign(z)/0.5 x1 = N ** 3 - N x2 = 1 / (12 * (N ** 2 - N)) rhs = np.array([x1 - np.sum(tvals ** 3 - tvals) for tvals in ties]) * x2 usigma = np.dot(n1n2, np.array([rhs])) usigma = np.sqrt(usigma) z = z / usigma pvals = 2 * stats.norm.cdf(-np.abs(z)) return pvals[0] def _compute_fdr(p_vals): ranked_p_values = rankdata(p_vals, axis=1) fdr = p_vals * p_vals.shape[1] / ranked_p_values fdr[fdr > 1] = 1 return fdr def _ranking_csr(X): """ X - n cells by m features sparse matrix of gene expression This function only counts for the ranking of nonzero values in the csr_matrix. """ rank_csr = X.copy() # Create a transposed row major csr matrix of ranks that has the same # sparse structure. So each row is a feaeture and diff(indptr) helps easily # locate nonzero values in each row. rank_csr = csr_matrix(rank_csr.T) all_ties = [] nz_data_idx = 0 # Vector of number of nonzero values per row nnz_per_row = np.diff(rank_csr.indptr) for nnz in nnz_per_row: # nnz: number of nonzero values in the row nz_slice = slice(nz_data_idx, nz_data_idx+nnz) rank = rankdata(rank_csr.data[nz_slice], method='average') n_zero = rank_csr.shape[1] - nnz rank += n_zero ties = [n_zero] ties.extend(np.unique(rank, return_counts=True)[1]) all_ties.append(np.array(ties)) # Inserting the rank values into the data array of the csr matrix # bc using [row,col] index creates copies of the matrix and is slow rank_csr.data[nz_slice] = rank nz_data_idx += nnz return rank_csr, all_ties