Package cNMF
Consensus Non-negative Matrix Factorization
Expand source code
# -*- coding: utf-8 -*-
"""
Consensus Non-negative Matrix Factorization
"""
from .cnmf import (
cNMF,
cnmf_load_results,
prepare,
factorize,
combine,
consensus,
k_selection,
)
__all__ = [
"cNMF",
"cnmf_load_results",
"prepare",
"factorize",
"combine",
"consensus",
"k_selection",
]
from ._version import get_versions
__version__ = get_versions()["version"]
del get_versions
Sub-modules
cNMF.cnmf
-
Consensus non-negative matrix factorization (cNMF) adapted from (Kotliar, et al. 2019)
cNMF.cnmf_parallel
-
Entire cNMF pipeline run in parallel using GNU parallel adapted from (Kotliar, et al. 2019)
Functions
def cnmf_load_results(adata, cnmf_dir, name, k, dt, key='cnmf', **kwargs)
-
Load results of cNMF
Given adata object and corresponding cNMF output (cnmf_dir, name, k, dt to identify), read in relevant results and save to adata object inplace, and output plot of gene loadings for each GEP usage.
Parameters
adata
:AnnData.AnnData
- AnnData object
cnmf_dir
:str
- relative path to directory containing cNMF outputs
name
:str
- name of cNMF replicate
k
:int
- value used for consensus factorization
dt
:int
- distance threshold value used for consensus clustering
key
:str
, optional(default="cnmf")
- prefix of adata.uns keys to save
n_points
:int
- how many top genes to include in rank_genes() plot
**kwargs
:optional (default=None)
- keyword args to pass to cnmf_markers()
Returns
adata
:AnnData.AnnData
adata
is edited in place to include overdispersed genes (adata.var["cnmf_overdispersed"]
), usages (adata.obs["usage_#"]
,adata.obsm["cnmf_usages"]
), gene spectra scores (adata.varm["cnmf_spectra"]
), and list of top genes by spectra score (adata.uns["cnmf_markers"]
).
Expand source code
def cnmf_load_results(adata, cnmf_dir, name, k, dt, key="cnmf", **kwargs): """ Load results of cNMF Given adata object and corresponding cNMF output (cnmf_dir, name, k, dt to identify), read in relevant results and save to adata object inplace, and output plot of gene loadings for each GEP usage. Parameters ---------- adata : AnnData.AnnData AnnData object cnmf_dir : str relative path to directory containing cNMF outputs name : str name of cNMF replicate k : int value used for consensus factorization dt : int distance threshold value used for consensus clustering key : str, optional (default="cnmf") prefix of adata.uns keys to save n_points : int how many top genes to include in rank_genes() plot **kwargs : optional (default=None) keyword args to pass to cnmf_markers() Returns ------- adata : AnnData.AnnData `adata` is edited in place to include overdispersed genes (`adata.var["cnmf_overdispersed"]`), usages (`adata.obs["usage_#"]`, `adata.obsm["cnmf_usages"]`), gene spectra scores (`adata.varm["cnmf_spectra"]`), and list of top genes by spectra score (`adata.uns["cnmf_markers"]`). """ # read in cell usages usage = pd.read_csv( "{}/{}/{}.usages.k_{}.dt_{}.consensus.txt".format( cnmf_dir, name, name, str(k), str(dt).replace(".", "_") ), sep="\t", index_col=0, ) usage.columns = ["usage_" + str(col) for col in usage.columns] # normalize usages to total for each cell usage_norm = usage.div(usage.sum(axis=1), axis=0) usage_norm.index = usage_norm.index.astype(str) # add usages to .obs for visualization adata.obs = pd.merge( left=adata.obs, right=usage_norm, how="left", left_index=True, right_index=True ) # replace missing values with zeros for all factors adata.obs.loc[:, usage_norm.columns].fillna(value=0, inplace=True) # add usages as array in .obsm for dimension reduction adata.obsm["cnmf_usages"] = adata.obs.loc[:, usage_norm.columns].values # read in overdispersed genes determined by cNMF and add as metadata to adata.var overdispersed = np.genfromtxt( "{}/{}/{}.overdispersed_genes.txt".format(cnmf_dir, name, name), delimiter="\t", dtype=str, ) adata.var["cnmf_overdispersed"] = 0 adata.var.loc[ [x for x in adata.var.index if x in overdispersed], "cnmf_overdispersed" ] = 1 # read top gene loadings for each GEP usage and save to adata.uns['cnmf_markers'] cnmf_markers( adata, "{}/{}/{}.gene_spectra_score.k_{}.dt_{}.txt".format( cnmf_dir, name, name, str(k), str(dt).replace(".", "_") ), key=key, **kwargs )
def combine(args)
-
Expand source code
def combine(args): argdict = vars(args) cnmf_obj = cNMF(output_dir=argdict["output_dir"], name=argdict["name"]) cnmf_obj._initialize_dirs() run_params = load_df_from_npz(cnmf_obj.paths["nmf_replicate_parameters"]) if type(args.components) is int: ks = [args.components] elif argdict["components"] is None: ks = sorted(set(run_params.n_components)) else: ks = argdict["components"] for k in ks: cnmf_obj.combine_nmf(k)
def consensus(args)
-
Expand source code
def consensus(args): argdict = vars(args) cnmf_obj = cNMF(output_dir=argdict["output_dir"], name=argdict["name"]) cnmf_obj._initialize_dirs() run_params = load_df_from_npz(cnmf_obj.paths["nmf_replicate_parameters"]) if argdict["auto_k"]: argdict["components"] = pick_k(cnmf_obj.paths["k_selection_stats"]) if type(argdict["components"]) is int: ks = [argdict["components"]] elif argdict["components"] is None: ks = sorted(set(run_params.n_components)) else: ks = argdict["components"] for k in ks: merged_spectra = load_df_from_npz(cnmf_obj.paths["merged_spectra"] % k) cnmf_obj.consensus( k, argdict["local_density_threshold"], argdict["local_neighborhood_size"], ) tpm = sc.read(cnmf_obj.paths["tpm"]) tpm.X = tpm.layers["raw_counts"].copy() cnmf_load_results( tpm, cnmf_dir=cnmf_obj.output_dir, name=cnmf_obj.name, k=k, dt=argdict["local_density_threshold"], key="cnmf", ) tpm.write( os.path.join( cnmf_obj.output_dir, cnmf_obj.name, cnmf_obj.name + "_k{}_dt{}.h5ad".format( str(k), str(argdict["local_density_threshold"]).replace(".", "_"), ), ), compression="gzip", ) if argdict["cleanup"]: files = ( glob.glob("{}/{}/*.consensus.*".format(args.output_dir, args.name)) + glob.glob( "{}/{}/cnmf_tmp/*.consensus.*".format(args.output_dir, args.name) ) + glob.glob("{}/{}/*.gene_spectra_*".format(args.output_dir, args.name)) + glob.glob( "{}/{}/cnmf_tmp/*.gene_spectra_*".format(args.output_dir, args.name) ) + glob.glob( "{}/{}/cnmf_tmp/*.local_density_cache.*".format( args.output_dir, args.name ) ) + glob.glob("{}/{}/cnmf_tmp/*.stats.*".format(args.output_dir, args.name)) ) for file in files: os.remove(file)
def factorize(args)
-
Expand source code
def factorize(args): argdict = vars(args) cnmf_obj = cNMF(output_dir=argdict["output_dir"], name=argdict["name"]) cnmf_obj._initialize_dirs() cnmf_obj.run_nmf(worker_i=argdict["worker_index"], total_workers=argdict["n_jobs"])
def k_selection(args)
-
Expand source code
def k_selection(args): argdict = vars(args) cnmf_obj = cNMF(output_dir=argdict["output_dir"], name=argdict["name"]) cnmf_obj._initialize_dirs() cnmf_obj.k_selection_plot()
def prepare(args)
-
Expand source code
def prepare(args): argdict = vars(args) cnmf_obj = cNMF(output_dir=argdict["output_dir"], name=argdict["name"]) cnmf_obj._initialize_dirs() print("Reading in counts from {} - ".format(argdict["counts"]), end="") if argdict["counts"].endswith(".h5ad"): input_counts = sc.read(argdict["counts"]) else: ## Load txt or compressed dataframe and convert to scanpy object if argdict["counts"].endswith(".npz"): input_counts = load_df_from_npz(argdict["counts"]) else: input_counts = pd.read_csv(argdict["counts"], sep="\t", index_col=0) if argdict["densify"]: input_counts = sc.AnnData( X=input_counts.values, obs=pd.DataFrame(index=input_counts.index), var=pd.DataFrame(index=input_counts.columns), ) else: input_counts = sc.AnnData( X=sp.csr_matrix(input_counts.values), obs=pd.DataFrame(index=input_counts.index), var=pd.DataFrame(index=input_counts.columns), ) print("{} cells and {} genes".format(input_counts.n_obs, input_counts.n_vars)) # use desired layer if not .X if args.layer is not None: print("Using layer '{}' for cNMF".format(args.layer)) input_counts.X = input_counts.layers[args.layer].copy() if sp.issparse(input_counts.X) & argdict["densify"]: input_counts.X = np.array(input_counts.X.todense()) if argdict["tpm"] is None: tpm = compute_tpm(input_counts) elif argdict["tpm"].endswith(".h5ad"): subprocess.call( "cp %s %s" % (argdict["tpm"], cnmf_obj.paths["tpm"]), shell=True ) tpm = sc.read(cnmf_obj.paths["tpm"]) else: if argdict["tpm"].endswith(".npz"): tpm = load_df_from_npz(argdict["tpm"]) else: tpm = pd.read_csv(argdict["tpm"], sep="\t", index_col=0) if argdict["densify"]: tpm = sc.AnnData( X=tpm.values, obs=pd.DataFrame(index=tpm.index), var=pd.DataFrame(index=tpm.columns), ) else: tpm = sc.AnnData( X=sp.csr_matrix(tpm.values), obs=pd.DataFrame(index=tpm.index), var=pd.DataFrame(index=tpm.columns), ) if argdict["subset"]: tpm = subset_adata(tpm, subset=argdict["subset"]) n_null = tpm.n_vars - tpm.X.sum(axis=0).astype(bool).sum() if n_null > 0: sc.pp.filter_genes(tpm, min_counts=1) print( "Removing {} genes with zero counts; final shape {}".format( n_null, tpm.shape ) ) # replace var_names with desired .var column (to switch symbol for Ensembl ID) if args.gene_symbol_col is not None: input_counts = replace_var_names_adata( input_counts, var_col=args.gene_symbol_col ) # replace var_names with desired .var column (to switch symbol for Ensembl ID) if args.gene_symbol_col is not None: tpm = replace_var_names_adata(tpm, var_col=args.gene_symbol_col, verbose=False) tpm.write(cnmf_obj.paths["tpm"], compression="gzip") if sp.issparse(tpm.X): gene_tpm_mean = np.array(tpm.X.mean(axis=0)).reshape(-1) gene_tpm_stddev = var_sparse_matrix(tpm.X) ** 0.5 else: gene_tpm_mean = np.array(tpm.X.mean(axis=0)).reshape(-1) gene_tpm_stddev = np.array(tpm.X.std(axis=0, ddof=0)).reshape(-1) input_tpm_stats = pd.DataFrame( [gene_tpm_mean, gene_tpm_stddev], index=["__mean", "__std"] ).T save_df_to_npz(input_tpm_stats, cnmf_obj.paths["tpm_stats"]) if argdict["genes_file"] is not None: highvargenes = open(argdict["genes_file"]).read().rstrip().split("\n") else: highvargenes = None norm_counts = cnmf_obj.get_norm_counts( input_counts, tpm, num_highvar_genes=argdict["numgenes"], high_variance_genes_filter=highvargenes, ) cnmf_obj.save_norm_counts(norm_counts) (replicate_params, run_params) = cnmf_obj.get_nmf_iter_params( ks=argdict["components"], n_iter=argdict["n_iter"], random_state_seed=argdict["seed"], beta_loss=argdict["beta_loss"], ) cnmf_obj.save_nmf_iter_params(replicate_params, run_params)
Classes
class cNMF (output_dir='.', name=None)
-
Consensus NMF object
Containerizes the cNMF inputs and outputs to allow for easy pipelining
Parameters
output_dir
:path
, optional(default=".")
- Output directory for analysis files.
name
:string
, optional(default=None)
- A name for this analysis. Will be prefixed to all output files. If set to None, will be automatically generated from date (and random string).
Expand source code
class cNMF: """ Consensus NMF object Containerizes the cNMF inputs and outputs to allow for easy pipelining """ def __init__(self, output_dir=".", name=None): """ Parameters ---------- output_dir : path, optional (default=".") Output directory for analysis files. name : string, optional (default=None) A name for this analysis. Will be prefixed to all output files. If set to None, will be automatically generated from date (and random string). """ self.output_dir = output_dir if name is None: now = datetime.datetime.now() rand_hash = uuid.uuid4().hex[:6] name = "%s_%s" % (now.strftime("%Y_%m_%d"), rand_hash) self.name = name self.paths = None def _initialize_dirs(self): if self.paths is None: # Check that output directory exists, create it if needed. check_dir_exists(self.output_dir) check_dir_exists(os.path.join(self.output_dir, self.name)) check_dir_exists(os.path.join(self.output_dir, self.name, "cnmf_tmp")) self.paths = { "normalized_counts": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".norm_counts.h5ad", ), "nmf_replicate_parameters": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".nmf_params.df.npz", ), "nmf_run_parameters": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".nmf_idvrun_params.yaml", ), "nmf_genes_list": os.path.join( self.output_dir, self.name, self.name + ".overdispersed_genes.txt" ), "tpm": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".tpm.h5ad" ), "tpm_stats": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".tpm_stats.df.npz", ), "iter_spectra": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".spectra.k_%d.iter_%d.df.npz", ), "iter_usages": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".usages.k_%d.iter_%d.df.npz", ), "merged_spectra": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".spectra.k_%d.merged.df.npz", ), "local_density_cache": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".local_density_cache.k_%d.merged.df.npz", ), "consensus_spectra": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".spectra.k_%d.dt_%s.consensus.df.npz", ), "consensus_spectra__txt": os.path.join( self.output_dir, self.name, self.name + ".spectra.k_%d.dt_%s.consensus.txt", ), "consensus_usages": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".usages.k_%d.dt_%s.consensus.df.npz", ), "consensus_usages__txt": os.path.join( self.output_dir, self.name, self.name + ".usages.k_%d.dt_%s.consensus.txt", ), "consensus_stats": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".stats.k_%d.dt_%s.df.npz", ), "clustering_plot": os.path.join( self.output_dir, self.name, self.name + ".clustering.k_%d.dt_%s.png" ), "gene_spectra_score": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".gene_spectra_score.k_%d.dt_%s.df.npz", ), "gene_spectra_score__txt": os.path.join( self.output_dir, self.name, self.name + ".gene_spectra_score.k_%d.dt_%s.txt", ), "gene_spectra_tpm": os.path.join( self.output_dir, self.name, "cnmf_tmp", self.name + ".gene_spectra_tpm.k_%d.dt_%s.df.npz", ), "gene_spectra_tpm__txt": os.path.join( self.output_dir, self.name, self.name + ".gene_spectra_tpm.k_%d.dt_%s.txt", ), "k_selection_plot": os.path.join( self.output_dir, self.name, self.name + ".k_selection.png" ), "k_selection_stats": os.path.join( self.output_dir, self.name, self.name + ".k_selection_stats.df.npz" ), } def get_norm_counts( self, counts, tpm, high_variance_genes_filter=None, num_highvar_genes=None ): """ Parameters ---------- counts : anndata.AnnData Scanpy AnnData object (cells x genes) containing raw counts. Filtered such that no genes or cells with 0 counts tpm : anndata.AnnData Scanpy AnnData object (cells x genes) containing tpm normalized data matching counts high_variance_genes_filter : np.array, optional (default=None) A pre-specified list of genes considered to be high-variance. Only these genes will be used during factorization of the counts matrix. Must match the .var index of counts and tpm. If set to `None`, high-variance genes will be automatically computed, using the parameters below. num_highvar_genes : int, optional (default=None) Instead of providing an array of high-variance genes, identify this many most overdispersed genes for filtering Returns ------- normcounts : anndata.AnnData, shape (cells, num_highvar_genes) A counts matrix containing only the high variance genes and with columns (genes) normalized to unit variance """ if high_variance_genes_filter is None: ## Get list of high-var genes if one wasn't provided if sp.issparse(tpm.X): (gene_counts_stats, gene_fano_params) = get_highvar_genes_sparse( tpm.X, numgenes=num_highvar_genes ) else: (gene_counts_stats, gene_fano_params) = get_highvar_genes( np.array(tpm.X), numgenes=num_highvar_genes ) high_variance_genes_filter = list( tpm.var.index[gene_counts_stats.high_var.values] ) ## Subset out high-variance genes print( "Selecting {} highly variable genes".format(len(high_variance_genes_filter)) ) norm_counts = counts[:, high_variance_genes_filter] norm_counts = norm_counts[tpm.obs_names, :].copy() ## Scale genes to unit variance if sp.issparse(tpm.X): sc.pp.scale(norm_counts, zero_center=False) if np.isnan(norm_counts.X.data).sum() > 0: print("Warning: NaNs in normalized counts matrix") else: norm_counts.X /= norm_counts.X.std(axis=0, ddof=1) if np.isnan(norm_counts.X).sum().sum() > 0: print("Warning: NaNs in normalized counts matrix") ## Save a \n-delimited list of the high-variance genes used for factorization open(self.paths["nmf_genes_list"], "w").write( "\n".join(high_variance_genes_filter) ) ## Check for any cells that have 0 counts of the overdispersed genes zerocells = norm_counts.X.sum(axis=1) == 0 if zerocells.sum() > 0: print( "Warning: %d cells have zero counts of overdispersed genes - ignoring these cells for factorization." % (zerocells.sum()) ) sc.pp.filter_cells(norm_counts, min_counts=1) return norm_counts def save_norm_counts(self, norm_counts): self._initialize_dirs() norm_counts.write(self.paths["normalized_counts"], compression="gzip") def get_nmf_iter_params( self, ks, n_iter=100, random_state_seed=None, beta_loss="kullback-leibler" ): """ Creates a DataFrame with parameters for NMF iterations Parameters ---------- ks : integer, or list-like. Number of topics (components) for factorization. Several values can be specified at the same time, which will be run independently. n_iter : integer, optional (defailt=100) Number of iterations for factorization. If several `k` are specified, this many iterations will be run for each value of `k`. random_state_seed : int or None, optional (default=None) Seed for sklearn random state. """ if type(ks) is int: ks = [ks] # Remove any repeated k values, and order. k_list = sorted(set(list(ks))) n_runs = len(ks) * n_iter np.random.seed(seed=random_state_seed) nmf_seeds = np.random.randint(low=1, high=(2**32) - 1, size=n_runs) replicate_params = [] for i, (k, r) in enumerate(itertools.product(k_list, range(n_iter))): replicate_params.append([k, r, nmf_seeds[i]]) replicate_params = pd.DataFrame( replicate_params, columns=["n_components", "iter", "nmf_seed"] ) _nmf_kwargs = dict( alpha_W=0.0, l1_ratio=0.0, beta_loss=beta_loss, solver="mu", tol=1e-4, max_iter=400, alpha_H="same", init="random", ) ## Coordinate descent is faster than multiplicative update but only works for frobenius if beta_loss == "frobenius": _nmf_kwargs["solver"] = "cd" return (replicate_params, _nmf_kwargs) def save_nmf_iter_params(self, replicate_params, run_params): self._initialize_dirs() save_df_to_npz(replicate_params, self.paths["nmf_replicate_parameters"]) with open(self.paths["nmf_run_parameters"], "w") as F: yaml.dump(run_params, F) def _nmf(self, X, nmf_kwargs): """ Parameters ---------- X : pandas.DataFrame, Normalized counts dataFrame to be factorized. nmf_kwargs : dict, Arguments to be passed to `non_negative_factorization` """ (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs) return (spectra, usages) def run_nmf( self, worker_i=1, total_workers=1, ): """ Iteratively runs NMF with prespecified parameters Use the `worker_i` and `total_workers` parameters for parallelization. Generic kwargs for NMF are loaded from `self.paths['nmf_run_parameters']`, defaults below:: `non_negative_factorization` default arguments: alpha_W=0.0 l1_ratio=0.0 beta_loss='kullback-leibler' solver='mu' tol=1e-4, max_iter=200 alpha_H=None init='random' random_state, n_components are both set by the prespecified self.paths['nmf_replicate_parameters']. Parameters ---------- norm_counts : pandas.DataFrame, Normalized counts dataFrame to be factorized. (Output of `normalize_counts`) run_params : pandas.DataFrame, Parameters for NMF iterations. (Output of `prepare_nmf_iter_params`) """ self._initialize_dirs() run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) norm_counts = sc.read(self.paths["normalized_counts"]) _nmf_kwargs = yaml.load( open(self.paths["nmf_run_parameters"]), Loader=yaml.FullLoader ) jobs_for_this_worker = worker_filter( range(len(run_params)), worker_i, total_workers ) for idx in jobs_for_this_worker: p = run_params.iloc[idx, :] print("[Worker %d]. Starting task %d." % (worker_i, idx)) _nmf_kwargs["random_state"] = p["nmf_seed"] _nmf_kwargs["n_components"] = p["n_components"] (spectra, usages) = self._nmf(norm_counts.X, _nmf_kwargs) spectra = pd.DataFrame( spectra, index=np.arange(1, _nmf_kwargs["n_components"] + 1), columns=norm_counts.var.index, ) save_df_to_npz( spectra, self.paths["iter_spectra"] % (p["n_components"], p["iter"]) ) def combine_nmf(self, k, remove_individual_iterations=False): run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) print("Combining factorizations for k=%d." % k) self._initialize_dirs() combined_spectra = None n_iter = sum(run_params.n_components == k) run_params_subset = run_params[run_params.n_components == k].sort_values("iter") spectra_labels = [] for i, p in run_params_subset.iterrows(): spectra = load_df_from_npz( self.paths["iter_spectra"] % (p["n_components"], p["iter"]) ) if combined_spectra is None: combined_spectra = np.zeros((n_iter, k, spectra.shape[1])) combined_spectra[p["iter"], :, :] = spectra.values for t in range(k): spectra_labels.append("iter%d_topic%d" % (p["iter"], t + 1)) combined_spectra = combined_spectra.reshape(-1, combined_spectra.shape[-1]) combined_spectra = pd.DataFrame( combined_spectra, columns=spectra.columns, index=spectra_labels ) save_df_to_npz(combined_spectra, self.paths["merged_spectra"] % k) return combined_spectra def consensus( self, k, density_threshold_str="0.5", local_neighborhood_size=0.30, show_clustering=True, skip_density_and_return_after_stats=False, close_clustergram_fig=True, ): merged_spectra = load_df_from_npz(self.paths["merged_spectra"] % k) norm_counts = sc.read(self.paths["normalized_counts"]) if skip_density_and_return_after_stats: density_threshold_str = "2" density_threshold_repl = density_threshold_str.replace(".", "_") density_threshold = float(density_threshold_str) n_neighbors = int(local_neighborhood_size * merged_spectra.shape[0] / k) # Rescale topics such to length of 1. l2_spectra = (merged_spectra.T / np.sqrt((merged_spectra**2).sum(axis=1))).T if not skip_density_and_return_after_stats: # Compute the local density matrix (if not previously cached) topics_dist = None if os.path.isfile(self.paths["local_density_cache"] % k): local_density = load_df_from_npz(self.paths["local_density_cache"] % k) else: # first find the full distance matrix topics_dist = squareform(fast_euclidean(l2_spectra.values)) # partition based on the first n neighbors partitioning_order = np.argpartition(topics_dist, n_neighbors + 1)[ :, : n_neighbors + 1 ] # find the mean over those n_neighbors (excluding self, which has a distance of 0) distance_to_nearest_neighbors = topics_dist[ np.arange(topics_dist.shape[0])[:, None], partitioning_order ] local_density = pd.DataFrame( distance_to_nearest_neighbors.sum(1) / (n_neighbors), columns=["local_density"], index=l2_spectra.index, ) save_df_to_npz(local_density, self.paths["local_density_cache"] % k) del partitioning_order del distance_to_nearest_neighbors density_filter = local_density.iloc[:, 0] < density_threshold l2_spectra = l2_spectra.loc[density_filter, :] kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1) kmeans_model.fit(l2_spectra) kmeans_cluster_labels = pd.Series( kmeans_model.labels_ + 1, index=l2_spectra.index ) # Find median usage for each gene across cluster median_spectra = l2_spectra.groupby(kmeans_cluster_labels).median() # Normalize median spectra to probability distributions. median_spectra = (median_spectra.T / median_spectra.sum(1)).T # Compute the silhouette score stability = silhouette_score( l2_spectra.values, kmeans_cluster_labels, metric="euclidean" ) # Obtain the reconstructed count matrix by re-fitting the usage matrix and computing the dot product: usage.dot(spectra) refit_nmf_kwargs = yaml.load( open(self.paths["nmf_run_parameters"]), Loader=yaml.FullLoader ) refit_nmf_kwargs.update( dict(n_components=k, H=median_spectra.values, update_H=False) ) # ensure dtypes match for factorization if median_spectra.values.dtype != norm_counts.X.dtype: norm_counts.X = norm_counts.X.astype(median_spectra.values.dtype) _, rf_usages = self._nmf(norm_counts.X, nmf_kwargs=refit_nmf_kwargs) rf_usages = pd.DataFrame( rf_usages, index=norm_counts.obs.index, columns=median_spectra.index ) rf_pred_norm_counts = rf_usages.dot(median_spectra) # Compute prediction error as a frobenius norm if sp.issparse(norm_counts.X): prediction_error = ( ((norm_counts.X.todense() - rf_pred_norm_counts) ** 2).sum().sum() ) else: prediction_error = ((norm_counts.X - rf_pred_norm_counts) ** 2).sum().sum() consensus_stats = pd.DataFrame( [k, density_threshold, stability, prediction_error], index=["k", "local_density_threshold", "stability", "prediction_error"], columns=["stats"], ) if skip_density_and_return_after_stats: return consensus_stats save_df_to_npz( median_spectra, self.paths["consensus_spectra"] % (k, density_threshold_repl), ) save_df_to_npz( rf_usages, self.paths["consensus_usages"] % (k, density_threshold_repl) ) save_df_to_npz( consensus_stats, self.paths["consensus_stats"] % (k, density_threshold_repl) ) save_df_to_text( median_spectra, self.paths["consensus_spectra__txt"] % (k, density_threshold_repl), ) save_df_to_text( rf_usages, self.paths["consensus_usages__txt"] % (k, density_threshold_repl) ) # Compute gene-scores for each GEP by regressing usage on Z-scores of TPM tpm = sc.read(self.paths["tpm"]) # ignore cells not present in norm_counts if tpm.n_obs != norm_counts.n_obs: tpm = tpm[norm_counts.obs_names, :].copy() tpm_stats = load_df_from_npz(self.paths["tpm_stats"]) if sp.issparse(tpm.X): norm_tpm = ( np.array(tpm.X.todense()) - tpm_stats["__mean"].values ) / tpm_stats["__std"].values else: norm_tpm = (tpm.X - tpm_stats["__mean"].values) / tpm_stats["__std"].values usage_coef = fast_ols_all_cols(rf_usages.values, norm_tpm) usage_coef = pd.DataFrame( usage_coef, index=rf_usages.columns, columns=tpm.var.index ) save_df_to_npz( usage_coef, self.paths["gene_spectra_score"] % (k, density_threshold_repl) ) save_df_to_text( usage_coef, self.paths["gene_spectra_score__txt"] % (k, density_threshold_repl), ) # Convert spectra to TPM units, and obtain results for all genes by running # last step of NMF with usages fixed and TPM as the input matrix norm_usages = rf_usages.div(rf_usages.sum(axis=1), axis=0) refit_nmf_kwargs.update( dict( H=norm_usages.T.values, ) ) # ensure dtypes match for factorization if norm_usages.values.dtype != tpm.X.dtype: tpm.X = tpm.X.astype(norm_usages.values.dtype) _, spectra_tpm = self._nmf(tpm.X.T, nmf_kwargs=refit_nmf_kwargs) spectra_tpm = pd.DataFrame( spectra_tpm.T, index=rf_usages.columns, columns=tpm.var.index ) save_df_to_npz( spectra_tpm, self.paths["gene_spectra_tpm"] % (k, density_threshold_repl) ) save_df_to_text( spectra_tpm, self.paths["gene_spectra_tpm__txt"] % (k, density_threshold_repl), ) if show_clustering: if topics_dist is None: topics_dist = squareform(fast_euclidean(l2_spectra.values)) # (l2_spectra was already filtered using the density filter) else: # (but the previously computed topics_dist was not!) topics_dist = topics_dist[density_filter.values, :][ :, density_filter.values ] spectra_order = [] for cl in sorted(set(kmeans_cluster_labels)): cl_filter = kmeans_cluster_labels == cl if cl_filter.sum() > 1: cl_dist = squareform(topics_dist[cl_filter, :][:, cl_filter]) cl_dist[ cl_dist < 0 ] = 0 # Rarely get floating point arithmetic issues cl_link = linkage(cl_dist, "average") cl_leaves_order = leaves_list(cl_link) spectra_order += list(np.where(cl_filter)[0][cl_leaves_order]) else: ## Corner case where a component only has one element spectra_order += list(np.where(cl_filter)[0]) from matplotlib import gridspec import matplotlib.pyplot as plt width_ratios = [0.5, 9, 0.5, 4, 1] height_ratios = [0.5, 9] fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios))) gs = gridspec.GridSpec( len(height_ratios), len(width_ratios), fig, 0.01, 0.01, 0.98, 0.98, height_ratios=height_ratios, width_ratios=width_ratios, wspace=0, hspace=0, ) dist_ax = fig.add_subplot( gs[1, 1], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) D = topics_dist[spectra_order, :][:, spectra_order] dist_im = dist_ax.imshow( D, interpolation="none", cmap="viridis", aspect="auto", rasterized=True ) left_ax = fig.add_subplot( gs[1, 0], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) left_ax.imshow( kmeans_cluster_labels.values[spectra_order].reshape(-1, 1), interpolation="none", cmap="Spectral", aspect="auto", rasterized=True, ) top_ax = fig.add_subplot( gs[0, 1], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) top_ax.imshow( kmeans_cluster_labels.values[spectra_order].reshape(1, -1), interpolation="none", cmap="Spectral", aspect="auto", rasterized=True, ) hist_gs = gridspec.GridSpecFromSubplotSpec( 3, 1, subplot_spec=gs[1, 3], wspace=0, hspace=0 ) hist_ax = fig.add_subplot( hist_gs[0, 0], xscale="linear", yscale="linear", xlabel="", ylabel="", frameon=True, title="Local density histogram", ) hist_ax.hist(local_density.values, bins=np.linspace(0, 1, 50)) hist_ax.yaxis.tick_right() xlim = hist_ax.get_xlim() ylim = hist_ax.get_ylim() if density_threshold < xlim[1]: hist_ax.axvline(density_threshold, linestyle="--", color="k") hist_ax.text( density_threshold + 0.02, ylim[1] * 0.95, "filtering\nthreshold\n\n", va="top", ) hist_ax.set_xlim(xlim) hist_ax.set_xlabel( "Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering" % ( sum(~density_filter), len(density_filter), 100 * (~density_filter).mean(), ) ) fig.savefig( self.paths["clustering_plot"] % (k, density_threshold_repl), dpi=250 ) if close_clustergram_fig: plt.close(fig) def k_selection_plot(self, close_fig=True): """ Borrowed from Alexandrov Et Al. 2013 Deciphering Mutational Signatures publication in Cell Reports """ run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) stats = [] for k in sorted(set(run_params.n_components)): stats.append( self.consensus(k, skip_density_and_return_after_stats=True).stats ) stats = pd.DataFrame(stats) stats.reset_index(drop=True, inplace=True) save_df_to_npz(stats, self.paths["k_selection_stats"]) fig = plt.figure(figsize=(6, 4)) ax1 = fig.add_subplot(111) ax2 = ax1.twinx() ax1.plot(stats.k, stats.stability, "o-", color="b") ax1.set_ylabel("Stability", color="b", fontsize=15) for tl in ax1.get_yticklabels(): tl.set_color("b") # ax1.set_xlabel('K', fontsize=15) ax2.plot(stats.k, stats.prediction_error, "o-", color="r") ax2.set_ylabel("Error", color="r", fontsize=15) for tl in ax2.get_yticklabels(): tl.set_color("r") ax1.set_xlabel("Number of Components", fontsize=15) ax1.grid(True) plt.tight_layout() fig.savefig(self.paths["k_selection_plot"], dpi=250) if close_fig: plt.close(fig)
Methods
def combine_nmf(self, k, remove_individual_iterations=False)
-
Expand source code
def combine_nmf(self, k, remove_individual_iterations=False): run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) print("Combining factorizations for k=%d." % k) self._initialize_dirs() combined_spectra = None n_iter = sum(run_params.n_components == k) run_params_subset = run_params[run_params.n_components == k].sort_values("iter") spectra_labels = [] for i, p in run_params_subset.iterrows(): spectra = load_df_from_npz( self.paths["iter_spectra"] % (p["n_components"], p["iter"]) ) if combined_spectra is None: combined_spectra = np.zeros((n_iter, k, spectra.shape[1])) combined_spectra[p["iter"], :, :] = spectra.values for t in range(k): spectra_labels.append("iter%d_topic%d" % (p["iter"], t + 1)) combined_spectra = combined_spectra.reshape(-1, combined_spectra.shape[-1]) combined_spectra = pd.DataFrame( combined_spectra, columns=spectra.columns, index=spectra_labels ) save_df_to_npz(combined_spectra, self.paths["merged_spectra"] % k) return combined_spectra
def consensus(self, k, density_threshold_str='0.5', local_neighborhood_size=0.3, show_clustering=True, skip_density_and_return_after_stats=False, close_clustergram_fig=True)
-
Expand source code
def consensus( self, k, density_threshold_str="0.5", local_neighborhood_size=0.30, show_clustering=True, skip_density_and_return_after_stats=False, close_clustergram_fig=True, ): merged_spectra = load_df_from_npz(self.paths["merged_spectra"] % k) norm_counts = sc.read(self.paths["normalized_counts"]) if skip_density_and_return_after_stats: density_threshold_str = "2" density_threshold_repl = density_threshold_str.replace(".", "_") density_threshold = float(density_threshold_str) n_neighbors = int(local_neighborhood_size * merged_spectra.shape[0] / k) # Rescale topics such to length of 1. l2_spectra = (merged_spectra.T / np.sqrt((merged_spectra**2).sum(axis=1))).T if not skip_density_and_return_after_stats: # Compute the local density matrix (if not previously cached) topics_dist = None if os.path.isfile(self.paths["local_density_cache"] % k): local_density = load_df_from_npz(self.paths["local_density_cache"] % k) else: # first find the full distance matrix topics_dist = squareform(fast_euclidean(l2_spectra.values)) # partition based on the first n neighbors partitioning_order = np.argpartition(topics_dist, n_neighbors + 1)[ :, : n_neighbors + 1 ] # find the mean over those n_neighbors (excluding self, which has a distance of 0) distance_to_nearest_neighbors = topics_dist[ np.arange(topics_dist.shape[0])[:, None], partitioning_order ] local_density = pd.DataFrame( distance_to_nearest_neighbors.sum(1) / (n_neighbors), columns=["local_density"], index=l2_spectra.index, ) save_df_to_npz(local_density, self.paths["local_density_cache"] % k) del partitioning_order del distance_to_nearest_neighbors density_filter = local_density.iloc[:, 0] < density_threshold l2_spectra = l2_spectra.loc[density_filter, :] kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1) kmeans_model.fit(l2_spectra) kmeans_cluster_labels = pd.Series( kmeans_model.labels_ + 1, index=l2_spectra.index ) # Find median usage for each gene across cluster median_spectra = l2_spectra.groupby(kmeans_cluster_labels).median() # Normalize median spectra to probability distributions. median_spectra = (median_spectra.T / median_spectra.sum(1)).T # Compute the silhouette score stability = silhouette_score( l2_spectra.values, kmeans_cluster_labels, metric="euclidean" ) # Obtain the reconstructed count matrix by re-fitting the usage matrix and computing the dot product: usage.dot(spectra) refit_nmf_kwargs = yaml.load( open(self.paths["nmf_run_parameters"]), Loader=yaml.FullLoader ) refit_nmf_kwargs.update( dict(n_components=k, H=median_spectra.values, update_H=False) ) # ensure dtypes match for factorization if median_spectra.values.dtype != norm_counts.X.dtype: norm_counts.X = norm_counts.X.astype(median_spectra.values.dtype) _, rf_usages = self._nmf(norm_counts.X, nmf_kwargs=refit_nmf_kwargs) rf_usages = pd.DataFrame( rf_usages, index=norm_counts.obs.index, columns=median_spectra.index ) rf_pred_norm_counts = rf_usages.dot(median_spectra) # Compute prediction error as a frobenius norm if sp.issparse(norm_counts.X): prediction_error = ( ((norm_counts.X.todense() - rf_pred_norm_counts) ** 2).sum().sum() ) else: prediction_error = ((norm_counts.X - rf_pred_norm_counts) ** 2).sum().sum() consensus_stats = pd.DataFrame( [k, density_threshold, stability, prediction_error], index=["k", "local_density_threshold", "stability", "prediction_error"], columns=["stats"], ) if skip_density_and_return_after_stats: return consensus_stats save_df_to_npz( median_spectra, self.paths["consensus_spectra"] % (k, density_threshold_repl), ) save_df_to_npz( rf_usages, self.paths["consensus_usages"] % (k, density_threshold_repl) ) save_df_to_npz( consensus_stats, self.paths["consensus_stats"] % (k, density_threshold_repl) ) save_df_to_text( median_spectra, self.paths["consensus_spectra__txt"] % (k, density_threshold_repl), ) save_df_to_text( rf_usages, self.paths["consensus_usages__txt"] % (k, density_threshold_repl) ) # Compute gene-scores for each GEP by regressing usage on Z-scores of TPM tpm = sc.read(self.paths["tpm"]) # ignore cells not present in norm_counts if tpm.n_obs != norm_counts.n_obs: tpm = tpm[norm_counts.obs_names, :].copy() tpm_stats = load_df_from_npz(self.paths["tpm_stats"]) if sp.issparse(tpm.X): norm_tpm = ( np.array(tpm.X.todense()) - tpm_stats["__mean"].values ) / tpm_stats["__std"].values else: norm_tpm = (tpm.X - tpm_stats["__mean"].values) / tpm_stats["__std"].values usage_coef = fast_ols_all_cols(rf_usages.values, norm_tpm) usage_coef = pd.DataFrame( usage_coef, index=rf_usages.columns, columns=tpm.var.index ) save_df_to_npz( usage_coef, self.paths["gene_spectra_score"] % (k, density_threshold_repl) ) save_df_to_text( usage_coef, self.paths["gene_spectra_score__txt"] % (k, density_threshold_repl), ) # Convert spectra to TPM units, and obtain results for all genes by running # last step of NMF with usages fixed and TPM as the input matrix norm_usages = rf_usages.div(rf_usages.sum(axis=1), axis=0) refit_nmf_kwargs.update( dict( H=norm_usages.T.values, ) ) # ensure dtypes match for factorization if norm_usages.values.dtype != tpm.X.dtype: tpm.X = tpm.X.astype(norm_usages.values.dtype) _, spectra_tpm = self._nmf(tpm.X.T, nmf_kwargs=refit_nmf_kwargs) spectra_tpm = pd.DataFrame( spectra_tpm.T, index=rf_usages.columns, columns=tpm.var.index ) save_df_to_npz( spectra_tpm, self.paths["gene_spectra_tpm"] % (k, density_threshold_repl) ) save_df_to_text( spectra_tpm, self.paths["gene_spectra_tpm__txt"] % (k, density_threshold_repl), ) if show_clustering: if topics_dist is None: topics_dist = squareform(fast_euclidean(l2_spectra.values)) # (l2_spectra was already filtered using the density filter) else: # (but the previously computed topics_dist was not!) topics_dist = topics_dist[density_filter.values, :][ :, density_filter.values ] spectra_order = [] for cl in sorted(set(kmeans_cluster_labels)): cl_filter = kmeans_cluster_labels == cl if cl_filter.sum() > 1: cl_dist = squareform(topics_dist[cl_filter, :][:, cl_filter]) cl_dist[ cl_dist < 0 ] = 0 # Rarely get floating point arithmetic issues cl_link = linkage(cl_dist, "average") cl_leaves_order = leaves_list(cl_link) spectra_order += list(np.where(cl_filter)[0][cl_leaves_order]) else: ## Corner case where a component only has one element spectra_order += list(np.where(cl_filter)[0]) from matplotlib import gridspec import matplotlib.pyplot as plt width_ratios = [0.5, 9, 0.5, 4, 1] height_ratios = [0.5, 9] fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios))) gs = gridspec.GridSpec( len(height_ratios), len(width_ratios), fig, 0.01, 0.01, 0.98, 0.98, height_ratios=height_ratios, width_ratios=width_ratios, wspace=0, hspace=0, ) dist_ax = fig.add_subplot( gs[1, 1], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) D = topics_dist[spectra_order, :][:, spectra_order] dist_im = dist_ax.imshow( D, interpolation="none", cmap="viridis", aspect="auto", rasterized=True ) left_ax = fig.add_subplot( gs[1, 0], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) left_ax.imshow( kmeans_cluster_labels.values[spectra_order].reshape(-1, 1), interpolation="none", cmap="Spectral", aspect="auto", rasterized=True, ) top_ax = fig.add_subplot( gs[0, 1], xscale="linear", yscale="linear", xticks=[], yticks=[], xlabel="", ylabel="", frameon=True, ) top_ax.imshow( kmeans_cluster_labels.values[spectra_order].reshape(1, -1), interpolation="none", cmap="Spectral", aspect="auto", rasterized=True, ) hist_gs = gridspec.GridSpecFromSubplotSpec( 3, 1, subplot_spec=gs[1, 3], wspace=0, hspace=0 ) hist_ax = fig.add_subplot( hist_gs[0, 0], xscale="linear", yscale="linear", xlabel="", ylabel="", frameon=True, title="Local density histogram", ) hist_ax.hist(local_density.values, bins=np.linspace(0, 1, 50)) hist_ax.yaxis.tick_right() xlim = hist_ax.get_xlim() ylim = hist_ax.get_ylim() if density_threshold < xlim[1]: hist_ax.axvline(density_threshold, linestyle="--", color="k") hist_ax.text( density_threshold + 0.02, ylim[1] * 0.95, "filtering\nthreshold\n\n", va="top", ) hist_ax.set_xlim(xlim) hist_ax.set_xlabel( "Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering" % ( sum(~density_filter), len(density_filter), 100 * (~density_filter).mean(), ) ) fig.savefig( self.paths["clustering_plot"] % (k, density_threshold_repl), dpi=250 ) if close_clustergram_fig: plt.close(fig)
def get_nmf_iter_params(self, ks, n_iter=100, random_state_seed=None, beta_loss='kullback-leibler')
-
Creates a DataFrame with parameters for NMF iterations
Parameters
ks : integer, or list-like. Number of topics (components) for factorization. Several values can be specified at the same time, which will be run independently.
n_iter
:integer
, optional(defailt=100)
- Number of iterations for factorization. If several
k
are specified, this many iterations will be run for each value ofk
. random_state_seed
:int
orNone
, optional(default=None)
- Seed for sklearn random state.
Expand source code
def get_nmf_iter_params( self, ks, n_iter=100, random_state_seed=None, beta_loss="kullback-leibler" ): """ Creates a DataFrame with parameters for NMF iterations Parameters ---------- ks : integer, or list-like. Number of topics (components) for factorization. Several values can be specified at the same time, which will be run independently. n_iter : integer, optional (defailt=100) Number of iterations for factorization. If several `k` are specified, this many iterations will be run for each value of `k`. random_state_seed : int or None, optional (default=None) Seed for sklearn random state. """ if type(ks) is int: ks = [ks] # Remove any repeated k values, and order. k_list = sorted(set(list(ks))) n_runs = len(ks) * n_iter np.random.seed(seed=random_state_seed) nmf_seeds = np.random.randint(low=1, high=(2**32) - 1, size=n_runs) replicate_params = [] for i, (k, r) in enumerate(itertools.product(k_list, range(n_iter))): replicate_params.append([k, r, nmf_seeds[i]]) replicate_params = pd.DataFrame( replicate_params, columns=["n_components", "iter", "nmf_seed"] ) _nmf_kwargs = dict( alpha_W=0.0, l1_ratio=0.0, beta_loss=beta_loss, solver="mu", tol=1e-4, max_iter=400, alpha_H="same", init="random", ) ## Coordinate descent is faster than multiplicative update but only works for frobenius if beta_loss == "frobenius": _nmf_kwargs["solver"] = "cd" return (replicate_params, _nmf_kwargs)
def get_norm_counts(self, counts, tpm, high_variance_genes_filter=None, num_highvar_genes=None)
-
Parameters
counts
:anndata.AnnData
- Scanpy AnnData object (cells x genes) containing raw counts. Filtered such that no genes or cells with 0 counts
tpm
:anndata.AnnData
- Scanpy AnnData object (cells x genes) containing tpm normalized data matching counts
high_variance_genes_filter
:np.array
, optional(default=None)
- A pre-specified list of genes considered to be high-variance.
Only these genes will be used during factorization of the counts matrix.
Must match the .var index of counts and tpm.
If set to
None
, high-variance genes will be automatically computed, using the parameters below. num_highvar_genes
:int
, optional(default=None)
- Instead of providing an array of high-variance genes, identify this many most overdispersed genes for filtering
Returns
normcounts
:anndata.AnnData, shape (cells, num_highvar_genes)
- A counts matrix containing only the high variance genes and with columns (genes) normalized to unit variance
Expand source code
def get_norm_counts( self, counts, tpm, high_variance_genes_filter=None, num_highvar_genes=None ): """ Parameters ---------- counts : anndata.AnnData Scanpy AnnData object (cells x genes) containing raw counts. Filtered such that no genes or cells with 0 counts tpm : anndata.AnnData Scanpy AnnData object (cells x genes) containing tpm normalized data matching counts high_variance_genes_filter : np.array, optional (default=None) A pre-specified list of genes considered to be high-variance. Only these genes will be used during factorization of the counts matrix. Must match the .var index of counts and tpm. If set to `None`, high-variance genes will be automatically computed, using the parameters below. num_highvar_genes : int, optional (default=None) Instead of providing an array of high-variance genes, identify this many most overdispersed genes for filtering Returns ------- normcounts : anndata.AnnData, shape (cells, num_highvar_genes) A counts matrix containing only the high variance genes and with columns (genes) normalized to unit variance """ if high_variance_genes_filter is None: ## Get list of high-var genes if one wasn't provided if sp.issparse(tpm.X): (gene_counts_stats, gene_fano_params) = get_highvar_genes_sparse( tpm.X, numgenes=num_highvar_genes ) else: (gene_counts_stats, gene_fano_params) = get_highvar_genes( np.array(tpm.X), numgenes=num_highvar_genes ) high_variance_genes_filter = list( tpm.var.index[gene_counts_stats.high_var.values] ) ## Subset out high-variance genes print( "Selecting {} highly variable genes".format(len(high_variance_genes_filter)) ) norm_counts = counts[:, high_variance_genes_filter] norm_counts = norm_counts[tpm.obs_names, :].copy() ## Scale genes to unit variance if sp.issparse(tpm.X): sc.pp.scale(norm_counts, zero_center=False) if np.isnan(norm_counts.X.data).sum() > 0: print("Warning: NaNs in normalized counts matrix") else: norm_counts.X /= norm_counts.X.std(axis=0, ddof=1) if np.isnan(norm_counts.X).sum().sum() > 0: print("Warning: NaNs in normalized counts matrix") ## Save a \n-delimited list of the high-variance genes used for factorization open(self.paths["nmf_genes_list"], "w").write( "\n".join(high_variance_genes_filter) ) ## Check for any cells that have 0 counts of the overdispersed genes zerocells = norm_counts.X.sum(axis=1) == 0 if zerocells.sum() > 0: print( "Warning: %d cells have zero counts of overdispersed genes - ignoring these cells for factorization." % (zerocells.sum()) ) sc.pp.filter_cells(norm_counts, min_counts=1) return norm_counts
def k_selection_plot(self, close_fig=True)
-
Borrowed from Alexandrov Et Al. 2013 Deciphering Mutational Signatures publication in Cell Reports
Expand source code
def k_selection_plot(self, close_fig=True): """ Borrowed from Alexandrov Et Al. 2013 Deciphering Mutational Signatures publication in Cell Reports """ run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) stats = [] for k in sorted(set(run_params.n_components)): stats.append( self.consensus(k, skip_density_and_return_after_stats=True).stats ) stats = pd.DataFrame(stats) stats.reset_index(drop=True, inplace=True) save_df_to_npz(stats, self.paths["k_selection_stats"]) fig = plt.figure(figsize=(6, 4)) ax1 = fig.add_subplot(111) ax2 = ax1.twinx() ax1.plot(stats.k, stats.stability, "o-", color="b") ax1.set_ylabel("Stability", color="b", fontsize=15) for tl in ax1.get_yticklabels(): tl.set_color("b") # ax1.set_xlabel('K', fontsize=15) ax2.plot(stats.k, stats.prediction_error, "o-", color="r") ax2.set_ylabel("Error", color="r", fontsize=15) for tl in ax2.get_yticklabels(): tl.set_color("r") ax1.set_xlabel("Number of Components", fontsize=15) ax1.grid(True) plt.tight_layout() fig.savefig(self.paths["k_selection_plot"], dpi=250) if close_fig: plt.close(fig)
def run_nmf(self, worker_i=1, total_workers=1)
-
Iteratively runs NMF with prespecified parameters
Use the
worker_i
andtotal_workers
parameters for parallelization. Generic kwargs for NMF are loaded fromself.paths['nmf_run_parameters']
, defaults below::<code>non\_negative\_factorization</code> default arguments: alpha_W=0.0 l1_ratio=0.0 beta_loss='kullback-leibler' solver='mu' tol=1e-4, max_iter=200 alpha_H=None init='random' random_state, n_components are both set by the prespecified self.paths['nmf_replicate_parameters'].
Parameters
norm_counts
:pandas.DataFrame,
- Normalized counts dataFrame to be factorized.
(Output of
normalize_counts
) run_params
:pandas.DataFrame,
- Parameters for NMF iterations.
(Output of
prepare_nmf_iter_params
)
Expand source code
def run_nmf( self, worker_i=1, total_workers=1, ): """ Iteratively runs NMF with prespecified parameters Use the `worker_i` and `total_workers` parameters for parallelization. Generic kwargs for NMF are loaded from `self.paths['nmf_run_parameters']`, defaults below:: `non_negative_factorization` default arguments: alpha_W=0.0 l1_ratio=0.0 beta_loss='kullback-leibler' solver='mu' tol=1e-4, max_iter=200 alpha_H=None init='random' random_state, n_components are both set by the prespecified self.paths['nmf_replicate_parameters']. Parameters ---------- norm_counts : pandas.DataFrame, Normalized counts dataFrame to be factorized. (Output of `normalize_counts`) run_params : pandas.DataFrame, Parameters for NMF iterations. (Output of `prepare_nmf_iter_params`) """ self._initialize_dirs() run_params = load_df_from_npz(self.paths["nmf_replicate_parameters"]) norm_counts = sc.read(self.paths["normalized_counts"]) _nmf_kwargs = yaml.load( open(self.paths["nmf_run_parameters"]), Loader=yaml.FullLoader ) jobs_for_this_worker = worker_filter( range(len(run_params)), worker_i, total_workers ) for idx in jobs_for_this_worker: p = run_params.iloc[idx, :] print("[Worker %d]. Starting task %d." % (worker_i, idx)) _nmf_kwargs["random_state"] = p["nmf_seed"] _nmf_kwargs["n_components"] = p["n_components"] (spectra, usages) = self._nmf(norm_counts.X, _nmf_kwargs) spectra = pd.DataFrame( spectra, index=np.arange(1, _nmf_kwargs["n_components"] + 1), columns=norm_counts.var.index, ) save_df_to_npz( spectra, self.paths["iter_spectra"] % (p["n_components"], p["iter"]) )
def save_nmf_iter_params(self, replicate_params, run_params)
-
Expand source code
def save_nmf_iter_params(self, replicate_params, run_params): self._initialize_dirs() save_df_to_npz(replicate_params, self.paths["nmf_replicate_parameters"]) with open(self.paths["nmf_run_parameters"], "w") as F: yaml.dump(run_params, F)
def save_norm_counts(self, norm_counts)
-
Expand source code
def save_norm_counts(self, norm_counts): self._initialize_dirs() norm_counts.write(self.paths["normalized_counts"], compression="gzip")