Package `kitchen`

Manipulate counts matrix files and cook scRNA-seq data from command line

Expand source code

# -*- coding: utf-8 -*-
"""
Manipulate counts matrix files and cook scRNA-seq data from command line
"""
from .kitchen import (
    info,
    to_h5ad,
    transpose,
    rename_obs,
    add_label,
    knee_point,
    subset,
    concatenate,
    recipe,
)

# names exported by `from kitchen import *`; these are exactly the functions
# imported from .kitchen at the top of this module
__all__ = [
    "info",
    "to_h5ad",
    "transpose",
    "rename_obs",
    "add_label",
    "knee_point",
    "subset",
    "concatenate",
    "recipe",
]

from ._version import get_versions

# expose the package version string reported by ._version.get_versions()
__version__ = get_versions()["version"]
# remove the helper from the package namespace once the version is recorded
del get_versions

Sub-modules

kitchen.ingredients: Functions for manipulating .h5ad objects and automated processing of scRNA-seq data
kitchen.kitchen: Manipulate .h5ad files and cook scRNA-seq data from command line

Functions

def add_label(args)

Uses .obs_names from a filtered counts matrix to add a binary label to a reference anndata object: "True" = present in the filtered file, "False" = not present. Overwrites the reference .h5ad file.

Expand source code

def add_label(args):
    """
    Flags cells of a reference anndata object by membership in a filtered one.

    Reads `args.ref_file` and `args.filt_file`, creates the .obs column named
    `args.label` in the reference (the string "False" for every cell, then
    "True" for every name found in the filtered object's .obs_names) and
    writes the reference back to `args.ref_file`, overwriting it.
    If `args.rm_orig_file` is set, `args.filt_file` is deleted afterwards.
    Progress is printed to console when `args.verbose` is set.
    """
    verbose = args.verbose
    # load the reference object
    if verbose:
        print("Reading {}".format(args.ref_file))
    reference = sc.read(args.ref_file)
    if verbose:
        print("\t", reference)
        print("\nReading {}".format(args.filt_file))
    # load the filtered (query) object
    filtered = sc.read(args.filt_file)
    if verbose:
        print("\t", filtered)
    # default every cell to "False", then flip the ones present in the filtered file
    reference.obs[args.label] = "False"
    reference.obs.loc[filtered.obs_names, args.label] = "True"
    if verbose:
        label_counts = reference.obs[args.label].value_counts()
        print(
            "\nTransferring labels to {}:\n{}".format(args.ref_file, label_counts)
        )
        print("\nWriting counts to {}".format(args.ref_file))
    # overwrite the reference .h5ad file in place
    reference.write(args.ref_file, compression="gzip")
    if not args.rm_orig_file:
        return
    # the filtered file is no longer wanted once its labels are transferred
    if verbose:
        print("\nRemoving {}".format(args.filt_file))
    os.remove(args.filt_file)

def concatenate(args)

Concatenates list of anndata objects in .h5ad format, keeping union of genes

Expand source code

def concatenate(args):
    """
    Concatenates list of anndata objects in .h5ad format, keeping union of genes

    Reads every path in `args.files`, concatenates them with an outer join
    (missing values filled with 0), names each batch after its file's basename
    without extension, and writes the result to `args.out` (gzip-compressed).
    Progress is printed to console when `args.verbose` is set.
    """
    verbose = args.verbose
    # the first file is the object the others are concatenated onto
    if verbose:
        print("Reading {}".format(args.files[0]))
    anchor = sc.read(args.files[0])
    # load the remaining files, preserving their order
    others = []
    for path in args.files[1:]:
        if verbose:
            print("Reading {}".format(path))
        others.append(sc.read(path))
    if verbose:
        print("Concatenating files...")
    # one batch category per input file: basename with the extension stripped
    batch_names = []
    for path in args.files:
        stem, _ext = os.path.splitext(os.path.basename(path))
        batch_names.append(stem)
    merged = anchor.concatenate(
        others,
        join="outer",
        batch_categories=batch_names,
        fill_value=0,
    )
    if verbose:
        print(
            "Final shape: {} cells and {} genes".format(
                merged.shape[0], merged.shape[1]
            )
        )
        print("Writing counts to {}".format(args.out))
    # save the concatenated object as .h5ad
    merged.write(args.out, compression="gzip")

def info(args)

Prints information about .h5ad file to console

Expand source code

def info(args):
    """
    Prints information about .h5ad file to console

    Reads `args.file` and prints, in order: the anndata summary, the type and
    dtype of .X, the .obs_names and the .var_names.
    """
    path = args.file
    print("Reading {}\n".format(path))
    adata = sc.read(path)
    # overall summary of the object
    print(adata, "\n")
    # container type and dtype of the counts slot
    counts = adata.X
    print(".X: {} with {}\n".format(type(counts), counts.dtype))
    # cell and gene identifiers
    obs_names, var_names = adata.obs_names, adata.var_names
    print("obs_names: {}".format(obs_names))
    print("var_names: {}".format(var_names))

def knee_point(args)

Labels cells using "knee point" method from CellRanger 2.1

Expand source code

def knee_point(args):
    """
    Labels cells using "knee point" method from CellRanger 2.1

    Reads the .h5ad file at `args.file`, hands the anndata object to
    `cellranger2` together with `args.expected`, `args.upper_quant`,
    `args.lower_prop` and `args.label`, then overwrites `args.file` with the
    object (gzip-compressed). The return value of `cellranger2` is not used,
    so it is expected to add the label to the object in place.

    All progress messages, including the final "Writing counts" line, are
    printed only when `args.verbose` is set.
    """
    # read file into anndata obj
    if args.verbose:
        print("Reading {}".format(args.file), end="")
    a = sc.read(args.file)
    if args.verbose:
        print(" - {} cells and {} genes".format(a.shape[0], a.shape[1]))
    # add knee_point label to anndata
    cellranger2(
        a,
        expected=args.expected,
        upper_quant=args.upper_quant,
        lower_prop=args.lower_prop,
        label=args.label,
        verbose=args.verbose,
    )
    # save file as .h5ad, overwriting the input; the message honours the
    # verbose flag like every other message in this module
    if args.verbose:
        print("Writing counts to {}".format(args.file))
    a.write(args.file, compression="gzip")

def recipe(args)

Full automated processing of scRNA-seq data

Expand source code

def recipe(args):
    """
    Full automated processing of scRNA-seq data

    Reads the .h5ad file at `args.file` and, depending on the flags in `args`:

    * subsets the object on `args.subset`
    * preprocesses it with `recipe_dropkick` and reduces dimensions with
      `dim_reduce` (`args.process`), optionally starting from `args.layer`
    * scores cell cycle (`args.cell_cycle`), adding "phase" to the plot colors
    * saves one differential-expression plot per entry of `args.diff_expr`,
      plus a cNMF version of each if "cnmf_spectra" is present in .varm
    * saves cNMF loadings (`args.cnmf`) if "cnmf_spectra" is present in .varm
    * always saves an embedding plot
    * writes the anndata object to disk (`args.save_adata`)

    All outputs go to `args.outdir` and share a file stem built from the input
    basename followed by the subset labels, layer and representation (when
    given), joined by "_".

    Note that `args.colors` and `args.diff_expr` may be modified in place.
    """
    # build the file stem shared by every output
    name = [os.path.splitext(os.path.basename(args.file))[0]]
    if args.subset is not None:
        name.append("_".join(args.subset))
    if args.layer is not None:
        name.append(args.layer)
    if args.use_rep is not None:
        name.append(args.use_rep)
    stem = "_".join(name)
    embedding_png = "{}/{}_embedding.png".format(args.outdir, stem)
    # read file into anndata obj
    if args.verbose:
        print("Reading {}".format(args.file), end="")
    a = sc.read(args.file)
    if args.verbose:
        print(" - {} cells and {} genes".format(a.shape[0], a.shape[1]))
    # subset anndata on .obs column if desired
    if args.subset is not None:
        a = subset_adata(a, subset=args.subset)
    if args.process:
        # switch to proper layer
        if args.layer is not None:
            if args.verbose:
                print("Using layer {} to reduce dimensions".format(args.layer))
            a.X = a.layers[args.layer].copy()
        # preprocess with dropkick recipe
        a = recipe_dropkick(
            a,
            X_final="arcsinh_norm",
            verbose=args.verbose,
            filter=True,
            min_genes=args.min_genes,
        )
        # reduce dimensions
        dim_reduce(
            a,
            use_rep=args.use_rep,
            clust_resolution=args.resolution,
            paga=args.paga,
            verbose=args.verbose,
            seed=args.seed,
        )
    # run cell cycle inference
    if args.cell_cycle:
        cc_score(a, verbose=args.verbose)
        # args.colors is None when no colors were requested; treat that as an
        # empty list so "phase" can still be prepended
        args.colors = ["phase"] + ([] if args.colors is None else args.colors)
    # make sure output dir exists before saving plots
    check_dir_exists(args.outdir)
    # if there's DE to do, plot genes
    if args.diff_expr is not None:
        if isinstance(args.diff_expr, str):
            args.diff_expr = [args.diff_expr]
        for de in args.diff_expr:
            plot_genes(
                a,
                de_method=args.de_method,
                plot_type=de,
                groupby="leiden",
                n_genes=5,
                cmap=args.cmap,
                save_to="{}/{}_{}.png".format(args.outdir, de, stem),
                verbose=args.verbose,
            )
        # if there's cnmf results, plot those on a heatmap/matrix/dotplot too
        if "cnmf_spectra" in a.varm:
            for de in args.diff_expr:
                plot_genes_cnmf(
                    a,
                    plot_type=de,
                    groupby="leiden",
                    attr="varm",
                    keys="cnmf_spectra",
                    indices=None,
                    n_genes=5,
                    cmap=args.cmap,
                    save_to="{}/{}_cnmf_{}.png".format(args.outdir, de, stem),
                )
    # arguments shared by every variant of the embedding plot
    embedding_kwargs = dict(
        colors=args.colors,
        show_clustering=True,
        cmap=args.cmap,
        seed=args.seed,
        save_to=embedding_png,
        verbose=args.verbose,
        size=args.point_size,
    )
    # if there's a cnmf flag, try to plot loadings
    if args.cnmf:
        # check for cnmf results in anndata object
        if "cnmf_spectra" in a.varm:
            _ = rank_genes_cnmf(a, show=False)
            cnmf_png = "{}/{}_cnmfspectra.png".format(args.outdir, stem)
            if args.verbose:
                print("Saving cNMF loadings to {}".format(cnmf_png))
            plt.savefig(cnmf_png)
            # overlay the cNMF usages on the embedding, after any requested colors
            if args.colors is None:
                args.colors = []
            usage_columns = a.obs.columns[
                a.obs.columns.str.startswith("usage_")
            ].tolist()
            embedding_kwargs["colors"] = args.colors + usage_columns
            embedding_kwargs["n_cnmf_markers"] = args.n_cnmf_markers
        else:
            # fall back to the plain embedding plot
            print(
                "cNMF results not detected in {}. Skipping cNMF overlay for embedding.".format(
                    args.file
                )
            )
    # save embedding plot (with cNMF loadings when they were set up above)
    if args.verbose:
        print("Saving embeddings to {}".format(embedding_png))
    plot_embedding(a, **embedding_kwargs)
    # save file as .h5ad
    if args.save_adata:
        processed_h5ad = "{}/{}_processed.h5ad".format(args.outdir, stem)
        if args.verbose:
            print("Saving AnnData object to {}".format(processed_h5ad))
        a.write(processed_h5ad, compression="gzip")

def rename_obs(args)

Renames .obs columns in anndata object, and overwrites .h5ad file

Expand source code

def rename_obs(args):
    """
    Renames .obs columns in anndata object, and overwrites .h5ad file

    `args.old_names` and `args.new_names` are paired positionally to form the
    rename mapping. The file at `args.file` is read, its .obs columns renamed
    in place, and the object written back to the same path (gzip-compressed).
    """
    verbose = args.verbose
    if verbose:
        print("Reading {}".format(args.file))
    adata = sc.read(args.file)
    if verbose:
        print("Renaming columns {} to {}".format(args.old_names, args.new_names))
    # pair each old column name with its replacement
    mapping = {}
    for old, new in zip(args.old_names, args.new_names):
        mapping[old] = new
    adata.obs.rename(columns=mapping, inplace=True)
    # overwrite the input file
    adata.write(args.file, compression="gzip")

def subset(args)

Subsets anndata object on binary .obs label(s), saves to new .h5ad file

Expand source code

def subset(args):
    """
    Subsets anndata object on binary .obs label(s), saves to new .h5ad file

    Reads `args.file`, passes the object and `args.subset` to `subset_adata`,
    and writes whatever it returns to `args.out` (gzip-compressed).
    Progress is printed to console when `args.verbose` is set.
    """
    verbose = args.verbose
    if verbose:
        print("Reading {}".format(args.file), end="")
    full = sc.read(args.file)
    if verbose:
        n_cells, n_genes = full.shape[0], full.shape[1]
        print(" - {} cells and {} genes".format(n_cells, n_genes))
    # keep only the cells selected by the requested label(s)
    kept = subset_adata(full, subset=args.subset, verbose=verbose)
    if verbose:
        print("Writing subsetted counts to {}".format(args.out))
    kept.write(args.out, compression="gzip")

def to_h5ad(args)

Converts counts matrix from flat file (.txt, .csv) to .h5ad

Expand source code

def to_h5ad(args):
    """
    Converts counts matrix from flat file (.txt, .csv) to .h5ad

    Reads `args.file`, converts .X to an integer CSR sparse matrix and writes
    the object to `<args.outdir>/<basename of args.file without extension>.h5ad`
    (gzip-compressed). If `args.rm_flat_file` is set, the input file is deleted
    afterwards. Progress is printed to console when `args.verbose` is set.

    Raises ValueError if the extension of `args.file` is ".h5ad".
    """
    verbose = args.verbose
    # basename without extension, reused for the output file name
    name = os.path.splitext(os.path.basename(args.file))[0]
    # refuse inputs that are already in .h5ad format
    _, extension = os.path.splitext(args.file)
    if extension == ".h5ad":
        raise ValueError("Input file already in .h5ad format")
    # read file into anndata obj
    if verbose:
        print("Reading {}".format(args.file), end="")
    counts = sc.read(args.file)
    if verbose:
        # report dimensions plus the names of cells and genes
        print(" - {} cells and {} genes".format(counts.shape[0], counts.shape[1]))
        print("obs_names: {}".format(counts.obs_names))
        print("var_names: {}".format(counts.var_names))
        print("sparsifying counts...")
    # store counts as a sparse integer matrix
    counts.X = sparse.csr_matrix(counts.X, dtype=int)
    if verbose:
        print("Writing counts to {}/{}.h5ad".format(args.outdir, name))
    # create the output directory if needed, then save as .h5ad
    check_dir_exists(args.outdir)
    counts.write("{}/{}.h5ad".format(args.outdir, name), compression="gzip")
    if not args.rm_flat_file:
        return
    # remove original, noncompressed flat file
    if verbose:
        print("Removing {}".format(args.file))
    os.remove(args.file)

def transpose(args)

Transposes anndata object, replacing obs with var, and overwrites .h5ad file

Expand source code

def transpose(args):
    """
    Transposes anndata object, replacing obs with var, and overwrites .h5ad file

    Reads `args.file`, takes the transpose of the object and writes it back to
    the same path (gzip-compressed). Progress is printed to console when
    `args.verbose` is set.
    """
    verbose = args.verbose
    # load the object to be flipped
    if verbose:
        print("Reading {}".format(args.file))
    original = sc.read(args.file)
    if verbose:
        print(original)
        print("transposing file and saving...")
    # swap observations and variables, then overwrite the input file
    flipped = original.T
    flipped.write(args.file, compression="gzip")