# Source code for trisicell.io._genotype

import os

import anndata as ad
import ete3
import networkx as nx
import pandas as pd

import trisicell as tsc


def read(filepath):
    """Read genotype matrix and read-count matrix.

    The genotype matrix must be in the format of :class:`pandas.DataFrame`
    The read-count matrix must be in the format of :class:`anndata.AnnData`.

    Parameters
    ----------
    filepath : :obj:`str`
        The path to the file. The extension must be one of
        [`.tsv`, `.SC`, `.CFMatrix`, `.before_FP_FN_NA`, `.h5ad`,
        `.h5ad.gz`, `.nwk`]. Only the last suffix of the path is inspected,
        so any path ending in `.gz` is handed to the AnnData reader.

    Returns
    -------
    :class:`pandas.DataFrame` or :class:`anndata.AnnData`
        Depends on the format of the input file the output type is different.
        Tab-separated inputs give a data frame indexed by their first column,
        `.h5ad`/`.gz` inputs give an AnnData object, and `.nwk` inputs give
        whatever ``_read_nwk`` returns.

    Notes
    -----
    Repeated mutation ids in the header of a tab-separated input and
    unsupported extensions are reported through ``tsc.logg.error``.
    """

    ext = os.path.splitext(filepath)[-1]
    if ext in (".SC", ".CFMatrix", ".before_FP_FN_NA", ".tsv"):
        # pandas renames repeated column names while parsing (``m`` becomes
        # ``m``, ``m.1``, ...), so the parsed columns are always unique.
        # Duplicates therefore have to be looked for in the raw header row,
        # read here as plain strings without header interpretation.
        header = pd.read_table(filepath, header=None, nrows=1, dtype=str)
        mutation_ids = header.iloc[0, 1:].dropna()
        if mutation_ids.duplicated().any():
            tsc.logg.error("Mutation ids must be unique!")
        return pd.read_table(filepath, index_col=0)
    elif ext in (".h5ad", ".gz"):
        # ``read_h5ad`` is the supported name of the reader that the
        # deprecated ``anndata.read`` alias forwarded to.
        return ad.read_h5ad(filepath)
    elif ext == ".nwk":
        return _read_nwk(filepath)
    else:
        tsc.logg.error("Extension is wrong!")


def write(obj, filepath):
    """Write genotype matrix or read-count matrix into a file.

    Parameters
    ----------
    obj : :class:`pandas.DataFrame` or :class:`anndata.AnnData`
        The input object which is going to be written in a file.
    filepath : :obj:`str`
        The file path where the `obj` must be written in. A data frame is
        written to exactly this path as a tab-separated table whose index
        column is labelled ``cellIDxmutID``. For an AnnData object the
        suffix ``.h5ad.gz`` is appended to this path and the file is written
        with gzip compression.

    Notes
    -----
    Any other object type is reported through ``tsc.logg.error``.
    """

    if isinstance(obj, pd.DataFrame):
        # ``index_label`` puts the name in the header cell of the written
        # file without renaming the index of the caller's data frame.
        obj.to_csv(filepath, sep="\t", index_label="cellIDxmutID")
    elif isinstance(obj, ad.AnnData):
        # ``os.fspath`` lets path-like objects be used as well as strings.
        obj.write(os.fspath(filepath) + ".h5ad.gz", compression="gzip")
    else:
        tsc.logg.error("Object instance is wrong!")


def _read_nwk(filepath):
    """Load a Newick tree and convert it with ``tsc.ul.to_cfmatrix``.

    The tree is parsed by ``ete3.Tree`` with ``format=1`` and rebuilt as a
    :class:`networkx.DiGraph` whose nodes are integers assigned in post-order
    and whose edges point from parent to child.

    Parameters
    ----------
    filepath : :obj:`str`
        Passed unchanged to ``ete3.Tree``.

    Returns
    -------
    The value returned by ``tsc.ul.to_cfmatrix`` for the rebuilt graph.
    """
    newick = ete3.Tree(filepath, format=1)
    graph = nx.DiGraph()

    # Number the tree nodes in post-order. Nodes with an empty name, or a
    # name containing "Inner", get the placeholder label instead of a name.
    ids = {}
    for idx, node in enumerate(newick.traverse("postorder")):
        unnamed = node.name == "" or "Inner" in node.name
        graph.add_node(idx, label="––" if unnamed else str(node.name))
        ids[node] = idx

    # Second pass: connect every parent to each of its children.
    for parent in newick.traverse("postorder"):
        for child in parent.children:
            graph.add_edge(ids[parent], ids[child])

    # Edges are labelled m0, m1, ... in the graph's edge iteration order.
    for k, (src, dst) in enumerate(graph.edges()):
        graph.edges[(src, dst)]["label"] = f"m{k}"

    # Graph-level attributes set before handing the graph to the converter.
    graph.graph.update(normal_cells=[], splitter_mut="\n", splitter_cell="\n")
    return tsc.ul.to_cfmatrix(graph)