Source code for trisicell.datasets._datasets

import mudata as md
import scanpy as sc

import trisicell as tsc

url = "https://github.com/faridrashidi/trisicell/releases/download/v0.0.0"


[docs]def treated_igg_sw(): """Trisicell treated mice (igg, seq-well) scRNAseq data. The size is n_cells × n_muts = 163 × 1453 Returns ------- :class:`mudata.MuData` A mudata with two modalities (`.mod`) Examples -------- >>> mdata = tsc.datasets.treated_igg_sw() >>> mdata MuData object with n_obs × n_vars = 163 × 56854 2 modalities mutation: 163 x 1453 obs: 'group', 'Mouse_ID', 'Batch', 'Axl', 'Erbb3', 'Mitf', 'H2_K1', ... var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', ... uns: 'trisicell_input', 'trisicell_output' layers: 'genotype', 'mutant', 'total' expression: 163 x 55401 obs: 'uniquely_mapped_percent', 'num_splices', 'num_GCAG_splices', ... layers: 'fpkm', 'tpm' See Also -------- :func:`trisicell.datasets.treated_actla4`. :func:`trisicell.datasets.treated_igg_ss2`. """ name = "treated_igg_sw.h5md.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) mdata = md.read_h5mu(f"data/{name}") return mdata
[docs]def treated_igg_ss2(): """Trisicell treated mice (igg, smart-seq2) scRNAseq data. The size is n_cells × n_muts = 163 × 1453 Returns ------- :class:`mudata.MuData` A mudata with two modalities (`.mod`) Examples -------- >>> mdata = tsc.datasets.treated_igg_ss2() >>> mdata MuData object with n_obs × n_vars = 163 × 56854 2 modalities mutation: 163 x 1453 obs: 'group', 'Mouse_ID', 'Batch', 'Axl', 'Erbb3', 'Mitf', 'H2_K1', ... var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', ... uns: 'trisicell_input', 'trisicell_output' layers: 'genotype', 'mutant', 'total' expression: 163 x 55401 obs: 'uniquely_mapped_percent', 'num_splices', 'num_GCAG_splices', ... layers: 'fpkm', 'tpm' See Also -------- :func:`trisicell.datasets.treated_actla4`. :func:`trisicell.datasets.treated_igg_sw`. """ name = "treated_igg_ss2.h5md.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) mdata = md.read_h5mu(f"data/{name}") return mdata
[docs]def treated_actla4(): """Trisicell treated mice (anti-ctla-4) scRNAseq data. The size is n_cells × n_muts = 508 × 3309 Returns ------- :class:`mudata.MuData` A mudata with two modalities (`.mod`) Examples -------- >>> mdata = tsc.datasets.treated_actla4() >>> mdata MuData object with n_obs × n_vars = 508 × 58710 2 modalities mutation: 508 x 3309 obs: 'group', 'Mouse_ID', 'Batch', 'Axl', 'Erbb3', 'Mitf', 'H2-K1', ... var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', ... uns: 'trisicell_input', 'trisicell_output' layers: 'genotype', 'mutant', 'total' expression: 508 x 55401 obs: 'uniquely_mapped_percent', 'num_splices', 'num_GCAG_splices', ... layers: 'fpkm', 'tpm' See Also -------- :func:`trisicell.datasets.treated_igg_ss2`. :func:`trisicell.datasets.treated_igg_sw`. """ name = "treated_actla4.h5md.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) mdata = md.read_h5mu(f"data/{name}") return mdata
[docs]def sublines_scrnaseq(): """Trisicell sublines scRNAseq data. The size is n_cells × n_muts = 175 × 450 Returns ------- :class:`mudata.MuData` A mudata with two modalities (`.mod`) Examples -------- >>> mdata = tsc.datasets.sublines_scrnaseq() >>> mdata MuData object with n_obs × n_vars = 175 × 55851 2 modalities expression: 175 x 55401 obs: 'cells', 'uniquely_mapped_percent', 'num_splices', ... layers: 'fpkm', 'tpm' mutation: 175 x 450 obs: 'cells', 'clone', 'group', 'group_color', 'is_red', 'is_sub', ... var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', ... layers: 'genotype', 'mutant', 'total', 'trisicell_input', 'trisicell_output' See Also -------- :func:`trisicell.datasets.sublines_bwes`. :func:`trisicell.datasets.sublines_bwts`. """ name = "sublines_scrnaseq.h5md.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) mdata = md.read_h5mu(f"data/{name}") return mdata
[docs]def sublines_bwes(): """Trisicell sublines bWES data. The size is n_sublines × n_muts = 24 × 6653 Returns ------- :class:`anndata.AnnData` An anndata in which `.var` contains information about the mutations. - `.layers['trisicell_input']` the binary input genotype matrix used as input to the Trisicell. - `.layers['trisicell_output']` the binary input genotype matrix inferred by Trisicell-boost(SCITE). - `.layers['genotype']` noisy genotype matrix, 0: reference, 1: heterozygous 2: unknown and 3: homozygous_alt. - `.layers['mutant']` number of mutant reads. - `.layers['total']` number of total reads. Examples -------- >>> adata = tsc.datasets.sublines_bwes() >>> adata AnnData object with n_obs × n_vars = 24 × 6653 var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', 'position', ... layers: 'genotype', 'mutant', 'total', 'trisicell_input', 'trisicell_output' See Also -------- :func:`trisicell.datasets.sublines_scrnaseq`. :func:`trisicell.datasets.sublines_bwts`. """ name = "sublines_bwes.h5ad.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) adata = sc.read_h5ad(f"data/{name}") return adata
[docs]def sublines_bwts(): """Trisicell sublines bWTS data. The size is n_cells × n_muts = 33 × 536 Returns ------- :class:`mudata.MuData` A mudata with two modalities (`.mod`) Examples -------- >>> mdata = tsc.datasets.sublines_bwts() >>> mdata MuData object with n_obs × n_vars = 33 × 55851 2 modalities expression: 175 x 55401 obs: 'cells', 'uniquely_mapped_percent', 'num_splices', ... layers: 'fpkm', 'tpm' mutation: 33 x 536 obs: 'cells', 'clone', 'mps', 'zscore', 'group', 'Axl', 'Mitf', 'day', ... var: 'kind', 'amino_acid_change', 'ensemble', 'gene', 'chrom', 'position',... layers: 'genotype', 'mutant', 'total', 'trisicell_input', 'trisicell_output' See Also -------- :func:`trisicell.datasets.sublines_bwes`. :func:`trisicell.datasets.sublines_scrnaseq`. """ name = "sublines_bwts.h5md.gz" sc.readwrite._check_datafile_present_and_download( f"data/{name}", backup_url=f"{url}/{name}" ) mdata = md.read_h5mu(f"data/{name}") return mdata
[docs]def example(is_expression=False): """Return an example for sanity checking and playing with Trisicell. is_expression : :obj:`bool`, optional Returns the expression dataset instead of the genotype one, by default False Returns ------- :class:`anndata.AnnData` An object that cells are in `.obs` and mutations are in `.var`. """ if is_expression: return tsc.io.read( tsc.ul.get_file("trisicell.datasets/data/expression.h5ad.gz") ) else: return tsc.io.read(tsc.ul.get_file("trisicell.datasets/data/genotype.h5ad.gz"))
def test(): df = tsc.io.read(tsc.ul.get_file("trisicell.datasets/test/test.tsv")) return df def colorectal2(readcount=False): """Human Colorectal Cancer (Patient 2). This dataset was introduced in :cite:`Leung_2017` and was used in: * :cite:`PhISCS` Figure 7. * :cite:`B-SCITE` Figure 8b. * :cite:`SiCloneFit` Figure 4. * :cite:`SCARLET` Figure 4. The size is n_cells × n_muts = 78 × 25 Parameters ---------- readcount : :obj:`str` Return the readcount information of the data. Returns ------- :class:`anndata.AnnData` An anndata in which `.X` is the input noisy. - `.layers['solution_fig7a']` is the solution presented in Figure 7a of PhISCS paper. - `.layers['solution_fig7b']` is the solution presented in Figure 7b of PhISCS paper. - `.uns['params_fig7a']` is parameters used as input to get 'solution_7a'. - `.uns['params_fig7b']` is parameters used as input to get 'solution_7b'. - `.var` includes information of the bulk samples. """ if readcount: adata = tsc.io.read( tsc.ul.get_file("trisicell.datasets/real/colorectal2.rc.h5ad") ) else: adata = tsc.io.read(tsc.ul.get_file("trisicell.datasets/real/colorectal2.h5ad")) return adata def high_grade_serous_ovarian_cancer_3celllines(): """High Grade Serous Ovarian Cancer (3 cell lines). This dataset was introduced in :cite:`Laks_2019` and preprocessed in :cite:`McPherson_2019`. The phylogeny is presented in Figure 3H. The size is n_cells × n_muts = 891 × 13666 Note that 402 mutations were deleted during evolution in the inferred tree by original study So, here they were filtered out (meaning they are 14068 mutations in total in the original study). Returns ------- :class:`anndata.AnnData` An anndata in which `.X` is the input noisy (it is obtained based on the number of mutant/total reads). - `.obs['clone_id']` is the clone id to which the cell is assigned in Figure 3H. - `.obs['group_color']` is unique colors for each 'clone_id'. - `.obs['cell_name']` is a new name for each cell based on the 'group_color'. - `.layers['mutant']` is the number of mutant reads at each locus in each cell. - `.layers['total']` is the total number of reads at each locus in each cell. - `.layers['ground']` is the solution inferred in Figure 3H of the original paper. - `.uns['params_ground']` is parameters inferred by comparing ground and noisy matrices. - `.var` includes information of the bulk samples. Examples -------- >>> sc = tsc.datasets.high_grade_serous_ovarian_cancer_3celllines() >>> print(sc) AnnData object with n_obs × n_vars = 891 × 13666 obs: 'clone_id', 'group_color', 'cell_name' uns: 'params_ground' layers: 'ground', 'mutant', 'total' """ adata = tsc.io.read(tsc.ul.get_file("trisicell.datasets/real/ovarian.h5ad.gz")) return adata