Mokume Python API Reference

Python API

Quantification

Factory Function

from mokume.quantification import get_quantification_method, list_quantification_methods

# Create method by name
method = get_quantification_method("top3")        # TopNQuantification(n=3)
method = get_quantification_method("top10")       # TopNQuantification(n=10)
method = get_quantification_method("maxlfq", min_peptides=2, threads=4)

# List available methods
available = list_quantification_methods()
# {'topn': True, 'maxlfq': True, 'directlfq': False, 'sum': True}

TopNQuantification

from mokume.quantification import TopNQuantification

quant = TopNQuantification(n=3)  # or n=5, n=10, etc.
result = quant.quantify(
    peptides,
    protein_column="ProteinName",
    peptide_column="PeptideSequence",
    intensity_column="NormIntensity",
    sample_column="SampleID",
)

MaxLFQQuantification

from mokume.quantification import MaxLFQQuantification

quant = MaxLFQQuantification(
    min_peptides=2,      # Min peptides for MaxLFQ (uses median for fewer)
    threads=4,           # Parallel cores (-1 for all)
    force_builtin=False, # Force built-in implementation
)
result = quant.quantify(peptides, protein_column="ProteinName", ...)

# Check backend
quant.using_directlfq  # True/False
quant.name             # "MaxLFQ (DirectLFQ)" or "MaxLFQ (built-in)"

DirectLFQQuantification

from mokume.quantification import is_directlfq_available

if is_directlfq_available():
    from mokume.quantification import DirectLFQQuantification
    quant = DirectLFQQuantification(min_nonan=2)
    result = quant.quantify(peptides, protein_column="ProteinName", ...)

AllPeptidesQuantification (Sum)

from mokume.quantification import AllPeptidesQuantification

quant = AllPeptidesQuantification()
result = quant.quantify(peptides, protein_column="ProteinName", ...)

peptides_to_protein (iBAQ)

from mokume.quantification import peptides_to_protein

peptides_to_protein(
    fasta="proteome.fasta",
    peptides="peptides.csv",
    enzyme="Trypsin",
    normalize=True,
    tpa=True,
    ruler=True,
    ploidy=2,
    cpc=200,
    organism="human",
    output="proteins-ibaq.tsv",
    min_aa=7,
    max_aa=30,
    verbose=True,
    qc_report="QC.pdf",
)

Pipeline

PipelineConfig

from mokume.pipeline import QuantificationPipeline, PipelineConfig
from mokume.pipeline.config import (
    InputConfig,
    FilterConfig,
    NormalizationConfig,
    QuantificationConfig,
    IRSConfig,
    BatchCorrectionConfig,
    DEConfig,
    OutputConfig,
)

config = PipelineConfig(
    input=InputConfig(
        parquet="data.parquet",
        sdrf="experiment.sdrf.tsv",
        fasta_file="proteome.fasta",
    ),
    filtering=FilterConfig(
        min_aa=7,
        min_unique_peptides=2,
        remove_contaminants=True,
    ),
    normalization=NormalizationConfig(
        run_method="median",
        sample_method="globalMedian",
    ),
    quantification=QuantificationConfig(
        method="maxlfq",
    ),
)

pipeline = QuantificationPipeline(config)
proteins = pipeline.run()

features_to_proteins (functional API)

from mokume.pipeline import features_to_proteins

proteins = features_to_proteins(
    parquet="data.parquet",
    output="proteins.csv",
    sdrf="experiment.sdrf.tsv",
    quant_method="maxlfq",
    run_normalization="median",
    sample_normalization="globalMedian",
    batch_correction=True,
    batch_method="sample_prefix",
    batch_covariates=["characteristics[sex]"],
)

Normalization

Peptide Normalization Pipeline

from mokume.normalization.peptide import peptide_normalization

peptide_normalization(
    parquet="features.parquet",
    sdrf="experiment.sdrf.tsv",
    min_aa=7,
    min_unique=2,
    remove_ids=None,
    remove_decoy_contaminants=True,
    remove_low_frequency_peptides=True,
    output="peptides.csv",
    skip_normalization=False,
    nmethod="median",          # Feature-level: mean, median, iqr, none
    pnmethod="globalMedian",   # Sample-level: globalMedian, conditionMedian, hierarchical, none
    log2=True,
    save_parquet=False,
)

IRS Normalization

from mokume.normalization.irs import (
    IRSNormalizer,
    detect_pooled_from_sdrf,
    detect_plexes_from_sdrf,
)

ref_samples = detect_pooled_from_sdrf("experiment.sdrf.tsv")
sample_to_plex = detect_plexes_from_sdrf("experiment.sdrf.tsv")

normalizer = IRSNormalizer(reference_samples=ref_samples, stat="median")
protein_df = normalizer.fit_transform(protein_df, sample_to_plex)

Preprocessing Filters

from mokume.preprocessing.filters import (
    load_filter_config,
    save_filter_config,
    generate_example_config,
    get_filter_pipeline,
)
from mokume.model.filters import PreprocessingFilterConfig

# Generate example
generate_example_config("filters.yaml")

# Load from file
config = load_filter_config("filters.yaml")

# Create programmatically
config = PreprocessingFilterConfig(name="custom", enabled=True)
config.intensity.min_intensity = 1000.0
config.peptide.allowed_charge_states = [2, 3, 4]
config.protein.min_unique_peptides = 2
config.run_qc.max_missing_rate = 0.5

# Apply CLI-style overrides
config.apply_overrides({
    "min_intensity": 500,
    "charge_states": [2, 3],
    "max_missing_rate": 0.3,
})

# Save
save_filter_config(config, "my_filters.yaml")

# Apply
pipeline = get_filter_pipeline(config)
filtered_df, results = pipeline.apply(df)

for result in results:
    print(f"{result.filter_name}: removed {result.removed_count} ({result.removal_rate:.1%})")

summary = pipeline.summary(results)
print(f"Total removed: {summary['total_removed']} / {summary['total_input']}")

Postprocessing

Batch Correction

from mokume.postprocessing import (
    apply_batch_correction,
    detect_batches,
    extract_covariates_from_sdrf,
)

batch_indices = detect_batches(
    sample_ids=df_wide.columns.tolist(),
    method="sample_prefix",  # "sample_prefix", "run", or "column"
)

covariates = extract_covariates_from_sdrf(
    "experiment.sdrf.tsv",
    sample_ids=df_wide.columns.tolist(),
    covariate_columns=["characteristics[sex]"],
)

df_corrected = apply_batch_correction(
    df=df_wide, batch=batch_indices, covs=covariates,
)

Data Reshaping

from mokume.postprocessing.reshape import (
    pivot_wider,
    pivot_longer,
    remove_samples_low_protein_number,
    remove_missing_values,
    describe_expression_metrics,
)

# Long to wide
df_wide = pivot_wider(df, row_name="ProteinName", col_name="SampleID", values="Ibaq")

# Wide to long
df_long = pivot_longer(df_wide, row_name="ProteinName", col_name="SampleID", values="Ibaq")

# Quality filtering
df = remove_samples_low_protein_number(df, min_protein_num=100)
df = remove_missing_values(df, missingness_percentage=20, expression_column="Ibaq")

# Statistics
metrics = describe_expression_metrics(df)

IO

AnnData

from mokume.io.parquet import create_anndata

adata = create_anndata(
    df,
    obs_col="SampleID",
    var_col="ProteinName",
    value_col="Ibaq",
    layer_cols=["IbaqNorm", "IbaqLog"],
    obs_metadata_cols=["Condition"],
    var_metadata_cols=["GeneName"],
)
adata.write("proteins.h5ad")

FASTA

from mokume.io.fasta import (
    load_fasta,
    digest_protein,
    extract_fasta,
    get_protein_molecular_weights,
)

proteins = load_fasta("proteome.fasta")

peptides = digest_protein(
    sequence="MKWVTFISLLFLFSSAYS...",
    enzyme="Trypsin",
    min_aa=7, max_aa=30,
)

unique_peptide_counts, mw_dict, found_proteins = extract_fasta(
    fasta="proteome.fasta",
    enzyme="Trypsin",
    proteins=["P12345", "P67890"],
    min_aa=7, max_aa=30,
    tpa=True,
)

mw_dict = get_protein_molecular_weights("proteome.fasta", ["P12345", "P67890"])

Organism Metadata

from mokume.model.organism import OrganismDescription

organisms = OrganismDescription.registered_organisms()
# ['human', 'mouse', 'yeast', 'drome', 'caeel', 'schpo']

human = OrganismDescription.get("human")
print(human.genome_size)
print(human.histone_entries)