OpenMS-PEP Google Summer of Code 2018

IPRG 2016 Study

The Proteome Informatics Research Group (iPRG) is a collective within the ABRF (The Association of Biomolecular Resource Facilities) promoting best practices of bioinformatic tools in the analysis of proteomics data. In 2016, the iPRG challenged the scientific community with a study: Inferring Proteoforms from Bottom-up Proteomics Data. To this end, the iPRG produced and published MS data from known pools of proteins. The study is composed of four E. coli samples with three replicates each, prepared by spiking different combinations of PrESTs (Protein Epitope Signature Tags; # = 383) into a common background. Given two pools of known proteins (A and B), the samples are explained as:

  • Sample A: Pool A (192) + Pool B (191)
  • Sample B: Pool B (191)
  • Sample C: Pool A (192)
  • Sample D: blank (only background)

For fair comparison of different techniques used, a common database was given as well containing 5592 entries where 383 are PrESTs present in the samples, 1000 are PrESTs not present in any sample, and the rest are other E. coli “background” proteins.

IDPEP and Percolator are compared for each of the 12 data sets. Only PrESTs are considered for evaluation. Percolator results are reported for train and test FDR = 0.05; results for train and test FDR = 0.01 look similar. Sample D is often ignored since it contains no true positives.

%matplotlib inline
import os
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import auc
import analyse
# Maps each sample label to the PrEST pools spiked into it
# (see the sample description above; sample D is background only).
_MAP_SAMPLE_TO_POOL = {
    "A": ["A", "B"],
    "B": ["B"],
    "C": ["A"],
    "D": []
}
# PEP cutoffs scanned when counting identifications below. PSMs are kept
# when their PEP is strictly below the threshold, so the leading 0 always
# yields an empty selection (the curve's left anchor point).
_PEP_THRESH_LIST = [
    0,
    0.000001, 0.000005,
    0.00001, 0.00005,
    0.0001, 0.0005,
    0.001, 0.005,
    0.01, 0.05,
    0.1
]

Read in data

# Collect result files to compare: IDPEP outputs plus Percolator runs
# with train/test FDR = 0.05 (marked by "0.05" in the filename).
filenames = [
    f for f in os.listdir(".")
    if f.endswith(".json") and
    ("IDPEP" in f or "0.05" in f)
]
# data maps (sample, replicate, engine, method) -> list of PSM dicts.
data = {}
for fn in filenames:
    # Filename layout assumed to be "<sample><replicate>_<engine>_...",
    # e.g. "A1_msgf_....json" -- TODO confirm against the actual files.
    sample, rep = fn[0], int(fn[1])
    engine = fn.split("_")[1]
    # NOTE(review): this checks for lowercase "percolator" only; verify the
    # filenames never spell it "Percolator", or those runs would be
    # mislabelled as IDPEP.
    method = "Percolator" if "percolator" in fn else "IDPEP"

    key = (sample, rep, engine, method)
    with open(fn) as f:
        data[key] = json.load(f)
# Pairs of (pool label, path of the file listing that pool's PrESTs).
pools_fn = []
for pool_key in ["A", "B"]:
    pool_path = "../mixtures/prest_pool_{}.txt".format(pool_key.lower())
    pools_fn.append((pool_key, pool_path))
pools_fn
[('A', '../mixtures/prest_pool_a.txt'), ('B', '../mixtures/prest_pool_b.txt')]
def read_pool(fn):
    """Read a pool definition file and return its lines without trailing whitespace."""
    with open(fn) as handle:
        return [line.rstrip() for line in handle]
# Protein lists per pool, keyed by pool label ("A" / "B").
pools = {}
for pool_label, pool_path in pools_fn:
    pools[pool_label] = read_pool(pool_path)

Distributions

def is_target(psm):
    """Return True if any protein of this PSM is a PrEST (accession starts with "HPRR")."""
    for protein in psm["proteins"]:
        if protein.startswith("HPRR"):
            return True
    return False
def is_decoy(psm):
    """Return True if any protein of this PSM is a decoy (accession starts with "random_")."""
    for protein in psm["proteins"]:
        if protein.startswith("random_"):
            return True
    return False
def is_correct(psm, sample):
    """Return True if the PSM matches a PrEST actually spiked into *sample*."""
    # Union of all PrESTs present in this sample's pools.
    present = set()
    for pool_key in _MAP_SAMPLE_TO_POOL[sample]:
        present.update(pools[pool_key])

    # PrEST accessions reported for this PSM.
    detected = {p for p in psm["proteins"] if p.startswith("HPRR")}

    return len(detected & present) > 0
def plot_feature_distributions(feature, methods=("IDPEP", "Percolator")):
    """Plot histograms of *feature* for correct vs. incorrect target PSMs.

    One figure per (sample, replicate), one subplot per search engine.
    Correct identifications are drawn as translucent filled histograms,
    incorrect ones as step outlines; each method gets its own colour.

    Parameters:
        feature: key of the PSM attribute to plot (e.g. "PEP", "SVM score").
            PSMs lacking this key are skipped.
        methods: rescoring methods to overlay. The default is a tuple, not
            a list, to avoid Python's shared-mutable-default pitfall.
    """
    for sample in "ABCD":
        for rep in [1, 2, 3]:
            fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
            for i, engine in enumerate(["xtandem", "msgf", "comet"]):
                for j, method in enumerate(methods):
                    key = (sample, rep, engine, method)
                    if key not in data:
                        continue
                    # Only target PSMs that carry the requested feature.
                    target_psms = [
                        psm for psm in data[key]
                        if feature in psm and is_target(psm)
                    ]

                    correct_peps = [
                        psm[feature] for psm in target_psms
                        if is_correct(psm, sample)
                    ]
                    incorrect_peps = [
                        psm[feature] for psm in target_psms
                        if not is_correct(psm, sample)
                    ]

                    color = "C{}".format(j)
                    axarr[i].hist(correct_peps, bins=50, color=color, label="{} (correct)".format(method), alpha=0.3)
                    axarr[i].hist(incorrect_peps, bins=50, color=color, label="{} (incorrect)".format(method), histtype="step")
                    axarr[i].set_xlabel(feature)
                    axarr[i].set_ylabel("Count")
                    axarr[i].legend()
                    axarr[i].set_title(engine)
            plt.suptitle("Sample {}{}".format(sample, rep))
            plt.show(fig)
            plt.close(fig)
def plot_feature_distributions_decoys(feature, methods=("Percolator",)):
    """Plot histograms of *feature* for decoy PSMs (accessions "random_...").

    One figure per (sample, replicate), one subplot per search engine,
    colour-coded per method.

    Parameters:
        feature: key of the PSM attribute to plot (e.g. "PEP", "SVM score").
            PSMs lacking this key are skipped.
        methods: rescoring methods to overlay. The default is a tuple, not
            a list, to avoid Python's shared-mutable-default pitfall.
    """
    for sample in "ABCD":
        for rep in [1, 2, 3]:
            fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
            for i, engine in enumerate(["xtandem", "msgf", "comet"]):
                for j, method in enumerate(methods):
                    key = (sample, rep, engine, method)
                    if key not in data:
                        continue
                    decoy_peps = [
                        psm[feature] for psm in data[key]
                        if feature in psm and is_decoy(psm)
                    ]

                    axarr[i].hist(decoy_peps, bins=50, color="C{}".format(j), label="{} (decoys)".format(method))
                    axarr[i].set_xlabel(feature)
                    axarr[i].set_ylabel("Count")
                    axarr[i].legend()
                    axarr[i].set_title(engine)
            plt.suptitle("Sample {}{}".format(sample, rep))
            plt.show(fig)
            plt.close(fig)

PEPs

plot_feature_distributions("PEP")

png

png

png

png

png

png

png

png

png

png

png

png

PEPs of decoys

plot_feature_distributions_decoys("PEP")

png

png

png

png

png

png

png

png

png

png

png

png

SVM Scores

plot_feature_distributions("SVM score", methods=["Percolator"])

png

png

png

png

png

png

png

png

png

png

png

png

Note that Percolator’s SVM score seems to be quite successful in discriminating correct and incorrect identifications for sample B.

SVM Scores of decoys

plot_feature_distributions_decoys("SVM score")

png

png

png

png

png

png

png

png

png

png

png

png

Correct Identifications

def get_pool_proteins(sample):
    """Return all PrEST accessions present in *sample* (its pools concatenated)."""
    proteins = []
    for pool_key in _MAP_SAMPLE_TO_POOL[sample]:
        proteins.extend(pools[pool_key])
    return proteins
def plot_correct_identifications(absolute=True):
    """Plot correct PrEST identifications against the PEP cutoff.

    For each threshold in _PEP_THRESH_LIST, counts target PSMs that are
    correct (per is_correct) with PEP strictly below the threshold, and
    tallies the matched present PrEST accessions (total and unique).

    Parameters:
        absolute: if True, plot absolute counts (total plus unique hits);
            if False, plot only the unique hits as a fraction of the
            PrESTs present in the sample, with the y-axis fixed to [0, 1].
    """
    for sample in "ABC":
        present_proteins = get_pool_proteins(sample)
        n_present_proteins = len(present_proteins)
        # Set for O(1) membership tests in the per-PSM loop below; the
        # original list scan made each lookup O(n).
        present_set = set(present_proteins)
        for rep in [1, 2, 3]:
            fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
            for i, engine in enumerate(["xtandem", "msgf", "comet"]):
                for j, method in enumerate(["IDPEP", "Percolator"]):
                    key = (sample, rep, engine, method)
                    if key not in data:
                        continue
                    target_psms = [
                        psm for psm in data[key]
                        if "PEP" in psm and is_target(psm)
                    ]

                    n_correct, n_unique = [], []
                    for pep in _PEP_THRESH_LIST:
                        # All present-PrEST hits of correct PSMs below the
                        # cutoff; duplicates count once per PSM occurrence.
                        correct_proteins = []
                        for psm in target_psms:
                            if not (is_correct(psm, sample) and psm["PEP"] < pep):
                                continue
                            correct_proteins.extend(
                                p for p in psm["proteins"]
                                if p.startswith("HPRR") and p in present_set
                            )

                        n_correct.append(len(correct_proteins))
                        n_unique.append(len(set(correct_proteins)))

                    if not absolute:
                        # Fraction of the sample's PrESTs recovered.
                        n_unique = [n / n_present_proteins for n in n_unique]

                    if absolute:
                        axarr[i].plot(_PEP_THRESH_LIST, n_correct, color="C{}".format(j), label="{}".format(method))
                    axarr[i].plot(_PEP_THRESH_LIST, n_unique, color="C{}".format(j), label="{} (unique)".format(method), ls="--")
                    axarr[i].set_xlabel("PEP")
                    axarr[i].set_ylabel("Number of correct identifications")
                    axarr[i].legend()
                    axarr[i].set_title(engine)

                    if not absolute:
                        axarr[i].set_ylim(0, 1)
            plt.suptitle("Sample {}{}".format(sample, rep))
            plt.show(fig)
            plt.close(fig)

Number of correct identifications

plot_correct_identifications()

png

png

png

png

png

png

png

png

png

IDPEP seems to perform poorly for data preprocessed with X!Tandem. For sample A, IDPEP could not even fit the data; for other samples the data could be fitted but results do not look good. In the case of X!Tandem, using Percolator brings a great advantage.

For search engines MSGF+ and Comet, using Percolator (compared to IDPEP) results in more correct PSMs found - at least when duplicates are allowed. Percolator improves MSGF+ results more than Comet results.

Proportion of correct identifications

plot_correct_identifications(absolute=False)

png

png

png

png

png

png

png

png

png

Here, we focus on uniquely correctly identified PSMs. In general, the differences between IDPEP and Percolator are much smaller for unique correct identifications. However, Percolator does have an advantage over IDPEP, especially for PEPs < 0.05 (which is the interesting region).

ROC curves

for sample in list("ABCD"):
    for rep in [1, 2, 3]:
        for engine in ["xtandem", "msgf", "comet"]:
            for method in ["IDPEP", "Percolator"]:
                key = (sample, rep, engine, method)
                if key in data:
                    d = data[key]
                    for psm in d:
                        psm["correct"] = is_correct(psm, sample)
for sample in list("ABC"):
    for rep in [1, 2, 3]:
        fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
        for i, engine in enumerate(["xtandem", "msgf", "comet"]):
            for j, method in enumerate(["IDPEP", "Percolator"]):
                key = (sample, rep, engine, method)
                if key in data:
                    d = data[key]
                    d = [psm for psm in d if "PEP" in psm]
                    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
                    auc_val = auc(fp_rates, tp_rates)
                    axarr[i].plot([0, 1], [0, 1], ls="--", color="k")
                    axarr[i].plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(method, auc_val), color="C{}".format(j))
                    axarr[i].set_xlabel("FPR")
                    axarr[i].set_ylabel("TPR")
                    axarr[i].legend()
                    axarr[i].set_title(engine)
                    axarr[i].set_xlim(0 - 0.05, 1 + 0.05)
                    axarr[i].set_ylim(0 - 0.05, 1 + 0.05)
        plt.suptitle("Sample {}{}".format(sample, rep));
        plt.show(fig)
        plt.close(fig)

png

png

png

png

png

png

png

png

png

For almost all samples and replicates, Percolator performs better than IDPEP.

Precision-Recall Curves

for sample in list("ABC"):
    for rep in [1, 2, 3]:
        fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
        for i, engine in enumerate(["xtandem", "msgf", "comet"]):
            for j, method in enumerate(["IDPEP", "Percolator"]):
                key = (sample, rep, engine, method)
                if key in data:
                    d = data[key]
                    d = [psm for psm in d if "PEP" in psm]
                    thresh_list, precision, recall = analyse.precision_recall_curve(d)
                    axarr[i].plot([0, 1], [0.5, 0.5], ls="--", color="k")
                    axarr[i].plot(recall, precision, label=method, color="C{}".format(j))
                    axarr[i].set_xlabel("Recall")
                    axarr[i].set_ylabel("Precision")
                    axarr[i].legend()
                    axarr[i].set_title(engine)
                    axarr[i].set_xlim(0 - 0.05, 1 + 0.05)
                    axarr[i].set_ylim(0 - 0.05, 1 + 0.05)
        plt.suptitle("Sample {}{}".format(sample, rep));
        plt.show(fig)
        plt.close(fig)

png

png

png

png

png

png

png

png

png

I am not sure what happened here.

Actual vs computed PEP

for sample in list("ABC"):
    for rep in [1, 2, 3]:
        fig, axarr = plt.subplots(1, 3, figsize=(20, 5))
        for i, engine in enumerate(["xtandem", "msgf", "comet"]):
            for j, method in enumerate(["IDPEP", "Percolator"]):
                key = (sample, rep, engine, method)
                if key in data:
                    d = data[key]
                    d = [psm for psm in d if "PEP" in psm]
                    actual, computed = analyse.actual_vs_computed_pep(d)
                    axarr[i].plot([0, 1], [0, 1], ls="--", color="k")
                    axarr[i].plot(computed, actual, label=method, color="C{}".format(j))
                    axarr[i].set_xlabel("Computed PEP")
                    axarr[i].set_ylabel("Actual PEP")
                    axarr[i].legend()
                    axarr[i].set_title(engine)
                    axarr[i].set_xlim(0 - 0.05, 1 + 0.05)
                    axarr[i].set_ylim(0 - 0.05, 1 + 0.05)
        plt.suptitle("Sample {}{}".format(sample, rep));
        plt.show(fig)
        plt.close(fig)

png

png

png

png

png

png

png

png

png

From a theoretical perspective, this doesn’t look too good since all plotted lines are far from the “ideal” case, a diagonal line. It is important to note, however, that a perfect diagonal line results from all incorrect identifications being assigned a PEP of exactly 1, and all correct identifications being assigned a PEP of exactly 0. In a practical scenario, that’s not necessarily desired. So, I think, this kind of measure is not sensitive to the problem at hand in the first place.