OpenMS-PEP Google Summer of Code 2018

PEP Protein Analysis - Comet

%matplotlib inline
import json
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.metrics import auc
from utils import annotate_correctness
import analyse

Read and prepare data

### idpep_fn = "OR20070924_S_mix7_02-10_xtandem_pmt15ppm_target_decoy_merged_IDPEP_pp_v2_fido.json"
percolator_fn = "OR20070924_S_mix7_02-10_comet_pmt15ppm_o1_percolator_fido_F0.05_t0.05_protein_target_pout_psms.json"
percolator_decoy_fn = "OR20070924_S_mix7_02-10_comet_pmt15ppm_o1_percolator_fido_F0.05_t0.05_protein_decoy_pout_psms.json"
data = {}
for key, fn in [
###     ("IDPEP", idpep_fn),
    ("Percolator", percolator_fn)
]:
    with open(fn) as f:
        data[key] = json.load(f)
    data[key] = annotate_correctness(data[key], protein_key="accession")
with open(percolator_decoy_fn) as f:
    decoy = json.load(f)
decoy = annotate_correctness(decoy, protein_key="accession")

Stats

print("key", "Number of Proteins", "Number of unique proteins", sep="\t")
for key, d in data.items():
    print(key, analyse.n_psms(d), analyse.n_unique_peptides(d, protein_key="accession"), sep="\t")
key	Number of Proteins	Number of unique proteins
Percolator	700	700

Number of correct identifications

for i, (key, d) in enumerate(data.items()):
    thresh_list, n_correct_ident = analyse.n_correct_identifications(d)
    _, n_unique_ident = analyse.n_unique_correct_identifications(d, protein_key="accession")
    plt.plot(thresh_list, n_correct_ident, label=key, color="C{}".format(i))
    plt.plot(thresh_list, n_unique_ident, label="{} (unique)".format(key), color="C{}".format(i), ls="--")
plt.title("Number of correct identifications")
plt.xlabel("PEP")
plt.ylabel("Number of correct identifications")
plt.legend();

png

PEP distributions

### fig, axarr = plt.subplots(1, 2, figsize=(20, 5))
### for i, (key, d) in enumerate(data.items()):
###     correct_peps = analyse.reduce_to_list(analyse.correct_psms(d))
###     incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(d))
###     axarr[i].hist(correct_peps, bins=50, color="C{}".format(i), label="{} (correct)".format(key), alpha=0.3)
###     axarr[i].hist(incorrect_peps, bins=50, color="C{}".format(i), label="{} (incorrect)".format(key), histtype="step")
###     axarr[i].set_xlabel("PEP")
###     axarr[i].set_ylabel("Count")
###     axarr[i].legend()
### decoy_peps = analyse.reduce_to_list(decoy)
### plt.suptitle("PEP distributions");
plt.figure(figsize=(10, 5))
correct_peps = analyse.reduce_to_list(analyse.correct_psms(data["Percolator"]))
incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(data["Percolator"]))
decoy_peps = analyse.reduce_to_list(decoy)
plt.hist(correct_peps, bins=50, color="C1", label="Percolator (correct)", alpha=0.3)
plt.hist(incorrect_peps, bins=50, color="C1", label="Percolator (incorrect)", histtype="step")
plt.hist(decoy_peps, bins=50, color="k", label="Percolator (decoys)", histtype="step")
plt.legend()
plt.xlabel("PEP")
plt.ylabel("Count")
plt.title("PEP distributions");

png

ROC curves

imp_pep = [0.01, 0.05]
pep_inds = [
    analyse._DEFAULT_PEP_THRESH_LIST.index(pep)
    for pep in imp_pep
]
pep_inds
[9, 10]
percolator_joined = data["Percolator"] + decoy
for i, (key, d) in enumerate(data.items()):
    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
    plt.plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(key, auc(tp_rates, fp_rates)), color="C{}".format(i))
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, tp_rates, fp_rates = analyse.roc_curve(percolator_joined)
plt.plot(fp_rates, tp_rates, label="Percolator (including decoys; AUC={:.2f})".format(auc(tp_rates, fp_rates)), color="C3", ls="--")
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot([0, 1], [0, 1], ls="--", color="k")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Precision-Recall curves

for i, (key, d) in enumerate(data.items()):
    thresh_list, precision, recall = analyse.precision_recall_curve(d)
    plt.plot(recall, precision, label=key)
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, precision, recall = analyse.precision_recall_curve(percolator_joined)
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot(recall, precision, label="Percolator (including decoys)", color="C3", ls="--")
plt.plot([0, 1], [0.5, 0.5], ls="--", color="k")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Precision-Recall Curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Actual vs computed PEP

plt.figure(figsize=(6, 6))
for i, (key, d) in enumerate(data.items()):
    actual, computed = analyse.actual_vs_computed_pep(d)
    plt.plot(computed, actual, label=key)
actual, computed = analyse.actual_vs_computed_pep(percolator_joined)
plt.plot(computed, actual, label="Percolator (including decoys)", color="C1", ls=":")
plt.plot([0, 1], [0, 1], color="k", ls="--")
plt.xlabel("Computed PEP")
plt.ylabel("Actual PEP")
plt.xlim(0 - 0.05, 1 + 0.05)
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Actual vs. Computed PEP")
plt.legend();

png

PEP Protein Analysis - MSGF+

%matplotlib inline
import json
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.metrics import auc
from utils import annotate_correctness
import analyse

Read and prepare data

idpep_fn = "OR20070924_S_mix7_02-10_msgf_fmHCD_iHighRes_pmt15_target_decoy_merged_IDPEP_v2_fido.json"
percolator_fn = "OR20070924_S_mix7_02-10_msgf_fmHCD_iHighRes_pmt15_percolator_pin_v3_fido_F0.05_t0.05_protein_target_pout_psms.json"
percolator_decoy_fn = "OR20070924_S_mix7_02-10_msgf_fmHCD_iHighRes_pmt15_percolator_pin_v3_fido_F0.05_t0.05_protein_decoy_pout_psms.json"
data = {}
for key, fn in [
    ("IDPEP", idpep_fn),
    ("Percolator", percolator_fn)
]:
    with open(fn) as f:
        data[key] = json.load(f)
    data[key] = annotate_correctness(data[key], protein_key="accession")
with open(percolator_decoy_fn) as f:
    decoy = json.load(f)
decoy = annotate_correctness(decoy, protein_key="accession")

Stats

print("key", "Number of Proteins", "Number of unique proteins", sep="\t")
for key, d in data.items():
    print(key, analyse.n_psms(d), analyse.n_unique_peptides(d, protein_key="accession"), sep="\t")
key	Number of Proteins	Number of unique proteins
IDPEP	1439	1439
Percolator	688	688

Number of correct identifications

for i, (key, d) in enumerate(data.items()):
    thresh_list, n_correct_ident = analyse.n_correct_identifications(d)
    _, n_unique_ident = analyse.n_unique_correct_identifications(d, protein_key="accession")
    plt.plot(thresh_list, n_correct_ident, label=key, color="C{}".format(i))
    plt.plot(thresh_list, n_unique_ident, label="{} (unique)".format(key), color="C{}".format(i), ls="--")
plt.title("Number of correct identifications")
plt.xlabel("PEP")
plt.ylabel("Number of correct identifications")
plt.legend();

png

PEP distributions

fig, axarr = plt.subplots(1, 2, figsize=(20, 5))
for i, (key, d) in enumerate(data.items()):
    correct_peps = analyse.reduce_to_list(analyse.correct_psms(d))
    incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(d))
    axarr[i].hist(correct_peps, bins=50, color="C{}".format(i), label="{} (correct)".format(key), alpha=0.3)
    axarr[i].hist(incorrect_peps, bins=50, color="C{}".format(i), label="{} (incorrect)".format(key), histtype="step")
    axarr[i].set_xlabel("PEP")
    axarr[i].set_ylabel("Count")
    axarr[i].legend()
decoy_peps = analyse.reduce_to_list(decoy)
plt.suptitle("PEP distributions");

png

plt.figure(figsize=(10, 5))
correct_peps = analyse.reduce_to_list(analyse.correct_psms(data["Percolator"]))
incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(data["Percolator"]))
decoy_peps = analyse.reduce_to_list(decoy)
plt.hist(correct_peps, bins=50, color="C1", label="Percolator (correct)", alpha=0.3)
plt.hist(incorrect_peps, bins=50, color="C1", label="Percolator (incorrect)", histtype="step")
plt.hist(decoy_peps, bins=50, color="k", label="Percolator (decoys)", histtype="step")
plt.legend()
plt.xlabel("PEP")
plt.ylabel("Count")
plt.title("PEP distributions");

png

ROC curves

imp_pep = [0.01, 0.05]
pep_inds = [
    analyse._DEFAULT_PEP_THRESH_LIST.index(pep)
    for pep in imp_pep
]
pep_inds
[9, 10]
percolator_joined = data["Percolator"] + decoy
for i, (key, d) in enumerate(data.items()):
    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
    plt.plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(key, auc(tp_rates, fp_rates)), color="C{}".format(i))
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, tp_rates, fp_rates = analyse.roc_curve(percolator_joined)
plt.plot(fp_rates, tp_rates, label="Percolator (including decoys; AUC={:.2f})".format(auc(tp_rates, fp_rates)), color="C3", ls="--")
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot([0, 1], [0, 1], ls="--", color="k")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Precision-Recall curves

for i, (key, d) in enumerate(data.items()):
    thresh_list, precision, recall = analyse.precision_recall_curve(d)
    plt.plot(recall, precision, label=key)
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, precision, recall = analyse.precision_recall_curve(percolator_joined)
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot(recall, precision, label="Percolator (including decoys)", color="C3", ls="--")
plt.plot([0, 1], [0.5, 0.5], ls="--", color="k")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Precision-Recall Curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Actual vs computed PEP

plt.figure(figsize=(6, 6))
for i, (key, d) in enumerate(data.items()):
    actual, computed = analyse.actual_vs_computed_pep(d)
    plt.plot(computed, actual, label=key)
actual, computed = analyse.actual_vs_computed_pep(percolator_joined)
plt.plot(computed, actual, label="Percolator (including decoys)", color="C1", ls=":")
plt.plot([0, 1], [0, 1], color="k", ls="--")
plt.xlabel("Computed PEP")
plt.ylabel("Actual PEP")
plt.xlim(0 - 0.05, 1 + 0.05)
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Actual vs. Computed PEP")
plt.legend();

png

Protein PEP analysis - X!Tandem

%matplotlib inline
import json
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.metrics import auc
from utils import annotate_correctness
import analyse

Read and prepare data

idpep_fn = "OR20070924_S_mix7_02-10_xtandem_pmt15ppm_target_decoy_merged_IDPEP_pp_v2_fido.json"
percolator_fn = "OR20070924_S_mix7_02-10_xtandem_pmt15ppm_percolator_v3_fido_F0.05_t0.05_protein_target_pout_psms.json"
percolator_decoy_fn = "OR20070924_S_mix7_02-10_xtandem_pmt15ppm_percolator_v3_fido_F0.05_t0.05_protein_decoy_pout_psms.json"
data = {}
for key, fn in [
    ("IDPEP", idpep_fn),
    ("Percolator", percolator_fn)
]:
    with open(fn) as f:
        data[key] = json.load(f)
    data[key] = annotate_correctness(data[key], protein_key="accession")
with open(percolator_decoy_fn) as f:
    decoy = json.load(f)
decoy = annotate_correctness(decoy, protein_key="accession")

Stats

print("key", "Number of Proteins", "Number of unique proteins", sep="\t")
for key, d in data.items():
    print(key, analyse.n_psms(d), analyse.n_unique_peptides(d, protein_key="accession"), sep="\t")
key	Number of Proteins	Number of unique proteins
IDPEP	717	717
Percolator	327	327

Number of correct identifications

for i, (key, d) in enumerate(data.items()):
    thresh_list, n_correct_ident = analyse.n_correct_identifications(d)
    _, n_unique_ident = analyse.n_unique_correct_identifications(d, protein_key="accession")
    plt.plot(thresh_list, n_correct_ident, label=key, color="C{}".format(i))
    plt.plot(thresh_list, n_unique_ident, label="{} (unique)".format(key), color="C{}".format(i), ls="--")
plt.title("Number of correct identifications")
plt.xlabel("PEP")
plt.ylabel("Number of correct identifications")
plt.legend();

png

PEP distributions

fig, axarr = plt.subplots(1, 2, figsize=(20, 5))
for i, (key, d) in enumerate(data.items()):
    correct_peps = analyse.reduce_to_list(analyse.correct_psms(d))
    incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(d))
    axarr[i].hist(correct_peps, bins=50, color="C{}".format(i), label="{} (correct)".format(key), alpha=0.3)
    axarr[i].hist(incorrect_peps, bins=50, color="C{}".format(i), label="{} (incorrect)".format(key), histtype="step")
    axarr[i].set_xlabel("PEP")
    axarr[i].set_ylabel("Count")
    axarr[i].legend()
decoy_peps = analyse.reduce_to_list(decoy)
plt.suptitle("PEP distributions");

png

plt.figure(figsize=(10, 5))
correct_peps = analyse.reduce_to_list(analyse.correct_psms(data["Percolator"]))
incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(data["Percolator"]))
decoy_peps = analyse.reduce_to_list(decoy)
plt.hist(correct_peps, bins=50, color="C1", label="Percolator (correct)", alpha=0.3)
plt.hist(incorrect_peps, bins=50, color="C1", label="Percolator (incorrect)", histtype="step")
plt.hist(decoy_peps, bins=50, color="k", label="Percolator (decoys)", histtype="step")
plt.legend()
plt.xlabel("PEP")
plt.ylabel("Count")
plt.title("PEP distributions");

png

ROC curves

imp_pep = [0.01, 0.05]
pep_inds = [
    analyse._DEFAULT_PEP_THRESH_LIST.index(pep)
    for pep in imp_pep
]
pep_inds
[9, 10]
percolator_joined = data["Percolator"] + decoy
for i, (key, d) in enumerate(data.items()):
    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
    plt.plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(key, auc(tp_rates, fp_rates)), color="C{}".format(i))
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, tp_rates, fp_rates = analyse.roc_curve(percolator_joined)
plt.plot(fp_rates, tp_rates, label="Percolator (including decoys; AUC={:.2f})".format(auc(tp_rates, fp_rates)), color="C3", ls="--")
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot([0, 1], [0, 1], ls="--", color="k")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Precision-Recall curves

for i, (key, d) in enumerate(data.items()):
    thresh_list, precision, recall = analyse.precision_recall_curve(d)
    plt.plot(recall, precision, label=key)
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, precision, recall = analyse.precision_recall_curve(percolator_joined)
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot(recall, precision, label="Percolator (including decoys)", color="C3", ls="--")
plt.plot([0, 1], [0.5, 0.5], ls="--", color="k")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Precision-Recall Curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Actual vs computed PEP

plt.figure(figsize=(6, 6))
for i, (key, d) in enumerate(data.items()):
    actual, computed = analyse.actual_vs_computed_pep(d)
    plt.plot(computed, actual, label=key)
actual, computed = analyse.actual_vs_computed_pep(percolator_joined)
plt.plot(computed, actual, label="Percolator (including decoys)", color="C1", ls=":")
plt.plot([0, 1], [0, 1], color="k", ls="--")
plt.xlabel("Computed PEP")
plt.ylabel("Actual PEP")
plt.xlim(0 - 0.05, 1 + 0.05)
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Actual vs. Computed PEP")
plt.legend();

png

PEP Peptide Analysis - Comet

Percolator = Percolator with train/test FDR = 0.05

For search results from Comet, IDPEP (a simplified reimplementation of PeptideProphet in OpenMS) was not able to fit the data.

Stats

print("key", "Number of PSMs", "Number of unique peptides", sep="\t")
for key, d in data.items():
    print(key, analyse.n_psms(d), analyse.n_unique_peptides(d), sep="\t")
key	Number of PSMs	Number of unique peptides
Percolator	24180	5299

Number of correct identifications

for i, (key, d) in enumerate(data.items()):
    thresh_list, n_correct_ident = analyse.n_correct_identifications(d)
    _, n_unique_ident = analyse.n_unique_correct_identifications(d)
    plt.plot(thresh_list, n_correct_ident, label=key, color="C{}".format(i))
    plt.plot(thresh_list, n_unique_ident, label="{} (unique)".format(key), color="C{}".format(i), ls="--")
plt.title("Number of correct identifications")
plt.xlabel("PEP")
plt.ylabel("Number of correct identifications")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

PEP distributions

plt.figure(figsize=(10, 5))
correct_peps = analyse.reduce_to_list(analyse.correct_psms(data["Percolator"]))
incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(data["Percolator"]))
decoy_peps = analyse.reduce_to_list(decoy)
plt.hist(correct_peps, bins=50, color="C1", label="Percolator (correct)", alpha=0.3)
plt.hist(incorrect_peps, bins=50, color="C1", label="Percolator (incorrect)", histtype="step")
plt.hist(decoy_peps, bins=50, color="k", label="Percolator (decoys)", histtype="step")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("PEP")
plt.ylabel("Count")
plt.title("PEP distributions");

png

ROC curves

imp_pep = [0.01, 0.05]
pep_inds = [
    analyse._DEFAULT_PEP_THRESH_LIST.index(pep)
    for pep in imp_pep
]
percolator_joined = data["Percolator"] + decoy
for i, (key, d) in enumerate(data.items()):
    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
    plt.plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(key, auc(tp_rates, fp_rates)), color="C{}".format(i))
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, tp_rates, fp_rates = analyse.roc_curve(percolator_joined)
plt.plot(fp_rates, tp_rates, label="Percolator (including decoys; AUC={:.2f})".format(auc(tp_rates, fp_rates)), color="C3", ls="--")
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot([0, 1], [0, 1], ls="--", color="k")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Precision-Recall curves

for i, (key, d) in enumerate(data.items()):
    thresh_list, precision, recall = analyse.precision_recall_curve(d)
    plt.plot(recall, precision, label=key)
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, precision, recall = analyse.precision_recall_curve(percolator_joined)
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot(recall, precision, label="Percolator (including decoys)", color="C3", ls="--")
plt.plot([0, 1], [0.5, 0.5], ls="--", color="k")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Precision-Recall Curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Actual vs computed PEP

How actual and computed PEPs are calculated:

  1. Sort PSMs accoring to PEPs; a sliding window (of size 100) then moves from the top to the bottom of that ordered list
  2. For each sliding window, the computed average PEP is simply is the mean of all reported PEPs in that window (i.e. sum(PEPs) / window_size), and the actual average PEP is the proportion of incorrect matches in the current sliding window (the idea here is that the actual PEP of a correct match is 0 while the actual PEP of an incorrect match is 1)

If actual and computed PEPs are plotted against each other, then a diagonal line is the ideal case.

plt.figure(figsize=(6, 6))
for i, (key, d) in enumerate(data.items()):
    actual, computed = analyse.actual_vs_computed_pep(d)
    plt.plot(computed, actual, label=key)
actual, computed = analyse.actual_vs_computed_pep(percolator_joined)
plt.plot(computed, actual, label="Percolator (including decoys)", color="C1", ls=":")
plt.plot([0, 1], [0, 1], color="k", ls="--")
plt.xlabel("Computed PEP")
plt.ylabel("Actual PEP")
plt.xlim(0 - 0.05, 1 + 0.05)
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Actual vs. Computed PEP")
plt.legend();

png

PEP Peptide Analysis - MSGF+

IDPEP = Simplified reimplementation of PeptideProphet in OpenMS
Percolator = Percolator with train/test FDR = 0.05

Stats

print("key", "Number of PSMs", "Number of unique peptides", sep="\t")
for key, d in data.items():
    print(key, analyse.n_psms(d), analyse.n_unique_peptides(d), sep="\t")
key	Number of PSMs	Number of unique peptides
IDPEP	37795	10763
Percolator	18725	5771

Number of correct identifications

for i, (key, d) in enumerate(data.items()):
    thresh_list, n_correct_ident = analyse.n_correct_identifications(d)
    _, n_unique_ident = analyse.n_unique_correct_identifications(d)
    plt.plot(thresh_list, n_correct_ident, label=key, color="C{}".format(i))
    plt.plot(thresh_list, n_unique_ident, label="{} (unique)".format(key), color="C{}".format(i), ls="--")
plt.title("Number of correct identifications")
plt.xlabel("PEP")
plt.ylabel("Number of correct identifications")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

PEP distributions

fig, axarr = plt.subplots(1, 2, figsize=(20, 5))
for i, (key, d) in enumerate(data.items()):
    correct_peps = analyse.reduce_to_list(analyse.correct_psms(d))
    incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(d))
    axarr[i].hist(correct_peps, bins=50, color="C{}".format(i), label="{} (correct)".format(key), alpha=0.3)
    axarr[i].hist(incorrect_peps, bins=50, color="C{}".format(i), label="{} (incorrect)".format(key), histtype="step")
    axarr[i].set_xlabel("PEP")
    axarr[i].set_ylabel("Count")
    axarr[i].legend()
decoy_peps = analyse.reduce_to_list(decoy)
plt.suptitle("PEP distributions");

png

plt.figure(figsize=(10, 5))
correct_peps = analyse.reduce_to_list(analyse.correct_psms(data["Percolator"]))
incorrect_peps = analyse.reduce_to_list(analyse.incorrect_psms(data["Percolator"]))
decoy_peps = analyse.reduce_to_list(decoy)
plt.hist(correct_peps, bins=50, color="C1", label="Percolator (correct)", alpha=0.3)
plt.hist(incorrect_peps, bins=50, color="C1", label="Percolator (incorrect)", histtype="step")
plt.hist(decoy_peps, bins=50, color="k", label="Percolator (decoys)", histtype="step")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("PEP")
plt.ylabel("Count")
plt.title("PEP distributions");

png

ROC curves

imp_pep = [0.01, 0.05]
pep_inds = [
    analyse._DEFAULT_PEP_THRESH_LIST.index(pep)
    for pep in imp_pep
]
percolator_joined = data["Percolator"] + decoy
for i, (key, d) in enumerate(data.items()):
    thresh_list, tp_rates, fp_rates = analyse.roc_curve(d)
    plt.plot(fp_rates, tp_rates, label="{} (AUC={:.2f})".format(key, auc(tp_rates, fp_rates)), color="C{}".format(i))
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, tp_rates, fp_rates = analyse.roc_curve(percolator_joined)
plt.plot(fp_rates, tp_rates, label="Percolator (including decoys; AUC={:.2f})".format(auc(tp_rates, fp_rates)), color="C3", ls="--")
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(fp_rates[ind], tp_rates[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot([0, 1], [0, 1], ls="--", color="k")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Precision-Recall curves

for i, (key, d) in enumerate(data.items()):
    thresh_list, precision, recall = analyse.precision_recall_curve(d)
    plt.plot(recall, precision, label=key)
    for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="{} (PEP={})".format(key, pep), color="C{}".format(i))
thresh_list, precision, recall = analyse.precision_recall_curve(percolator_joined)
for pep, ind, marker in zip(imp_pep, pep_inds, ["x", "o"]):
        plt.scatter(recall[ind], precision[ind], marker=marker, label="Percolator (including decoys; PEP={})".format(pep), color="C3")
plt.plot(recall, precision, label="Percolator (including decoys)", color="C3", ls="--")
plt.plot([0, 1], [0.5, 0.5], ls="--", color="k")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Precision-Recall Curve")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

png

Actual vs computed PEP

How actual and computed PEPs are calculated:

  1. Sort PSMs accoring to PEPs; a sliding window (of size 100) then moves from the top to the bottom of that ordered list
  2. For each sliding window, the computed average PEP is simply is the mean of all reported PEPs in that window (i.e. sum(PEPs) / window_size), and the actual average PEP is the proportion of incorrect matches in the current sliding window (the idea here is that the actual PEP of a correct match is 0 while the actual PEP of an incorrect match is 1)

If actual and computed PEPs are plotted against each other, then a diagonal line is the ideal case.

plt.figure(figsize=(6, 6))
for i, (key, d) in enumerate(data.items()):
    actual, computed = analyse.actual_vs_computed_pep(d)
    plt.plot(computed, actual, label=key)
actual, computed = analyse.actual_vs_computed_pep(percolator_joined)
plt.plot(computed, actual, label="Percolator (including decoys)", color="C1", ls=":")
plt.plot([0, 1], [0, 1], color="k", ls="--")
plt.xlabel("Computed PEP")
plt.ylabel("Actual PEP")
plt.xlim(0 - 0.05, 1 + 0.05)
plt.ylim(0 - 0.05, 1 + 0.05)
plt.title("Actual vs. Computed PEP")
plt.legend();

png