Evaluating Imputation Against Ground Truth Fold Change

%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns

from pyproteonet.simulation import molecule_set_from_degree_distribution, simulate_protein_peptide_dataset, simulate_mcars, simulate_mnars_thresholding
from pyproteonet.aggregation import maxlfq
from pyproteonet.processing import logarithmize

Simulating a Dataset

# Degree distributions of the protein and peptide nodes, roughly resembling those of a real-world dataset.
# Entry k of each list is the fraction of nodes that have degree k (degree 0 never occurs).
protein_deg_distribution = [
    0, 0.1445, 0.1221, 0.1151, 0.0933, 0.0692, 0.0655, 0.0508, 0.0472, 0.0362,
    0.0311, 0.0277, 0.0209, 0.0199, 0.0163, 0.0143, 0.012, 0.0105, 0.0093, 0.0087,
    0.0081, 0.0063, 0.0063, 0.0055, 0.0054, 0.0043, 0.0043, 0.0042, 0.0039, 0.0037,
    0.0034, 0.0031, 0.0022, 0.0021, 0.0019, 0.0019, 0.0019, 0.0015, 0.0012, 0.001,
    0.001,
]
peptide_deg_distribution = [
    0,
    0.9591,
    0.0341,
    0.0046,
    0.0014,
]

First, we create a set of proteins with related peptides. Next, we simulate abundance values for those peptides. During the simulation we also simulate a fold change for 30% of all proteins. This is done by simulating a single condition group affecting 30% of all proteins in the last five samples.

# We aim for a simulated dataset with 100 proteins and 10 samples
num_proteins = 100
num_samples = 10

# Simple heuristic to determine the number of peptides for the given number of proteins while still
# closely matching the degree distributions. Both sides of the bipartite protein-peptide graph must
# have the same total number of edges, so we grow the peptide count until the peptide side offers
# at least as many edges as the protein side.
protein_degs = np.round(num_proteins * np.array(protein_deg_distribution))
prot_edges = np.sum(np.arange(len(protein_deg_distribution)) * protein_degs)
num_peptides = 1
pep_edges = 0
while pep_edges < prot_edges:
    num_peptides += 1
    peptide_degs = np.round(num_peptides * np.array(peptide_deg_distribution))
    pep_edges = np.sum(np.arange(len(peptide_deg_distribution)) * peptide_degs)
if pep_edges > prot_edges:
    # The search can overshoot. Greedily drop peptides, highest degree first, until the surplus
    # edges are gone. Several peptides of the same degree may have to be dropped (e.g. two
    # degree-1 peptides for a surplus of 2), hence the inner `while` rather than a single `if`.
    # Degree 0 is skipped: removing such a peptide would not change the edge count.
    diff = pep_edges - prot_edges
    for i in range(len(peptide_degs) - 1, 0, -1):
        while diff >= i and peptide_degs[i] > 0:
            peptide_degs[i] -= 1
            diff -= i
        if diff == 0:
            break

# Build the protein/peptide molecule set from the per-degree node counts computed above
ms = molecule_set_from_degree_distribution(
    molecule1_name='protein',
    molecule2_name='peptide',
    mapping_name='peptide-protein',
    molecule1_degree_distribution=protein_degs,
    molecule2_degree_distribution=peptide_degs,
)

# Randomly pick 30% of the proteins; these are the ones affected by the simulated condition
condition_proteins = ms.molecules['protein'].sample(frac=0.3).index
# The condition applies to the last five samples (sample5 ... sample9)
condition_sample_names = [f'sample{s}' for s in range(5, 10)]

# Let's simulate some abundance values for the given molecule set
ds = simulate_protein_peptide_dataset(
    molecule_set=ms,
    mapping='peptide-protein',
    samples=num_samples,
    log_abundance_mu=15.9,
    log_abundance_sigma=1.8,
    log_protein_error_sigma=0.3,
    peptide_noise_sigma=115005.3,
    simulate_flyability=True,
    flyability_alpha=0.7,
    flyability_beta=2.1,
    condition_samples=[condition_sample_names],
    condition_affected=[condition_proteins],
    log2_condition_means=[2.0],
    log2_condition_stds=[0.66],
)

Finally, we incorporate some missing values (MNARs and MCARs)

# MNAR values via thresholding on the peptide abundances; the threshold mean and sigma are set to
# one half and one quarter of 115005.3 (the peptide_noise_sigma passed to the simulation).
simulate_mnars_thresholding(
    dataset=ds,
    molecule='peptide',
    column='abundance',
    result_column='abundance_missing',
    mask_column='is_mnar',
    thresh_mu=115005.3 / 2,
    thresh_sigma=115005.3 / 4,
    inplace=True,
)
# MCAR values (amount=0.3), written into the same result column with their own mask column
simulate_mcars(
    dataset=ds,
    molecule='peptide',
    column='abundance',
    result_column='abundance_missing',
    mask_column='is_mcar',
    amount=0.3,
    inplace=True,
)
<pyproteonet.data.dataset.Dataset at 0x7f98331499f0>
# We look at the fraction of peptide values flagged as MNAR and as MCAR
df = ds.values['peptide'].df
num_rows = len(df)
df.is_mnar.sum() / num_rows, df.is_mcar.sum() / num_rows
(0.025, 0.2996688741721854)

Finally, all abundance/aggregated values are logarithmized, as is common in proteomics, because logarithmized values are closer to being normally distributed.

# Log-transform the simulated, ground-truth and missing-value abundance columns
log_columns = ['abundance', 'abundance_gt', 'abundance_missing']
ds = logarithmize(data=ds, columns=log_columns)

MaxLFQ aggregation

# Aggregate the (already logarithmized) peptide abundances with missing values to protein level
protein_aggregates = maxlfq(
    dataset=ds,
    molecule='protein',
    mapping='peptide-protein',
    partner_column='abundance_missing',
    min_ratios=2,
    median_fallback=False,
    is_log=True,
)
# Store the result as the 'aggregated' value column of the proteins
ds.values['protein']['aggregated'] = protein_aggregates

Now the ‘aggregated’ value column holds the aggregated values, and the ‘abundance_gt’ value column, which was written during the simulation, holds the ground truth values.

# Display the protein value table (the 'abundance_gt' and 'aggregated' columns)
ds.values['protein'].df
abundance_gt aggregated
sample id
sample0 0 17.169365 NaN
1 20.560688 NaN
2 15.000974 NaN
3 14.404043 NaN
4 15.042509 NaN
... ... ... ...
sample9 92 16.595688 15.078889
93 18.847798 17.029333
94 19.827979 17.730384
95 13.603674 12.169696
96 17.982749 16.312612

970 rows × 2 columns

Plotting the aggregated abundance ratios we see two clusters corresponding to the proteins affected by the condition and the unaffected ones.

from pyproteonet.visualization import plot_ratio_scatter

# Label every protein; the ones sampled for the condition get the 'fold change' category
categories = pd.Series(data='no fold change', index=ds.molecules['protein'].index)
categories.loc[condition_proteins] = 'fold change'

# Ratio of the last five samples (condition applied) over the first five samples
numerator_sample_names = [f'sample{s}' for s in range(5, 10)]
denominator_sample_names = [f'sample{s}' for s in range(5)]
a, b = plot_ratio_scatter(
    dataset=ds,
    molecule='protein',
    columns=['aggregated'],
    numerator_samples=numerator_sample_names,
    denominator_samples=denominator_sample_names,
    categories=categories,
    is_log=True,
    plot_density=False,
    alpha=0.5,
)
../_images/7707b1f532153a257e638df319caca9aa03feee8dc111e43edc61feb16a093fb.png

Missing Value Imputation

Pyproteonet provides a wide range of established imputation functions, combining native Python implementations with wrappers around R packages for imputation functions where no Python implementation is available yet.

Here we use the high-level API to impute on both the protein and the peptide level using several different imputation functions.

from pyproteonet.imputation import impute_molecule

# Imputation methods applied on both the protein and the peptide level
imputation_methods = [
    "minprob",
    "mindet",
    "bpca",
    "missforest",
    "knn",
    "isvd",
    "dae",
]

# Protein level: impute the MaxLFQ-aggregated values
impute_molecule(
    dataset=ds,
    molecule='protein',
    column='aggregated',
    methods=imputation_methods,
)
# Peptide level: impute the abundances containing the simulated missing values
impute_molecule(
    dataset=ds,
    molecule='peptide',
    column='abundance_missing',
    methods=imputation_methods,
)
Hide code cell output
R[write to console]: Loading required package: tmvtnorm

R[write to console]: Loading required package: mvtnorm

R[write to console]: Loading required package: Matrix

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: gmm

R[write to console]: Loading required package: sandwich

R[write to console]: Loading required package: norm

R[write to console]: This package has some major limitations
(for example, it does not work reliably when
the number of variables exceeds 30),
and has been superseded by the norm2 package.

R[write to console]: Loading required package: pcaMethods

R[write to console]: Loading required package: Biobase

R[write to console]: Loading required package: BiocGenerics

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘pcaMethods’


R[write to console]: The following object is masked from ‘package:stats’:

    loadings


R[write to console]: Loading required package: impute

R[write to console]: In addition: 
R[write to console]: There were 11 warnings (use warnings() to see them)
R[write to console]: 
minprob minprob
[1] 0.3002219
mindet mindet
bpca bpca
missforest missforest
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
knn knn
isvd isvd
[IterativeSVD] Iter 1: observed MAE=1.029508
[IterativeSVD] Iter 2: observed MAE=0.361089
[IterativeSVD] Iter 3: observed MAE=0.277206
[IterativeSVD] Iter 4: observed MAE=0.251593
[IterativeSVD] Iter 5: observed MAE=0.239997
[IterativeSVD] Iter 6: observed MAE=0.233418
[IterativeSVD] Iter 7: observed MAE=0.228806
[IterativeSVD] Iter 8: observed MAE=0.225354
[IterativeSVD] Iter 9: observed MAE=0.222133
[IterativeSVD] Iter 10: observed MAE=0.219068
[IterativeSVD] Iter 11: observed MAE=0.216221
[IterativeSVD] Iter 12: observed MAE=0.213613
[IterativeSVD] Iter 13: observed MAE=0.211435
[IterativeSVD] Iter 14: observed MAE=0.209869
[IterativeSVD] Iter 15: observed MAE=0.208677
[IterativeSVD] Iter 16: observed MAE=0.207568
[IterativeSVD] Iter 17: observed MAE=0.206538
[IterativeSVD] Iter 18: observed MAE=0.205668
[IterativeSVD] Iter 19: observed MAE=0.204957
13.719917530273548
dae dae
epoch train_loss valid_loss time
0 771.526489 92.160431 00:00
1 772.366943 92.353256 00:00
2 769.833069 92.375992 00:00
3 768.604248 92.269379 00:00
4 758.263245 91.880707 00:00
5 747.080200 91.415390 00:00
6 728.280090 90.840988 00:00
7 712.120911 90.002388 00:00
8 691.932617 88.560638 00:00
9 668.623352 86.934418 00:00
10 645.239624 84.854103 00:00
11 617.785217 82.413429 00:00
12 591.351379 79.926804 00:00
13 565.105164 76.967102 00:00
14 540.104065 74.121017 00:00
15 514.766785 71.036179 00:00
16 491.550842 67.346024 00:00
17 468.812012 63.549061 00:00
18 449.108002 59.731007 00:00
19 430.298523 55.514488 00:00
20 411.570648 52.218212 00:00
21 395.587616 49.133953 00:00
22 380.467834 45.290352 00:00
23 366.138885 41.591652 00:00
24 352.394226 38.126877 00:00
25 340.403351 35.215355 00:00
26 329.015961 31.941929 00:00
27 318.154938 29.208054 00:00
28 307.556396 27.451818 00:00
29 297.848755 25.800251 00:00
30 288.697937 24.675629 00:00
31 280.368744 22.831230 00:00
32 271.839752 21.028332 00:00
33 263.920349 20.149044 00:00
34 256.748627 19.689053 00:00
35 250.305237 19.224886 00:00
36 243.825073 18.174835 00:00
37 237.186234 17.868359 00:00
38 231.362656 17.417484 00:00
39 226.324219 17.180439 00:00
40 220.601761 16.944290 00:00
41 215.210754 16.577560 00:00
42 210.437592 16.556936 00:00
43 205.902206 16.823841 00:00
44 201.798828 15.859558 00:00
45 197.523987 15.849012 00:00
46 193.360168 15.454900 00:00
47 189.400604 14.698169 00:00
48 185.586548 14.064270 00:00
49 182.094086 14.232521 00:00
minprob minprob
[1] 0.3743327
mindet mindet
bpca bpca
missforest missforest
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
knn knn
isvd isvd
[IterativeSVD] Iter 1: observed MAE=3.794830
[IterativeSVD] Iter 2: observed MAE=1.191989
[IterativeSVD] Iter 3: observed MAE=0.644522
[IterativeSVD] Iter 4: observed MAE=0.479546
[IterativeSVD] Iter 5: observed MAE=0.414731
[IterativeSVD] Iter 6: observed MAE=0.382314
[IterativeSVD] Iter 7: observed MAE=0.361593
[IterativeSVD] Iter 8: observed MAE=0.345830
[IterativeSVD] Iter 9: observed MAE=0.332967
[IterativeSVD] Iter 10: observed MAE=0.321903
[IterativeSVD] Iter 11: observed MAE=0.313058
[IterativeSVD] Iter 12: observed MAE=0.305612
[IterativeSVD] Iter 13: observed MAE=0.298658
[IterativeSVD] Iter 14: observed MAE=0.291231
[IterativeSVD] Iter 15: observed MAE=0.281993
[IterativeSVD] Iter 16: observed MAE=0.270293
[IterativeSVD] Iter 17: observed MAE=0.257632
[IterativeSVD] Iter 18: observed MAE=0.246249
[IterativeSVD] Iter 19: observed MAE=0.237744
[IterativeSVD] Iter 20: observed MAE=0.232672
[IterativeSVD] Iter 21: observed MAE=0.229847
[IterativeSVD] Iter 22: observed MAE=0.228363
14.489411675754466
dae dae
epoch train_loss valid_loss time
0 4583.745117 426.797913 00:00
1 4558.276367 426.060699 00:00
2 4545.520020 424.628357 00:00
3 4505.635742 422.432526 00:00
4 4463.166016 419.978271 00:00
5 4402.731445 417.171478 00:00
6 4331.463379 413.485657 00:00
7 4241.655273 409.318634 00:00
8 4149.645020 403.610596 00:00
9 4035.374268 396.451233 00:00
10 3915.376221 387.736786 00:00
11 3784.795654 377.046326 00:00
12 3641.189941 365.024231 00:00
13 3500.675293 351.603699 00:00
14 3357.156250 337.225555 00:00
15 3215.336426 320.989746 00:00
16 3081.312744 304.547363 00:00
17 2951.336426 288.108887 00:00
18 2829.898438 271.372681 00:00
19 2714.050049 253.985641 00:00
20 2605.306152 239.081329 00:00
21 2500.918945 223.682159 00:00
22 2405.522217 207.702240 00:00
23 2315.056396 191.600220 00:00
24 2231.274902 176.463211 00:00
25 2152.593506 161.810226 00:00
26 2080.827637 146.981323 00:00
27 2011.464722 135.124786 00:00
28 1945.286255 122.995033 00:00
29 1883.757935 114.385330 00:00
30 1825.023438 106.605972 00:00
31 1769.324829 99.771904 00:00
32 1716.415161 94.857063 00:00
33 1669.574707 88.853897 00:00
34 1622.764526 82.341179 00:00
35 1578.125977 76.525879 00:00
36 1534.842773 72.577148 00:00
37 1494.982544 68.411743 00:00
38 1457.871216 64.500877 00:00
39 1422.914795 62.013630 00:00
40 1388.780029 60.291847 00:00
41 1357.178955 58.403954 00:00
42 1327.356812 56.958199 00:00
43 1297.922363 56.064789 00:00
44 1269.961914 54.674122 00:00
45 1242.876831 54.273338 00:00
46 1217.293823 54.264595 00:00
47 1192.430908 53.302429 00:00
48 1169.106323 52.484703 00:00
49 1146.657837 52.133827 00:00
../_images/b5da59104bb2fc89a4e8e79e376aee584b7e146698328e26a6bef60db63bab4d.png ../_images/754016e0be8e8b853e21881e18263bc1cdfc00f2bd7e1f38eb0ace90b51da557.png ../_images/e3cd0e0878a11b83735ec9bd48e159fd1e5fc2cc1a15f3172da2ca645a5eea12.png ../_images/7a00f92e515938e5d624a6521bcd0acd91a85b261c6134e2ae1fb46b76ce5a5d.png

Looking at the result we can see that the missing values are gone:

# Display the peptide value table, which now has one column per imputation method
ds.values['peptide'].df
abundance abundance_gt abundance_missing is_mnar is_mcar minprob mindet bpca missforest knn isvd dae
sample id
sample0 0 15.651 17.169 NaN False True 11.000 10.565 16.101 16.062 16.047 16.135 15.313
1 20.386 20.561 20.386 False False 20.386 20.386 20.386 20.386 20.386 20.386 20.386
2 14.708 15.001 14.708 False False 14.708 14.708 14.708 14.708 14.708 14.708 14.708
3 14.319 14.404 14.319 False False 14.319 14.319 14.319 14.319 14.319 14.319 14.319
4 12.920 15.043 12.920 False False 12.920 12.920 12.920 12.920 12.920 12.920 12.920
... ... ... ... ... ... ... ... ... ... ... ... ... ...
sample9 599 14.434 18.261 14.434 False False 14.434 14.434 14.434 14.434 14.434 14.434 14.434
600 16.882 19.213 NaN False True 9.549 9.778 17.072 16.992 17.279 17.056 16.109
601 16.952 19.500 NaN False True 8.785 9.778 17.070 17.281 17.256 17.073 17.250
602 14.137 18.043 14.137 False False 14.137 14.137 14.137 14.137 14.137 14.137 14.137
603 17.885 18.359 17.885 False False 17.885 17.885 17.885 17.885 17.885 17.885 17.885

6040 rows × 12 columns

Graph Neural Network Imputation

from pyproteonet.imputation.dnn.gnn import impute_heterogeneous_gnn

# Impute with the heterogeneous GNN; results are written to a 'gnn_hetero' column for both the
# molecule (protein) and its partner (peptide). The return value is not needed here.
_ = impute_heterogeneous_gnn(
    dataset=ds,
    molecule='protein',
    column='aggregated',
    mapping='peptide-protein',
    partner_column='abundance_missing',
    molecule_result_column='gnn_hetero',
    partner_result_column='gnn_hetero',
    max_epochs=1000,
    early_stopping_patience=7,
    epoch_size=30,
    training_fraction=0.25,
    log_every_n_steps=30,
)
Hide code cell output
seed: 611519400
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type            | Params
------------------------------------------------------
0 | embedding         | Embedding       | 485   
1 | molecule_fc_model | Sequential      | 11.0 K
2 | partner_fc_model  | Sequential      | 11.4 K
3 | molecule_gat      | HeteroGraphConv | 34.4 K
4 | partner_gat       | HeteroGraphConv | 50.4 K
5 | molecule_gat2     | HeteroGraphConv | 66.4 K
6 | molecule_linear   | Linear          | 820   
7 | partner_linear    | Linear          | 1.2 K 
8 | loss_fn           | GaussianNLLLoss | 0     
------------------------------------------------------
176 K     Trainable params
0         Non-trainable params
176 K     Total params
0.705     Total estimated model params size (MB)
step29: num_masked_molecule:726.0 || num_masked_partner:1100.066650390625 || molecule_loss:0.4965510070323944 || partner_loss:0.4411449432373047 || train_loss:0.9376960396766663 || epoch:0 || 
step59: num_masked_molecule:726.0 || num_masked_partner:1035.0333251953125 || molecule_loss:0.3079593777656555 || partner_loss:0.026088429614901543 || train_loss:0.3340478539466858 || epoch:1 || 
step89: num_masked_molecule:726.0 || num_masked_partner:1013.2666625976562 || molecule_loss:-0.19135525822639465 || partner_loss:-0.2356344610452652 || train_loss:-0.42698976397514343 || epoch:2 || 
step119: num_masked_molecule:726.0 || num_masked_partner:1024.5999755859375 || molecule_loss:-0.4036961495876312 || partner_loss:-0.3087603449821472 || train_loss:-0.712456464767456 || epoch:3 || 
step149: num_masked_molecule:726.0 || num_masked_partner:1068.4000244140625 || molecule_loss:-0.5029523372650146 || partner_loss:-0.3543926477432251 || train_loss:-0.857344925403595 || epoch:4 || 
step179: num_masked_molecule:726.0 || num_masked_partner:1079.4666748046875 || molecule_loss:-0.49853944778442383 || partner_loss:-0.3519468307495117 || train_loss:-0.850486159324646 || epoch:5 || 
step209: num_masked_molecule:726.0 || num_masked_partner:1202.3333740234375 || molecule_loss:-0.545192301273346 || partner_loss:-0.3749948740005493 || train_loss:-0.9201869964599609 || epoch:6 || 
step239: num_masked_molecule:726.0 || num_masked_partner:1087.5999755859375 || molecule_loss:-0.47056013345718384 || partner_loss:-0.37001076340675354 || train_loss:-0.8405709862709045 || epoch:7 || 
step269: num_masked_molecule:726.0 || num_masked_partner:1064.6666259765625 || molecule_loss:-0.5025004744529724 || partner_loss:-0.39020660519599915 || train_loss:-0.8927069902420044 || epoch:8 || 
step299: num_masked_molecule:726.0 || num_masked_partner:1119.699951171875 || molecule_loss:-0.6098951697349548 || partner_loss:-0.4187478721141815 || train_loss:-1.0286431312561035 || epoch:9 || 
step329: num_masked_molecule:726.0 || num_masked_partner:1070.0999755859375 || molecule_loss:-0.6323655247688293 || partner_loss:-0.45870205760002136 || train_loss:-1.0910675525665283 || epoch:10 || 
step359: num_masked_molecule:726.0 || num_masked_partner:1022.2666625976562 || molecule_loss:-0.6693475246429443 || partner_loss:-0.46193426847457886 || train_loss:-1.1312816143035889 || epoch:11 || 
step389: num_masked_molecule:726.0 || num_masked_partner:1037.5 || molecule_loss:-0.6690173745155334 || partner_loss:-0.4616609811782837 || train_loss:-1.130678415298462 || epoch:12 || 
step419: num_masked_molecule:726.0 || num_masked_partner:1045.3333740234375 || molecule_loss:-0.730419933795929 || partner_loss:-0.507056474685669 || train_loss:-1.2374764680862427 || epoch:13 || 
step449: num_masked_molecule:726.0 || num_masked_partner:1012.7333374023438 || molecule_loss:-0.722473680973053 || partner_loss:-0.5137044787406921 || train_loss:-1.2361780405044556 || epoch:14 || 
step479: num_masked_molecule:726.0 || num_masked_partner:1028.066650390625 || molecule_loss:-0.6839271187782288 || partner_loss:-0.5060281157493591 || train_loss:-1.189955234527588 || epoch:15 || 
step509: num_masked_molecule:726.0 || num_masked_partner:1087.699951171875 || molecule_loss:-0.7172583937644958 || partner_loss:-0.5181266069412231 || train_loss:-1.2353848218917847 || epoch:16 || 
step539: num_masked_molecule:726.0 || num_masked_partner:1008.1666870117188 || molecule_loss:-0.7352653741836548 || partner_loss:-0.5446591973304749 || train_loss:-1.2799246311187744 || epoch:17 || 
step569: num_masked_molecule:726.0 || num_masked_partner:1057.0333251953125 || molecule_loss:-0.7405372262001038 || partner_loss:-0.5526030659675598 || train_loss:-1.2931402921676636 || epoch:18 || 
step599: num_masked_molecule:726.0 || num_masked_partner:1024.0 || molecule_loss:-0.7776814103126526 || partner_loss:-0.5644718408584595 || train_loss:-1.342153549194336 || epoch:19 || 
step629: num_masked_molecule:726.0 || num_masked_partner:1110.300048828125 || molecule_loss:-0.7743927836418152 || partner_loss:-0.5480443835258484 || train_loss:-1.3224371671676636 || epoch:20 || 
step659: num_masked_molecule:726.0 || num_masked_partner:1055.9000244140625 || molecule_loss:-0.8264914751052856 || partner_loss:-0.5862985849380493 || train_loss:-1.412790060043335 || epoch:21 || 
step689: num_masked_molecule:726.0 || num_masked_partner:1136.6666259765625 || molecule_loss:-0.813257098197937 || partner_loss:-0.5911661386489868 || train_loss:-1.4044231176376343 || epoch:22 || 
step719: num_masked_molecule:726.0 || num_masked_partner:1104.7667236328125 || molecule_loss:-0.8293827772140503 || partner_loss:-0.5931750535964966 || train_loss:-1.4225578308105469 || epoch:23 || 
step749: num_masked_molecule:726.0 || num_masked_partner:1021.3666381835938 || molecule_loss:-0.8359233736991882 || partner_loss:-0.6279093623161316 || train_loss:-1.4638323783874512 || epoch:24 || 
step779: num_masked_molecule:726.0 || num_masked_partner:1085.13330078125 || molecule_loss:-0.8605287075042725 || partner_loss:-0.6250839829444885 || train_loss:-1.4856127500534058 || epoch:25 || 
step809: num_masked_molecule:726.0 || num_masked_partner:979.066650390625 || molecule_loss:-0.8720875382423401 || partner_loss:-0.6600542068481445 || train_loss:-1.5321418046951294 || epoch:26 || 
step839: num_masked_molecule:726.0 || num_masked_partner:1009.8333129882812 || molecule_loss:-0.8814008235931396 || partner_loss:-0.665562093257904 || train_loss:-1.5469627380371094 || epoch:27 || 
step869: num_masked_molecule:726.0 || num_masked_partner:1070.1666259765625 || molecule_loss:-0.8766546845436096 || partner_loss:-0.6474056839942932 || train_loss:-1.5240601301193237 || epoch:28 || 
step899: num_masked_molecule:726.0 || num_masked_partner:1109.86669921875 || molecule_loss:-0.8748965859413147 || partner_loss:-0.6583003997802734 || train_loss:-1.5331969261169434 || epoch:29 || 
step929: num_masked_molecule:726.0 || num_masked_partner:991.2666625976562 || molecule_loss:-0.9041153788566589 || partner_loss:-0.6923929452896118 || train_loss:-1.596508264541626 || epoch:30 || 
step959: num_masked_molecule:726.0 || num_masked_partner:1139.7667236328125 || molecule_loss:-0.8907600045204163 || partner_loss:-0.6543554067611694 || train_loss:-1.5451151132583618 || epoch:31 || 
step989: num_masked_molecule:726.0 || num_masked_partner:1088.4000244140625 || molecule_loss:-0.9054028391838074 || partner_loss:-0.6714745163917542 || train_loss:-1.5768771171569824 || epoch:32 || 
step1019: num_masked_molecule:726.0 || num_masked_partner:1049.8333740234375 || molecule_loss:-0.9136359691619873 || partner_loss:-0.7057571411132812 || train_loss:-1.619393229484558 || epoch:33 || 
step1049: num_masked_molecule:726.0 || num_masked_partner:1021.433349609375 || molecule_loss:-0.9122346639633179 || partner_loss:-0.7000260949134827 || train_loss:-1.6122606992721558 || epoch:34 || 
step1079: num_masked_molecule:726.0 || num_masked_partner:1064.2667236328125 || molecule_loss:-0.9102837443351746 || partner_loss:-0.7140386700630188 || train_loss:-1.6243220567703247 || epoch:35 || 
step1109: num_masked_molecule:726.0 || num_masked_partner:1119.5 || molecule_loss:-0.9347782135009766 || partner_loss:-0.7171137928962708 || train_loss:-1.6518919467926025 || epoch:36 || 
step1139: num_masked_molecule:726.0 || num_masked_partner:982.2000122070312 || molecule_loss:-0.9539870619773865 || partner_loss:-0.7451453804969788 || train_loss:-1.6991325616836548 || epoch:37 || 
step1169: num_masked_molecule:726.0 || num_masked_partner:1010.9000244140625 || molecule_loss:-0.934962272644043 || partner_loss:-0.7454560995101929 || train_loss:-1.6804183721542358 || epoch:38 || 
step1199: num_masked_molecule:726.0 || num_masked_partner:1055.0 || molecule_loss:-0.9453296065330505 || partner_loss:-0.7514131665229797 || train_loss:-1.6967426538467407 || epoch:39 || 
step1229: num_masked_molecule:726.0 || num_masked_partner:995.5333251953125 || molecule_loss:-0.9665142893791199 || partner_loss:-0.764931857585907 || train_loss:-1.731446385383606 || epoch:40 || 
step1259: num_masked_molecule:726.0 || num_masked_partner:1039.6666259765625 || molecule_loss:-0.9687608480453491 || partner_loss:-0.7705090641975403 || train_loss:-1.7392699718475342 || epoch:41 || 
step1289: num_masked_molecule:726.0 || num_masked_partner:1086.5333251953125 || molecule_loss:-0.9889089465141296 || partner_loss:-0.7700208425521851 || train_loss:-1.7589298486709595 || epoch:42 || 
step1319: num_masked_molecule:726.0 || num_masked_partner:1106.699951171875 || molecule_loss:-0.9803321361541748 || partner_loss:-0.7794199585914612 || train_loss:-1.7597521543502808 || epoch:43 || 
step1349: num_masked_molecule:726.0 || num_masked_partner:1009.9666748046875 || molecule_loss:-0.9949647784233093 || partner_loss:-0.785544216632843 || train_loss:-1.7805092334747314 || epoch:44 || 
step1379: num_masked_molecule:726.0 || num_masked_partner:1043.4000244140625 || molecule_loss:-0.9993769526481628 || partner_loss:-0.803527295589447 || train_loss:-1.8029043674468994 || epoch:45 || 
step1409: num_masked_molecule:726.0 || num_masked_partner:1140.199951171875 || molecule_loss:-1.0143009424209595 || partner_loss:-0.792439341545105 || train_loss:-1.8067405223846436 || epoch:46 || 
step1439: num_masked_molecule:726.0 || num_masked_partner:985.9000244140625 || molecule_loss:-1.0157802104949951 || partner_loss:-0.801406741142273 || train_loss:-1.817186713218689 || epoch:47 || 
step1469: num_masked_molecule:726.0 || num_masked_partner:983.4000244140625 || molecule_loss:-1.0334373712539673 || partner_loss:-0.8253453373908997 || train_loss:-1.8587825298309326 || epoch:48 || 
step1499: num_masked_molecule:726.0 || num_masked_partner:1051.7332763671875 || molecule_loss:-1.0181365013122559 || partner_loss:-0.8146819472312927 || train_loss:-1.8328182697296143 || epoch:49 || 
step1529: num_masked_molecule:726.0 || num_masked_partner:1076.566650390625 || molecule_loss:-1.0346359014511108 || partner_loss:-0.81679767370224 || train_loss:-1.851433515548706 || epoch:50 || 
step1559: num_masked_molecule:726.0 || num_masked_partner:1174.5999755859375 || molecule_loss:-1.0566203594207764 || partner_loss:-0.8248631954193115 || train_loss:-1.8814834356307983 || epoch:51 || 
step1589: num_masked_molecule:726.0 || num_masked_partner:1072.7667236328125 || molecule_loss:-1.037369728088379 || partner_loss:-0.8392760753631592 || train_loss:-1.876645565032959 || epoch:52 || 
step1619: num_masked_molecule:726.0 || num_masked_partner:1030.8333740234375 || molecule_loss:-1.0543652772903442 || partner_loss:-0.8409631252288818 || train_loss:-1.8953280448913574 || epoch:53 || 
step1649: num_masked_molecule:726.0 || num_masked_partner:1045.066650390625 || molecule_loss:-1.0495632886886597 || partner_loss:-0.8441323041915894 || train_loss:-1.8936954736709595 || epoch:54 || 
step1679: num_masked_molecule:726.0 || num_masked_partner:1146.2332763671875 || molecule_loss:-1.0817667245864868 || partner_loss:-0.8280280828475952 || train_loss:-1.9097949266433716 || epoch:55 || 
step1709: num_masked_molecule:726.0 || num_masked_partner:1037.800048828125 || molecule_loss:-1.0899382829666138 || partner_loss:-0.8692459464073181 || train_loss:-1.9591840505599976 || epoch:56 || 
step1739: num_masked_molecule:726.0 || num_masked_partner:1010.6666870117188 || molecule_loss:-1.0975162982940674 || partner_loss:-0.8926358819007874 || train_loss:-1.99015212059021 || epoch:57 || 
step1769: num_masked_molecule:726.0 || num_masked_partner:1148.0 || molecule_loss:-1.0803073644638062 || partner_loss:-0.870618462562561 || train_loss:-1.950925588607788 || epoch:58 || 
step1799: num_masked_molecule:726.0 || num_masked_partner:1000.7666625976562 || molecule_loss:-1.0922099351882935 || partner_loss:-0.8835612535476685 || train_loss:-1.975771188735962 || epoch:59 || 
step1829: num_masked_molecule:726.0 || num_masked_partner:973.3333129882812 || molecule_loss:-1.1173169612884521 || partner_loss:-0.9026615023612976 || train_loss:-2.0199780464172363 || epoch:60 || 
step1859: num_masked_molecule:726.0 || num_masked_partner:1018.1666870117188 || molecule_loss:-1.1046700477600098 || partner_loss:-0.8955172896385193 || train_loss:-2.000187635421753 || epoch:61 || 
step1889: num_masked_molecule:726.0 || num_masked_partner:924.1333618164062 || molecule_loss:-1.1399754285812378 || partner_loss:-0.9060159921646118 || train_loss:-2.0459911823272705 || epoch:62 || 
step1919: num_masked_molecule:726.0 || num_masked_partner:1186.8333740234375 || molecule_loss:-1.1187496185302734 || partner_loss:-0.8819963932037354 || train_loss:-2.000746011734009 || epoch:63 || 
step1949: num_masked_molecule:726.0 || num_masked_partner:1036.199951171875 || molecule_loss:-1.1434788703918457 || partner_loss:-0.9053875207901001 || train_loss:-2.0488662719726562 || epoch:64 || 
step1979: num_masked_molecule:726.0 || num_masked_partner:1140.63330078125 || molecule_loss:-1.1350418329238892 || partner_loss:-0.8943266272544861 || train_loss:-2.0293684005737305 || epoch:65 || 
step2009: num_masked_molecule:726.0 || num_masked_partner:1060.9666748046875 || molecule_loss:-1.168619155883789 || partner_loss:-0.9246275424957275 || train_loss:-2.0932469367980957 || epoch:66 || 
step2039: num_masked_molecule:726.0 || num_masked_partner:1102.933349609375 || molecule_loss:-1.1653698682785034 || partner_loss:-0.9179772734642029 || train_loss:-2.0833470821380615 || epoch:67 || 
step2069: num_masked_molecule:726.0 || num_masked_partner:1100.5999755859375 || molecule_loss:-1.1653379201889038 || partner_loss:-0.9129053950309753 || train_loss:-2.0782434940338135 || epoch:68 || 
step2099: num_masked_molecule:726.0 || num_masked_partner:1095.86669921875 || molecule_loss:-1.1672157049179077 || partner_loss:-0.9130270481109619 || train_loss:-2.08024263381958 || epoch:69 || 
step2129: num_masked_molecule:726.0 || num_masked_partner:1028.566650390625 || molecule_loss:-1.1828758716583252 || partner_loss:-0.9405314326286316 || train_loss:-2.1234071254730225 || epoch:70 || 
step2159: num_masked_molecule:726.0 || num_masked_partner:1104.9000244140625 || molecule_loss:-1.1876522302627563 || partner_loss:-0.9315553307533264 || train_loss:-2.1192076206207275 || epoch:71 || 
step2189: num_masked_molecule:726.0 || num_masked_partner:1019.7666625976562 || molecule_loss:-1.205991268157959 || partner_loss:-0.9466720819473267 || train_loss:-2.152663230895996 || epoch:72 || 
step2219: num_masked_molecule:726.0 || num_masked_partner:1076.6666259765625 || molecule_loss:-1.1927111148834229 || partner_loss:-0.9489060044288635 || train_loss:-2.1416170597076416 || epoch:73 || 
step2249: num_masked_molecule:726.0 || num_masked_partner:1069.1666259765625 || molecule_loss:-1.1742908954620361 || partner_loss:-0.9380027055740356 || train_loss:-2.112293243408203 || epoch:74 || 
step2279: num_masked_molecule:726.0 || num_masked_partner:1182.13330078125 || molecule_loss:-1.1886842250823975 || partner_loss:-0.9302987456321716 || train_loss:-2.1189827919006348 || epoch:75 || 
step2309: num_masked_molecule:726.0 || num_masked_partner:1056.8333740234375 || molecule_loss:-1.212022304534912 || partner_loss:-0.9502575993537903 || train_loss:-2.1622800827026367 || epoch:76 || 
step2339: num_masked_molecule:726.0 || num_masked_partner:1012.6333618164062 || molecule_loss:-1.2337737083435059 || partner_loss:-0.9571922421455383 || train_loss:-2.1909658908843994 || epoch:77 || 
step2369: num_masked_molecule:726.0 || num_masked_partner:1005.7999877929688 || molecule_loss:-1.2319817543029785 || partner_loss:-0.9658077359199524 || train_loss:-2.1977896690368652 || epoch:78 || 
step2399: num_masked_molecule:726.0 || num_masked_partner:1137.7667236328125 || molecule_loss:-1.2354296445846558 || partner_loss:-0.9447720050811768 || train_loss:-2.180201768875122 || epoch:79 || 
step2429: num_masked_molecule:726.0 || num_masked_partner:997.566650390625 || molecule_loss:-1.2405979633331299 || partner_loss:-0.9732113480567932 || train_loss:-2.2138092517852783 || epoch:80 || 
step2459: num_masked_molecule:726.0 || num_masked_partner:966.8666381835938 || molecule_loss:-1.2557622194290161 || partner_loss:-0.9808309078216553 || train_loss:-2.236593008041382 || epoch:81 || 
step2489: num_masked_molecule:726.0 || num_masked_partner:1010.0333251953125 || molecule_loss:-1.2544045448303223 || partner_loss:-0.9827736616134644 || train_loss:-2.237178325653076 || epoch:82 || 
step2519: num_masked_molecule:726.0 || num_masked_partner:1035.066650390625 || molecule_loss:-1.2518256902694702 || partner_loss:-0.9906922578811646 || train_loss:-2.242518186569214 || epoch:83 || 
step2549: num_masked_molecule:726.0 || num_masked_partner:1079.433349609375 || molecule_loss:-1.2596325874328613 || partner_loss:-0.9961376786231995 || train_loss:-2.255770206451416 || epoch:84 || 
step2579: num_masked_molecule:726.0 || num_masked_partner:957.5999755859375 || molecule_loss:-1.2579421997070312 || partner_loss:-0.9910205006599426 || train_loss:-2.248962879180908 || epoch:85 || 
step2609: num_masked_molecule:726.0 || num_masked_partner:1064.4000244140625 || molecule_loss:-1.270037293434143 || partner_loss:-0.9873427748680115 || train_loss:-2.2573800086975098 || epoch:86 || 
step2639: num_masked_molecule:726.0 || num_masked_partner:989.9000244140625 || molecule_loss:-1.2828160524368286 || partner_loss:-1.0118578672409058 || train_loss:-2.2946736812591553 || epoch:87 || 
step2669: num_masked_molecule:726.0 || num_masked_partner:1089.699951171875 || molecule_loss:-1.272154450416565 || partner_loss:-0.994814932346344 || train_loss:-2.2669694423675537 || epoch:88 || 
step2699: num_masked_molecule:726.0 || num_masked_partner:1169.36669921875 || molecule_loss:-1.2712759971618652 || partner_loss:-0.9949489235877991 || train_loss:-2.2662246227264404 || epoch:89 || 
step2729: num_masked_molecule:726.0 || num_masked_partner:1052.566650390625 || molecule_loss:-1.2798277139663696 || partner_loss:-0.9988877177238464 || train_loss:-2.2787156105041504 || epoch:90 || 
step2759: num_masked_molecule:726.0 || num_masked_partner:1027.199951171875 || molecule_loss:-1.2892494201660156 || partner_loss:-1.0110894441604614 || train_loss:-2.3003387451171875 || epoch:91 || 
step2789: num_masked_molecule:726.0 || num_masked_partner:985.8666381835938 || molecule_loss:-1.302141547203064 || partner_loss:-1.014817714691162 || train_loss:-2.3169591426849365 || epoch:92 || 
step2819: num_masked_molecule:726.0 || num_masked_partner:1065.2332763671875 || molecule_loss:-1.2997256517410278 || partner_loss:-1.020464301109314 || train_loss:-2.320189952850342 || epoch:93 || 
step2849: num_masked_molecule:726.0 || num_masked_partner:1074.5999755859375 || molecule_loss:-1.2961069345474243 || partner_loss:-1.0081396102905273 || train_loss:-2.304246425628662 || epoch:94 || 
step2879: num_masked_molecule:726.0 || num_masked_partner:1074.199951171875 || molecule_loss:-1.3059227466583252 || partner_loss:-1.0131717920303345 || train_loss:-2.3190948963165283 || epoch:95 || 
step2909: num_masked_molecule:726.0 || num_masked_partner:1047.4000244140625 || molecule_loss:-1.304827094078064 || partner_loss:-1.0202995538711548 || train_loss:-2.325126886367798 || epoch:96 || 
step2939: num_masked_molecule:726.0 || num_masked_partner:1090.0333251953125 || molecule_loss:-1.300012469291687 || partner_loss:-1.000986933708191 || train_loss:-2.300999402999878 || epoch:97 || 
step2969: num_masked_molecule:726.0 || num_masked_partner:1017.7666625976562 || molecule_loss:-1.3212788105010986 || partner_loss:-1.035373568534851 || train_loss:-2.35665225982666 || epoch:98 || 
step2999: num_masked_molecule:726.0 || num_masked_partner:993.7333374023438 || molecule_loss:-1.3178870677947998 || partner_loss:-1.036424994468689 || train_loss:-2.3543121814727783 || epoch:99 || 
step3029: num_masked_molecule:726.0 || num_masked_partner:998.5333251953125 || molecule_loss:-1.322153091430664 || partner_loss:-1.0287280082702637 || train_loss:-2.3508810997009277 || epoch:100 || 
step3059: num_masked_molecule:726.0 || num_masked_partner:1034.0333251953125 || molecule_loss:-1.2971094846725464 || partner_loss:-1.031297206878662 || train_loss:-2.328406810760498 || epoch:101 || 
step3089: num_masked_molecule:726.0 || num_masked_partner:1011.7999877929688 || molecule_loss:-1.333530068397522 || partner_loss:-1.0356030464172363 || train_loss:-2.3691327571868896 || epoch:102 || 
step3119: num_masked_molecule:726.0 || num_masked_partner:1107.9000244140625 || molecule_loss:-1.3338249921798706 || partner_loss:-1.021238088607788 || train_loss:-2.3550631999969482 || epoch:103 || 
step3149: num_masked_molecule:726.0 || num_masked_partner:971.3666381835938 || molecule_loss:-1.3429099321365356 || partner_loss:-1.0487263202667236 || train_loss:-2.391636371612549 || epoch:104 || 
step3179: num_masked_molecule:726.0 || num_masked_partner:1050.1666259765625 || molecule_loss:-1.342189073562622 || partner_loss:-1.0453040599822998 || train_loss:-2.387493371963501 || epoch:105 || 
step3209: num_masked_molecule:726.0 || num_masked_partner:1058.0 || molecule_loss:-1.342303991317749 || partner_loss:-1.041756510734558 || train_loss:-2.3840606212615967 || epoch:106 || 
step3239: num_masked_molecule:726.0 || num_masked_partner:986.0999755859375 || molecule_loss:-1.356974482536316 || partner_loss:-1.0646052360534668 || train_loss:-2.4215800762176514 || epoch:107 || 
step3269: num_masked_molecule:726.0 || num_masked_partner:1027.1666259765625 || molecule_loss:-1.3518201112747192 || partner_loss:-1.0424758195877075 || train_loss:-2.3942959308624268 || epoch:108 || 
step3299: num_masked_molecule:726.0 || num_masked_partner:974.4666748046875 || molecule_loss:-1.3549456596374512 || partner_loss:-1.052807331085205 || train_loss:-2.4077529907226562 || epoch:109 || 
step3329: num_masked_molecule:726.0 || num_masked_partner:1101.9000244140625 || molecule_loss:-1.3620893955230713 || partner_loss:-1.0447863340377808 || train_loss:-2.4068758487701416 || epoch:110 || 
step3359: num_masked_molecule:726.0 || num_masked_partner:1048.9000244140625 || molecule_loss:-1.3539717197418213 || partner_loss:-1.0623372793197632 || train_loss:-2.416309356689453 || epoch:111 || 
step3389: num_masked_molecule:726.0 || num_masked_partner:1030.3333740234375 || molecule_loss:-1.3606815338134766 || partner_loss:-1.0528395175933838 || train_loss:-2.4135208129882812 || epoch:112 || 
step3419: num_masked_molecule:726.0 || num_masked_partner:955.6666870117188 || molecule_loss:-1.372918963432312 || partner_loss:-1.0771234035491943 || train_loss:-2.450042486190796 || epoch:113 || 
step3449: num_masked_molecule:726.0 || num_masked_partner:973.8333129882812 || molecule_loss:-1.3832192420959473 || partner_loss:-1.0634123086929321 || train_loss:-2.44663143157959 || epoch:114 || 
step3479: num_masked_molecule:726.0 || num_masked_partner:1036.300048828125 || molecule_loss:-1.3683058023452759 || partner_loss:-1.0633835792541504 || train_loss:-2.4316890239715576 || epoch:115 || 
step3509: num_masked_molecule:726.0 || num_masked_partner:1010.2000122070312 || molecule_loss:-1.3830536603927612 || partner_loss:-1.068954348564148 || train_loss:-2.452008008956909 || epoch:116 || 
step3539: num_masked_molecule:726.0 || num_masked_partner:1005.7000122070312 || molecule_loss:-1.384613275527954 || partner_loss:-1.074123740196228 || train_loss:-2.4587368965148926 || epoch:117 || 
step3569: num_masked_molecule:726.0 || num_masked_partner:1051.0333251953125 || molecule_loss:-1.3920537233352661 || partner_loss:-1.0654757022857666 || train_loss:-2.4575295448303223 || epoch:118 || 
step3599: num_masked_molecule:726.0 || num_masked_partner:1147.933349609375 || molecule_loss:-1.3833810091018677 || partner_loss:-1.0549660921096802 || train_loss:-2.438347101211548 || epoch:119 || 
step3629: num_masked_molecule:726.0 || num_masked_partner:1077.5999755859375 || molecule_loss:-1.3820745944976807 || partner_loss:-1.0627564191818237 || train_loss:-2.444831371307373 || epoch:120 || 
step3659: num_masked_molecule:726.0 || num_masked_partner:1073.9000244140625 || molecule_loss:-1.3942341804504395 || partner_loss:-1.0603405237197876 || train_loss:-2.4545748233795166 || epoch:121 || 
step3689: num_masked_molecule:726.0 || num_masked_partner:1109.6666259765625 || molecule_loss:-1.3899521827697754 || partner_loss:-1.0499317646026611 || train_loss:-2.4398839473724365 || epoch:122 || 
step3719: num_masked_molecule:726.0 || num_masked_partner:1120.9666748046875 || molecule_loss:-1.4119142293930054 || partner_loss:-1.0580874681472778 || train_loss:-2.470001697540283 || epoch:123 || 
step3749: num_masked_molecule:726.0 || num_masked_partner:1094.5 || molecule_loss:-1.3814094066619873 || partner_loss:-1.0658226013183594 || train_loss:-2.4472317695617676 || epoch:124 || 
step3779: num_masked_molecule:726.0 || num_masked_partner:1041.0999755859375 || molecule_loss:-1.4080185890197754 || partner_loss:-1.0778658390045166 || train_loss:-2.485884428024292 || epoch:125 || 
step3809: num_masked_molecule:726.0 || num_masked_partner:940.6333618164062 || molecule_loss:-1.4040311574935913 || partner_loss:-1.1004084348678589 || train_loss:-2.50443959236145 || epoch:126 || 
step3839: num_masked_molecule:726.0 || num_masked_partner:1086.5333251953125 || molecule_loss:-1.416605830192566 || partner_loss:-1.0759029388427734 || train_loss:-2.49250864982605 || epoch:127 || 
step3869: num_masked_molecule:726.0 || num_masked_partner:1023.7000122070312 || molecule_loss:-1.4320847988128662 || partner_loss:-1.096185564994812 || train_loss:-2.5282704830169678 || epoch:128 || 
step3899: num_masked_molecule:726.0 || num_masked_partner:1053.63330078125 || molecule_loss:-1.415101408958435 || partner_loss:-1.0774004459381104 || train_loss:-2.492501735687256 || epoch:129 || 
step3929: num_masked_molecule:726.0 || num_masked_partner:1099.2332763671875 || molecule_loss:-1.4301247596740723 || partner_loss:-1.083634853363037 || train_loss:-2.5137593746185303 || epoch:130 || 
step3959: num_masked_molecule:726.0 || num_masked_partner:956.0 || molecule_loss:-1.4191194772720337 || partner_loss:-1.0942126512527466 || train_loss:-2.513331890106201 || epoch:131 || 
step3989: num_masked_molecule:726.0 || num_masked_partner:1024.199951171875 || molecule_loss:-1.4258414506912231 || partner_loss:-1.0949925184249878 || train_loss:-2.520833730697632 || epoch:132 || 
step4019: num_masked_molecule:726.0 || num_masked_partner:1072.4000244140625 || molecule_loss:-1.4500161409378052 || partner_loss:-1.0935717821121216 || train_loss:-2.5435876846313477 || epoch:133 || 
step4049: num_masked_molecule:726.0 || num_masked_partner:1095.6666259765625 || molecule_loss:-1.4258198738098145 || partner_loss:-1.0872559547424316 || train_loss:-2.513076066970825 || epoch:134 || 
step4079: num_masked_molecule:726.0 || num_masked_partner:1058.933349609375 || molecule_loss:-1.4403308629989624 || partner_loss:-1.090928554534912 || train_loss:-2.531259298324585 || epoch:135 || 
step4109: num_masked_molecule:726.0 || num_masked_partner:1058.566650390625 || molecule_loss:-1.4443422555923462 || partner_loss:-1.0939170122146606 || train_loss:-2.5382590293884277 || epoch:136 || 
step4139: num_masked_molecule:726.0 || num_masked_partner:1153.9666748046875 || molecule_loss:-1.4402320384979248 || partner_loss:-1.078032374382019 || train_loss:-2.5182647705078125 || epoch:137 || 
step4169: num_masked_molecule:726.0 || num_masked_partner:907.1333618164062 || molecule_loss:-1.447055697441101 || partner_loss:-1.1156059503555298 || train_loss:-2.5626611709594727 || epoch:138 || 
step4199: num_masked_molecule:726.0 || num_masked_partner:962.4666748046875 || molecule_loss:-1.453112006187439 || partner_loss:-1.1123119592666626 || train_loss:-2.5654242038726807 || epoch:139 || 
step4229: num_masked_molecule:726.0 || num_masked_partner:1076.199951171875 || molecule_loss:-1.4583872556686401 || partner_loss:-1.1005842685699463 || train_loss:-2.558971405029297 || epoch:140 || 
step4259: num_masked_molecule:726.0 || num_masked_partner:1117.7667236328125 || molecule_loss:-1.4408568143844604 || partner_loss:-1.090828537940979 || train_loss:-2.5316853523254395 || epoch:141 || 
step4289: num_masked_molecule:726.0 || num_masked_partner:1103.5 || molecule_loss:-1.4667301177978516 || partner_loss:-1.1032249927520752 || train_loss:-2.5699543952941895 || epoch:142 || 
step4319: num_masked_molecule:726.0 || num_masked_partner:1088.4000244140625 || molecule_loss:-1.4581748247146606 || partner_loss:-1.0999674797058105 || train_loss:-2.5581421852111816 || epoch:143 || 
step4349: num_masked_molecule:726.0 || num_masked_partner:1059.433349609375 || molecule_loss:-1.45566725730896 || partner_loss:-1.1075527667999268 || train_loss:-2.5632200241088867 || epoch:144 || 
step4379: num_masked_molecule:726.0 || num_masked_partner:1172.433349609375 || molecule_loss:-1.472609043121338 || partner_loss:-1.092396855354309 || train_loss:-2.5650057792663574 || epoch:145 || 
step4409: num_masked_molecule:726.0 || num_masked_partner:985.566650390625 || molecule_loss:-1.4781265258789062 || partner_loss:-1.1260569095611572 || train_loss:-2.6041834354400635 || epoch:146 || 
step4439: num_masked_molecule:726.0 || num_masked_partner:1115.8333740234375 || molecule_loss:-1.4825230836868286 || partner_loss:-1.1054446697235107 || train_loss:-2.58796763420105 || epoch:147 || 
step4469: num_masked_molecule:726.0 || num_masked_partner:974.4000244140625 || molecule_loss:-1.4903604984283447 || partner_loss:-1.1227779388427734 || train_loss:-2.61313796043396 || epoch:148 || 
step4499: num_masked_molecule:726.0 || num_masked_partner:1098.2332763671875 || molecule_loss:-1.4861323833465576 || partner_loss:-1.0995885133743286 || train_loss:-2.5857207775115967 || epoch:149 || 
step4529: num_masked_molecule:726.0 || num_masked_partner:1024.800048828125 || molecule_loss:-1.4900097846984863 || partner_loss:-1.1176047325134277 || train_loss:-2.6076149940490723 || epoch:150 || 
step4559: num_masked_molecule:726.0 || num_masked_partner:981.7000122070312 || molecule_loss:-1.4896836280822754 || partner_loss:-1.1290066242218018 || train_loss:-2.6186907291412354 || epoch:151 || 
step4589: num_masked_molecule:726.0 || num_masked_partner:1080.0999755859375 || molecule_loss:-1.4915770292282104 || partner_loss:-1.1146584749221802 || train_loss:-2.6062352657318115 || epoch:152 || 
step4619: num_masked_molecule:726.0 || num_masked_partner:1103.800048828125 || molecule_loss:-1.4884955883026123 || partner_loss:-1.1100897789001465 || train_loss:-2.598585605621338 || epoch:153 || 
step4649: num_masked_molecule:726.0 || num_masked_partner:986.1333618164062 || molecule_loss:-1.5072274208068848 || partner_loss:-1.1295430660247803 || train_loss:-2.6367697715759277 || epoch:154 || 
step4679: num_masked_molecule:726.0 || num_masked_partner:1008.2000122070312 || molecule_loss:-1.5132561922073364 || partner_loss:-1.1203256845474243 || train_loss:-2.633582592010498 || epoch:155 || 
step4709: num_masked_molecule:726.0 || num_masked_partner:1011.9000244140625 || molecule_loss:-1.484210729598999 || partner_loss:-1.1299494504928589 || train_loss:-2.6141598224639893 || epoch:156 || 
step4739: num_masked_molecule:726.0 || num_masked_partner:1091.0999755859375 || molecule_loss:-1.4978324174880981 || partner_loss:-1.112225890159607 || train_loss:-2.610058307647705 || epoch:157 || 
step4769: num_masked_molecule:726.0 || num_masked_partner:1185.066650390625 || molecule_loss:-1.504968285560608 || partner_loss:-1.1101605892181396 || train_loss:-2.615128517150879 || epoch:158 || 
step4799: num_masked_molecule:726.0 || num_masked_partner:1017.4000244140625 || molecule_loss:-1.5109955072402954 || partner_loss:-1.1441339254379272 || train_loss:-2.6551289558410645 || epoch:159 || 
step4829: num_masked_molecule:726.0 || num_masked_partner:998.6333618164062 || molecule_loss:-1.5080370903015137 || partner_loss:-1.1475878953933716 || train_loss:-2.6556246280670166 || epoch:160 || 
step4859: num_masked_molecule:726.0 || num_masked_partner:1173.4000244140625 || molecule_loss:-1.5082473754882812 || partner_loss:-1.1150858402252197 || train_loss:-2.623332977294922 || epoch:161 || 
step4889: num_masked_molecule:726.0 || num_masked_partner:1062.2332763671875 || molecule_loss:-1.5195255279541016 || partner_loss:-1.1344455480575562 || train_loss:-2.65397047996521 || epoch:162 || 
step4919: num_masked_molecule:726.0 || num_masked_partner:998.8333129882812 || molecule_loss:-1.548446536064148 || partner_loss:-1.1556682586669922 || train_loss:-2.7041144371032715 || epoch:163 || 
step4949: num_masked_molecule:726.0 || num_masked_partner:970.2999877929688 || molecule_loss:-1.5229873657226562 || partner_loss:-1.137349247932434 || train_loss:-2.6603362560272217 || epoch:164 || 
step4979: num_masked_molecule:726.0 || num_masked_partner:1113.433349609375 || molecule_loss:-1.5138256549835205 || partner_loss:-1.1198792457580566 || train_loss:-2.633704423904419 || epoch:165 || 
step5009: num_masked_molecule:726.0 || num_masked_partner:996.1333618164062 || molecule_loss:-1.532779335975647 || partner_loss:-1.149556279182434 || train_loss:-2.682335376739502 || epoch:166 || 
step5039: num_masked_molecule:726.0 || num_masked_partner:1010.566650390625 || molecule_loss:-1.5579205751419067 || partner_loss:-1.1566312313079834 || train_loss:-2.714552640914917 || epoch:167 || 
step5069: num_masked_molecule:726.0 || num_masked_partner:1057.699951171875 || molecule_loss:-1.54557204246521 || partner_loss:-1.155557632446289 || train_loss:-2.701129913330078 || epoch:168 || 
step5099: num_masked_molecule:726.0 || num_masked_partner:1003.9666748046875 || molecule_loss:-1.5569409132003784 || partner_loss:-1.1513677835464478 || train_loss:-2.7083094120025635 || epoch:169 || 
step5129: num_masked_molecule:726.0 || num_masked_partner:959.433349609375 || molecule_loss:-1.5585733652114868 || partner_loss:-1.1556090116500854 || train_loss:-2.7141823768615723 || epoch:170 || 
step5159: num_masked_molecule:726.0 || num_masked_partner:1170.566650390625 || molecule_loss:-1.540384292602539 || partner_loss:-1.125343680381775 || train_loss:-2.6657278537750244 || epoch:171 || 
step5189: num_masked_molecule:726.0 || num_masked_partner:905.566650390625 || molecule_loss:-1.5687005519866943 || partner_loss:-1.1740463972091675 || train_loss:-2.7427470684051514 || epoch:172 || 
step5219: num_masked_molecule:726.0 || num_masked_partner:1127.36669921875 || molecule_loss:-1.548170566558838 || partner_loss:-1.1296813488006592 || train_loss:-2.677851915359497 || epoch:173 || 
step5249: num_masked_molecule:726.0 || num_masked_partner:1121.933349609375 || molecule_loss:-1.5502243041992188 || partner_loss:-1.1427909135818481 || train_loss:-2.6930153369903564 || epoch:174 || 
step5279: num_masked_molecule:726.0 || num_masked_partner:1025.5999755859375 || molecule_loss:-1.549673080444336 || partner_loss:-1.1613256931304932 || train_loss:-2.710999011993408 || epoch:175 || 
step5309: num_masked_molecule:726.0 || num_masked_partner:998.2666625976562 || molecule_loss:-1.5565258264541626 || partner_loss:-1.166730523109436 || train_loss:-2.7232563495635986 || epoch:176 || 
step5339: num_masked_molecule:726.0 || num_masked_partner:1107.199951171875 || molecule_loss:-1.5741074085235596 || partner_loss:-1.1469115018844604 || train_loss:-2.7210190296173096 || epoch:177 || 
step5369: num_masked_molecule:726.0 || num_masked_partner:1165.800048828125 || molecule_loss:-1.5624641180038452 || partner_loss:-1.1312538385391235 || train_loss:-2.6937179565429688 || epoch:178 || 
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
step5399: num_masked_molecule:726.0 || num_masked_partner:1068.7332763671875 || molecule_loss:-1.551928162574768 || partner_loss:-1.159980297088623 || train_loss:-2.7119085788726807 || epoch:179 || 

Evaluation

First, we evaluate at the ratio level. Such an evaluation can also be used for real-world spike-in datasets, where the ground truth abundance is not known but the ground truth ratio between condition groups is known. Using the calculate_ratio_absolute_error function, we calculate ratios from all possible sample pairs between the high and low samples (numerator_samples and denominator_samples) and compare those ratios against the ground truth.

import math
from pyproteonet.metrics import calculate_ratio_absolute_error
from matplotlib import pyplot as plt

# Samples 5-9 form the numerator group and samples 0-4 the denominator group
# of every ratio evaluated below.
numerator_samples = [f'sample{s}' for s in range(5, 10)]
denominator_samples = [f'sample{s}' for s in range(5)]

# Exponentiate the ground-truth protein abundances (they are handled as log
# values, cf. is_log=True below; presumably natural log -- confirm against the
# simulation cell), then form the ratio of the per-group means for each protein.
ground_truth_abundance = np.exp(ds.get_wf('protein', 'abundance_gt'))
ground_truth_ratios = (
    ground_truth_abundance[numerator_samples].mean(axis=1)
    / ground_truth_abundance[denominator_samples].mean(axis=1)
)
ground_truth_ratios = ground_truth_ratios.groupby('id').mean()

# Value columns that are compared against the ground truth.
evaluated_columns = imputation_methods + ['gnn_hetero']

# Absolute error of the log2 ratio for every numerator/denominator sample pair.
ratio_mae = calculate_ratio_absolute_error(
    dataset=ds,
    numerator_samples=numerator_samples,
    denominator_samples=denominator_samples,
    molecule='protein',
    columns=evaluated_columns,
    ground_truth_ratios=ground_truth_ratios,
    calculate_log2_ratio=True,
    is_log=True,
)

# Mark each row as differentially expressed (DE) when the ground-truth fold
# change of its protein is at least 2 in either direction.
gt_ratio_per_row = ground_truth_ratios.loc[
    ratio_mae.index.get_level_values("id")
].values
ratio_mae["gt"] = np.abs(np.log(gt_ratio_per_row)) >= np.log(2)

# Three views on the errors: all proteins, non-DE proteins and DE proteins.
subset_titles = {False: "No DE", True: "DE"}
ratio_mae_subsets = {"All": ratio_mae}
for is_de, group in ratio_mae.groupby("gt"):
    ratio_mae_subsets[subset_titles[is_de]] = group

# One box plot per subset; every box summarises the mean error of one
# imputation method over all sample pairs.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(15, 5))
for ax, (title, subset) in zip(axs, ratio_mae_subsets.items()):
    per_pair = subset.groupby(["nominator_sample", "denominator_sample"]).mean()
    long_form = per_pair.drop(columns="gt").melt(
        value_vars=evaluated_columns, var_name="Imputation Method", value_name="MAE"
    )
    sns.boxplot(data=long_form, x="Imputation Method", y="MAE", ax=ax)
    ax.set_title(title)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylabel('$Log_2(Ratio)$ MAE')
../_images/134d280748fe80d8d8deb345fe4a9441b300554fffca9147400d9d7f1f3b811c.png

Often, differentially expressed molecules are identified by statistical testing. Therefore, it is of interest which imputation method best allows differentially expressed molecules to be correctly identified from the imputed values. Given the ground truth ratios, we can investigate this with common metrics such as precision and recall using the evaluate_des function.

from pyproteonet.metrics import evaluate_des

# Metric columns of the evaluate_des result that are compared and plotted.
metric_columns = ['Recall', 'Precision', 'Specificity', 'Accuracy', 'FP Rate', 'F1 Score']

# Differential-expression metrics for every imputed column plus 'aggregated',
# judged against the ground-truth fold changes (fold change >= 2, p <= 0.05).
res = evaluate_des(dataset=ds, molecule='protein', columns=imputation_methods + ['gnn_hetero', 'aggregated'],
                   numerator_samples=numerator_samples, denominator_samples=denominator_samples,
                   gt_fc=ground_truth_ratios, min_fc=2, max_pvalue=0.05, is_log=True, absolute_metrics=False)
res['Method'] = res.index

# Express every metric as the difference to the 'aggregated' row, which serves
# as the baseline. The baseline is taken from the numeric metric columns only,
# so the string-valued 'Method' column never enters the subtraction.
metric_values = res[metric_columns]
res.loc[:, metric_columns] = metric_values - metric_values.loc['aggregated']
res = res.drop(labels=['aggregated'])  # the baseline itself would be all zeros

# Long format: one row per (method, metric) pair for the grouped bar plot.
de_res = res.melt(id_vars=['Method'], value_vars=metric_columns, value_name='%', var_name='Metric')
ax = sns.barplot(data=de_res, x='Method', y='%', hue='Metric')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
[Text(0, 0, 'minprob'),
 Text(1, 0, 'mindet'),
 Text(2, 0, 'bpca'),
 Text(3, 0, 'missforest'),
 Text(4, 0, 'knn'),
 Text(5, 0, 'isvd'),
 Text(6, 0, 'dae'),
 Text(7, 0, 'gnn_hetero')]
../_images/9cacba15aefacf62f268f3f0aa15135952d60fc6d58b11e1aed5ee2fa03a6c65.png