Chapter 5 MS2-Annotation
This chapter consists of:
Submit the CMN job and the jobs based on CMN on GNPS (involves uploading files to the GNPS server)
Submit the FBMN job and the jobs based on FBMN on GNPS (involves uploading files to the GNPS server)
Download all results from all submitted jobs and merge into a large table
Compare MS2-experimental data with our in-house library
Merge the results obtained from each GNPS-job and library-match into one table
5.1 MS2-inhouse-library v.s. experimental data
The script 1c.Pipeline_exp_vs_lib.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
5.1.3 Provide path of preprocessed MS2-spectra file 🔵
It is the absolute path, including the file name, of the file in *.mgf format
5.1.4 Provide the thresholds 🔵
They are the thresholds for m/z and for the similarity score; matches whose score is lower than the similarity threshold are cut off
5.1.6 (Optional) Provide the method for comparison
# Spectral comparison method; handed to MS2_vs_exp() as its last argument.
cp_method <- "cosine"
You could also try other comparison methods: "nspecanglescore", "navdistscore", or "neuclideanscore"
5.1.7 Conduct
# Pick the in-house MS2 library file that matches the acquisition polarity.
# Any value other than "POS"/"NEG" is rejected here: without the final
# branch `path_lib_MS2` would stay undefined (or keep a stale value from an
# earlier run in the same session) and the comparison below would either
# fail with an unrelated error or silently use the wrong library.
if (polarity == "POS") {
  path_lib_MS2 <- paste0(path_MS2_lib, "/RP_lib_MS2_60_POS.mgf")
} else if (polarity == "NEG") {
  path_lib_MS2 <- paste0(path_MS2_lib, "/RP_lib_MS2_60_NEG.mgf")
} else {
  stop(
    "`polarity` must be \"POS\" or \"NEG\", got: ", polarity,
    call. = FALSE
  )
}

# Source the comparison script from `path_func`
# (presumably this is where MS2_vs_exp() is defined -- confirm).
source(paste0(path_func, "/lib_vs_exp_MS2.R"))

# Compare the experimental spectra (`path_mgf_expMS2`) against the selected
# library, using the m/z and score thresholds and the method in `cp_method`.
MS2_vs_exp(path_lib_MS2, path_mgf_expMS2, mz_threshold, cor_threshold, polarity, path_outputs, cp_method)
5.2 GNPS-Submit jobs based on CMN
The script 1b.Pipeline_GNPS_upload_submitjobs_based on CMN.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
The pipeline is for:
1.Uploading files to the server of GNPS.
2.Generating parameters for each job you would like to run.
3.Running CMN (classic molecular network) and a couple of other jobs based on CMN on GNPS.
4.Changing the parameters related to the jobs you would like to run.
5.Please make sure that no filename contains spaces
5.2.1 Upload files
5.2.2 Prerequisites before generating parameters and submitting jobs 🔵
!!! Run the chunk below every time before you start running any job
5.2.3 CMN
5.2.3.1 Generate parameters 🔵
# Settings for the CMN (classic molecular network) job.
# Built in one literal; every value is a character string keyed by its
# parameter name, in the same order as before.
parameters_CMN <- list(
  precursor_ion_mass_tolerance = "0.02",
  fragment_ion_mass_tolerance = "0.02",
  min_pairs_cos = "0.7",
  minimum_matched_fragment_ions = "6",
  maximum_shift = "500",
  network_topk = "10",
  minimum_cluster_size = "2",
  run_mscluster = "on",
  maximum_connected_component_size = "100",
  library_search_min_matched_peaks = "6",
  score_threshold = "0.7",
  search_analogs = "0",
  maximum_analog_search_mass_difference = "100",
  filter_below_std_dev = "0.0",
  minimum_peak_intensity = "0.0",
  filter_precursor_window = "1",
  filter_library = "1",
  filter_peaks_10_50Da_window = "1",
  filter_spectra_from_G6_as_blanks_before_networking = "0",
  find_related_datasets = "0",
  create_cluster_buckets_and_qiime2_biom_pcoa_plots_output = "1",
  create_ili_mapping_output = "0"
)
5.2.4 Dereplicator
5.2.4.1 Generate parameters 🔵
# Settings for the Dereplicator job based on CMN.
# Checkbox-style options take "on" when selected and "" when not.
parameters_CMN_Dereplicator <- list(
  precursor_ion_mass_tolerance = "0.02",
  search_analogs = "on", # leave as "" if not selected
  fragment_ion_mass_tolerance = "0.02",
  pnp_database = "pnpdatabase", # "dmisam" for "Extended", "combined" for "Regular"
  max_charge = "2",
  accurate_p_values = "on", # leave as "" if not selected
  min_number_of_AA = "5",
  max_isotopic_shift = "0", # other options are "1", "2"
  adducts_Na = "", # set to "on" to select
  adducts_K = "", # set to "on" to select
  max_allowed_modification_mass = "150",
  min_matched_peaks_with_known_compounds = "5"
)
5.2.5 Dereplicator plus
5.2.5.1 Generate parameters 🔵
# Settings for the Dereplicator+ job based on CMN (all values as strings).
parameters_CMN_Dereplicator_plus <- list(
  precursor_ion_mass_tolerance = "0.02",
  fragment_ion_mass_tolerance = "0.02",
  max_charge = "2", # other options are 1 and 3
  min_score = "12"
)
5.2.6 NAP
5.2.6.1 Generate parameters 🔵
*Some parameters differ between polarities
# Polarity-independent settings for the NAP job based on CMN.
# The polarity-specific entries are added to this list afterwards.
parameters_CMN_NAP <- list(
  number_of_a_cluster_index = "0",
  cosine_value_to_subselect_inside_a_cluster = "0.7",
  n_first_candidates_for_consensus_score = "10",
  use_fusion_result_for_consensus = "on", # "" to deselect
  accuracy_for_exact_mass_candidate_search_ppm = "15",
  structure_database = "GNPS,HMDB,SUPNAT,NPAtlas,CHEBI,DRUGBANK,FooDB",
  maximum_number_of_candidate_structures_in_the_graph = "10"
)
Be careful: some parameter inputs depend on the polarity, so only change the inputs related to the polarity you are working on
# Add the three polarity-specific NAP entries in a single multi-key
# assignment per branch. Nothing is added when `polarity` is neither
# "POS" nor "NEG".
if (polarity == "POS") {
  # other adduct_ion_type options are "[M]","[M+NH4]","[M+Na]","[M+K]", "[M+ACN+H]"
  parameters_CMN_NAP[c("acquisition_mode", "adduct_ion_type", "multiple_adduct_types")] <-
    list("Positive", "[M+H]", "[M+Na]")
} else if (polarity == "NEG") {
  # other adduct_ion_type options are "[M+Cl]", "[M+FA-H]"
  parameters_CMN_NAP[c("acquisition_mode", "adduct_ion_type", "multiple_adduct_types")] <-
    list("Negative", "[M-H]", "[M+Cl]")
}
5.2.7 MS2LDA
5.2.7.1 Generate parameters 🔵
If the polarity is negative, it is recommended to exclude all motifs by changing the values from "yes" to "no"
# Settings for the MS2LDA job based on CMN.
# The *_motif_inclusion switches take "yes" or "no".
parameters_CMN_MS2LDA <- list(
  bin_width = "0.01", # other options are 0.005, 0.05, 0.1, 0.5
  number_of_lda_iterations = "1000",
  minimum_ms2_intensity = "100",
  lda_free_motifs = "200",
  gnps_motif_inclusion = "yes", # "no" to deselect
  massbank_motif_inclusion = "yes",
  urine_motif_inclusion = "yes",
  euphorbia_motif_inclusion = "no",
  rhamnaceae_plant_motif_inclusion = "no",
  streptomyces_and_salinisporus_motif_inclusion = "no",
  photorhabdus_and_xenorhabdus_motif_inclusion = "no",
  user_motif_inclusion = "None",
  overlap_score_threshold = "0.3",
  probability_value_threshold = "0.1",
  topx_in_node = "5"
)
5.2.8 MNE (MolNetEnhancer)
5.2.8.1 Generate parameters 🔵
# Source GNPS_upload_submitjobs.R from the annotation utilities folder
# (presumably the upload/submit helper functions -- confirm).
source(paste0(path_utils_annotation, "/GNPS_upload_submitjobs.R"))
# MolNetEnhancer (MNE) inputs: the task ids of the CMN-based jobs to combine.
# NOTE(review): `list_jobid_GNPS` is not created in this chunk; it must
# already exist -- presumably set up by an earlier step; confirm.
# NOTE(review): if `list_jobid_GNPS` is a list and an id is absent, `$`
# yields NULL and the single-bracket assignment then adds no entry at all;
# assign "None" explicitly for any job that was not run.
parameters_CMN_MNE <- list()
parameters_CMN_MNE["enter_varquest_id"] <- "None" # leave it as "None" if this id is not available
parameters_CMN_MNE["enter_nap_id"] <- list_jobid_GNPS$CMN_NAP_task_id
parameters_CMN_MNE["enter_gnps_task_id"] <- list_jobid_GNPS$CMN_task_id
parameters_CMN_MNE["enter_Dereplicator_id"] <- list_jobid_GNPS$CMN_Dereplicator_task_id
parameters_CMN_MNE["enter_ms2lda_job_id"] <- list_jobid_GNPS$CMN_MS2LDA_task_id # leave it as "None" if this id is not available
5.2.9 Merge network polarity
5.2.9.1 Generate parameters 🔵
For a CMN-based network, the unit of the RT tolerance is seconds
# Start from an empty container.
# NOTE(review): `list_param_jobid_MNP` is not used in the visible lines --
# presumably filled by the sourced script or a later step; confirm.
list_param_jobid_MNP <- list()
# Source GNPS_upload_submitjobs.R from the annotation utilities folder.
source(paste0(path_utils_annotation, "/GNPS_upload_submitjobs.R"))
# Inputs for merging the positive and negative CMN networks.
# NOTE(review): `list_jobid_MN_both` is not created in this chunk; it must
# already hold the CMN task ids of both polarities -- confirm where it is set.
parameters_CMN_MNP <- list()
parameters_CMN_MNP["enter_gnps_positive_network_task_id"] <- list_jobid_MN_both$CMN_task_id_POS
parameters_CMN_MNP["enter_gnps_negative_network_task_id"] <- list_jobid_MN_both$CMN_task_id_NEG
# RT tolerance; stored as a number (per the chapter text, seconds for CMN-based networks).
parameters_CMN_MNP["enter_a_rt_tolerance_for_aligning_masses_between_two_runs"] <- 10
# Mass tolerance in ppm; stored as a number.
parameters_CMN_MNP["enter_a_ppm_tolerance_for_aligning_masses_between_two_runs"] <- 30
5.3 GNPS-Submit jobs based on FBMN
You only need to prepare the preprocessed MS2 spectra in .mgf format and the corresponding quantification table in .txt or .csv format
1. It is not recommended to preprocess MS2 data with MetaboScape, because 1) in the exported .mgf file, the MS2 spectra are averaged across multiple CEs; 2) in the exported .csv file, the m/z values are not correct
2. Change the parameters in the sections below related to the workflows you would like to run
5.3.1 Form quantification table
The script 0b.Pipeline_quan_table.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
5.3.2 Upload files, generate parameters, and submit jobs
The script 1b.Pipeline_GNPS_upload_submitjobs_based on FBMN.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
The pipeline is for:
1.Uploading files to the server of GNPS.
2.Generating parameters for each job you would like to run.
3.Running FBMN (feature based molecular network) and a couple of other jobs based on FBMN on GNPS.
4.Changing the parameters related to the jobs you would like to run.
5.3.2.1 Upload files
5.3.2.1.6 Provide the filenames 🔵
Provide the .txt or .csv file of the quantification table and the .mgf file of the MS2 spectra
5.3.2.3 FBMN
5.3.2.3.1 Generate parameters 🔵
# Settings for the FBMN (feature based molecular network) job.
# Built in one literal; every value is a character string keyed by its
# parameter name, in the same order as before.
parameters_FBMN <- list(
  precursor_ion_mass_tolerance = "0.02",
  fragment_ion_mass_tolerance = "0.02",
  min_pairs_cos = "0.7",
  minimum_matched_fragment_ions = "6",
  maximum_shift = "500",
  network_topk = "10",
  maximum_connected_component_size = "100",
  library_search_min_matched_peaks = "6",
  score_threshold = "0.7",
  search_analogs = "0", # "0" don't search / "1" search
  maximum_analog_search_mass_difference = "100", # only takes effect when analog search is selected
  top_results_to_report_per_query = "1",
  minimum_peak_intensity = "0.0",
  filter_precursor_window = "1", # "1" filter / "0" don't filter
  filter_library = "1", # "1" filter / "0" don't filter
  filter_peaks_10_50Da_window = "1", # "1" filter / "0" don't filter
  normalization_per_file = "None", # no normalization; the other option is "RowSum"
  aggregation_method_for_peak_abundances_per_group = "Mean", # another option is "Sum"
  pcoa_distance_metric = "cosine", # other options are "braycurtis", "euclidean", "jaccard"
  metadata_column_to_compare = "None",
  run_stats_and_plots = "No", # other option is "Yes"
  metadata_field_to_compare_1 = "None",
  metadata_field_to_compare_2 = "None",
  metadata_column_to_facet = "None",
  run_Dereplicator = "1" # "0" don't run / "1" run
)
5.3.2.4 Dereplicator plus
5.3.2.4.1 Generate parameters 🔵
# Settings for the Dereplicator+ job based on FBMN (all values as strings).
parameters_FBMN_Dereplicator_plus <- list(
  precursor_ion_mass_tolerance = "0.02",
  fragment_ion_mass_tolerance = "0.02",
  max_charge = "2", # other options are 1 and 3
  min_score = "12"
)
5.3.2.5 NAP
*Some parameters differ between polarities
5.3.2.5.1 Generate parameters 🔵
# Polarity-independent settings for the NAP job based on FBMN.
# The polarity-specific entries are added to this list afterwards.
parameters_FBMN_NAP <- list(
  number_of_a_cluster_index = "0",
  cosine_value_to_subselect_inside_a_cluster = "0.7",
  n_first_candidates_for_consensus_score = "10",
  use_fusion_result_for_consensus = "on", # "" to deselect
  accuracy_for_exact_mass_candidate_search_ppm = "15",
  structure_database = "GNPS,HMDB,SUPNAT,NPAtlas,CHEBI,DRUGBANK,FooDB",
  maximum_number_of_candidate_structures_in_the_graph = "10"
)
Be careful: some parameter inputs depend on the polarity, so only change the inputs related to the polarity you are working on
# Add the three polarity-specific NAP entries in a single multi-key
# assignment per branch. Nothing is added when `polarity` is neither
# "POS" nor "NEG".
if (polarity == "POS") {
  # other adduct_ion_type options are "[M]", "[M+NH4]", "[M+Na]", "[M+K]", "[M+ACN+H]"
  parameters_FBMN_NAP[c("acquisition_mode", "adduct_ion_type", "multiple_adduct_types")] <-
    list("Positive", "[M+H]", "[M+Na]")
} else if (polarity == "NEG") {
  # other adduct_ion_type options are "[M+Cl]", "[M+FA-H]"
  parameters_FBMN_NAP[c("acquisition_mode", "adduct_ion_type", "multiple_adduct_types")] <-
    list("Negative", "[M-H]", "[M+Cl]")
}
5.3.2.6 (Optional) MS2LDA
5.3.2.6.1 Generate parameters 🔵
# Settings for the MS2LDA job based on FBMN.
# The *_motif_inclusion switches take "yes" or "no".
parameters_FBMN_MS2LDA <- list(
  bin_width = "0.01", # other options are 0.005, 0.05, 0.1, 0.5
  number_of_lda_iterations = "1000",
  minimum_ms2_intensity = "100",
  lda_free_motifs = "200",
  gnps_motif_inclusion = "yes", # "no" to deselect
  massbank_motif_inclusion = "yes",
  urine_motif_inclusion = "yes",
  euphorbia_motif_inclusion = "no",
  rhamnaceae_plant_motif_inclusion = "no",
  streptomyces_and_salinisporus_motif_inclusion = "no",
  photorhabdus_and_xenorhabdus_motif_inclusion = "no",
  user_motif_inclusion = "None",
  overlap_score_threshold = "0.3",
  probability_value_threshold = "0.1",
  topx_in_node = "5"
)
5.3.2.7 MNE (MolNetEnhancer)
5.3.2.7.1 Generate parameters 🔵
# Source GNPS_upload_submitjobs.R from the annotation utilities folder
# (presumably the upload/submit helper functions -- confirm).
source(paste0(path_utils_annotation, "/GNPS_upload_submitjobs.R"))
# MolNetEnhancer (MNE) inputs: the task ids of the FBMN-based jobs to combine.
# NOTE(review): `list_jobid_GNPS` is not created in this chunk; it must
# already exist -- presumably set up by an earlier step; confirm.
# NOTE(review): if `list_jobid_GNPS` is a list and an id is absent, `$`
# yields NULL and the single-bracket assignment then adds no entry at all;
# assign "None" explicitly for any job that was not run.
parameters_FBMN_MNE <- list()
parameters_FBMN_MNE["enter_varquest_id"] <- "None" # leave it as "None" if this id is not available
parameters_FBMN_MNE["enter_nap_id"] <- list_jobid_GNPS$FBMN_NAP_task_id
parameters_FBMN_MNE["enter_gnps_task_id"] <- list_jobid_GNPS$FBMN_task_id
parameters_FBMN_MNE["enter_ms2lda_job_id"] <- list_jobid_GNPS$FBMN_MS2LDA_task_id # use "None" instead if MS2LDA was not run
5.3.3 Merge network polarity
5.3.3.1 Generate parameters 🔵
For an FBMN-based network, the unit of the RT tolerance is minutes
# Start from an empty container.
# NOTE(review): `list_param_jobid_MNP` is not used in the visible lines --
# presumably filled by the sourced script or a later step; confirm.
list_param_jobid_MNP <- list()
# Source GNPS_upload_submitjobs.R from the annotation utilities folder.
source(paste0(path_utils_annotation, "/GNPS_upload_submitjobs.R"))
# Inputs for merging the positive and negative FBMN networks.
# NOTE(review): `list_jobid_MN_both` is not created in this chunk; it must
# already hold the FBMN task ids of both polarities -- confirm where it is set.
parameters_FBMN_MNP <- list()
parameters_FBMN_MNP["enter_gnps_positive_network_task_id"] <- list_jobid_MN_both$FBMN_task_id_POS
parameters_FBMN_MNP["enter_gnps_negative_network_task_id"] <- list_jobid_MN_both$FBMN_task_id_NEG
# RT tolerance; stored as a number (per the chapter text, minutes for FBMN-based networks).
parameters_FBMN_MNP["enter_a_rt_tolerance_for_aligning_masses_between_two_runs"] <- 0.1
# Mass tolerance in ppm; stored as a number.
parameters_FBMN_MNP["enter_a_ppm_tolerance_for_aligning_masses_between_two_runs"] <- 20
5.4 GNPS-Download jobs
The script 2.Pipeline_GNPS_download_jobs.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
5.4.5 Provide the absolute path of the "list of job ids", including the name of that list 🔵
The name should start with list_jobid_GNPS
5.5 Merge results from different sources
5.5.1 Merging outputs from jobs that have run on GNPS into a table
The script 3.Pipeline_merge_outputs_from_GNPS.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
5.5.1.2 Provide the path for storing the files downloaded from GNPS 🔵
The path should end with gnps_outputs
5.5.1.3 Provide the absolute path of the "list of job ids", including the name of that list 🔵
The name should start with list_jobid_GNPS
5.5.1.5 Provide the method you used for preprocessing the MS2 data 🔵
If you ran jobs based on CMN, skip running the line below
5.5.1.8 Conduct
library(dplyr)

# Read the saved list of job ids from `path_list`.
list_jobid <- readRDS(path_list)

# Drop the "_task_id" part of every element name.
workflow_all <- gsub("_task_id", "", names(list_jobid))

# Keep the names matching the chosen network type, split each on its
# upper-case "XXX_" prefix, flatten, de-duplicate, then discard the empty
# strings that the split leaves behind (turned into NA, then omitted).
workflow <- strsplit(workflow_all[grepl(workflow_based, workflow_all)], "[A-Z]+_")
workflow <- unique(unlist(workflow))
workflow <- na.omit(na_if(workflow, ""))

# Merge the downloaded outputs. `prep_method` is only read for FBMN;
# the CMN call passes NULL in its place. Any other `workflow_based`
# value runs nothing.
if (workflow_based == "CMN") {
  Merge_outputs_from_GNPS(path_downloaded_files, NULL, workflow, polarity, path_outputs)
} else if (workflow_based == "FBMN") {
  Merge_outputs_from_GNPS(path_downloaded_files, prep_method, workflow, polarity, path_outputs)
}
5.5.1.9 Merge library-match into GNPS-match
The script 4.Pipeline_merge_lib-match_GNPS-match.R
is available under the folder of J:\CBMR\SUN-CBMR-Metabolomics\Workflow\Script\modules\MS2_Annotation
It is more precise to merge the result from library-match with the results from FBMN-based jobs on GNPS