From 06fd924e504b098fa360c542e20032af10d84a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Tue, 15 Jul 2025 14:09:54 -0300 Subject: [PATCH 1/3] up: modkit parameters to improve modification data analysis - changed modkit extract full to modkit extract calls for testing (future updates will contain a new modified modkit extract full); - implemented reference FASTA file in modkit extract calls; - updated modkit.yaml with path to reference file used in modkit; - implemented region delimiter to mitochondrial DNA using --region MT; - implemented a filter-threshold of 0.9 for all bases; - implemented bgzf compression of calls.tsv to improve storage management; - implemented modkit sample-probs to analyse distribution of bases (cannonicals and modified) across the samples; - added 6mA in nextflow.config to be basecalled concurrently with 5mC_5hmC. --- README.md | 2 +- parameters/human_blood/modkit.yaml | 1 + src/main.nf | 3 ++- src/modules/modkit.nf | 30 +++++++++++++++++++++---- src/nextflow.config | 2 +- src/sub_workflows/MODKIT_AND_MULTIQC.nf | 3 ++- 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 47834be..76f1017 100755 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Many of the parameters for this step are based on dorado basecaller, see their [ ```txt --basecall_mods - + ``` ```txt diff --git a/parameters/human_blood/modkit.yaml b/parameters/human_blood/modkit.yaml index 5149ad8..3a694f0 100644 --- a/parameters/human_blood/modkit.yaml +++ b/parameters/human_blood/modkit.yaml @@ -1,4 +1,5 @@ project_name: "results_human_blood" step: 3 steps_2_and_3_input_directory: "./results/results_human_blood/" +reference_file: "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" out_dir: "results_human_blood/" \ No newline at end of file diff --git a/src/main.nf b/src/main.nf index d38bd5e..25986bb 100755 --- a/src/main.nf +++ b/src/main.nf @@ -85,6 +85,7 @@ workflow { quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") multiqc_config = Channel.fromPath(params.multiqc_config) multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") + reference_file = file(params.reference_file) } // Run steps if (params.step.toString() == "1") { @@ -94,6 +95,6 @@ workflow { } else if (params.step.toString() == "2_from_minknow") { FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh) } else if (params.step.toString()== "3") { - MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) + MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input, reference_file) } } diff --git a/src/modules/modkit.nf b/src/modules/modkit.nf index d5a1a9d..34dc67e 100755 --- a/src/modules/modkit.nf +++ b/src/modules/modkit.nf @@ -5,21 +5,43 @@ process MODKIT { input: tuple val(id), path(bam) path bai + path reference_file output: path "*" script: """ - echo "starting modkit" + echo "starting probabilities calculation" + ## Inspect base modification probabilities + modkit sample-probs --hist -t 4 -i 10000 \ + --out-dir "./modkit_prob/" "${bam}" + echo "modification prob. calculated" + + echo "starting modkit extract calls" ## Make Methylation TSV file - modkit extract full --bgzf --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" + modkit extract calls --bgzf --queue-size 1000 -t 4 -i 10000 \ + --reference "${reference_file}" \ + --filter-threshold 0.9 \ + --mapped-only \ + --region MT \ + "${bam}" "${id}_modkit_calls.tsv" echo "extract successful" + ## Make Methylation Table - modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" + modkit pileup -t 4 -i 10000 \ + --filter-threshold 0.9 \ + --region MT \ + --sample-region MT \ + "${bam}" "${id}_modkit_pileup.bed" \ + --log-filepath "${id}_modkit_pileup.log" echo "pileup successful" + ## Make Summary - modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" + modkit summary -t 4 -i 10000 \ + --filter-threshold 0.9 \ + --region MT \ + ${bam} > "${id}_modkit_summary.txt" echo "summary successful" """ } diff --git a/src/nextflow.config b/src/nextflow.config index 41d2239..a8d5c38 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -19,7 +19,7 @@ params { // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available) basecall_speed = "sup@latest" // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA). Can't use more than one modification per nucleotide. - basecall_mods = "5mC_5hmC" + basecall_mods = "5mC_5hmC,6mA" // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling) barcoding_kit = "SQK-RBK114-24" // Threshold for mapped reasds diff --git a/src/sub_workflows/MODKIT_AND_MULTIQC.nf b/src/sub_workflows/MODKIT_AND_MULTIQC.nf index 35294c7..b6d77d3 100755 --- a/src/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/src/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -13,11 +13,12 @@ workflow MODKIT_AND_MULTIQC { quality_thresholds multiqc_config multiqc_input + reference_file main: CALCULATE_COVERAGE(filtered_bams, filtered_bais) MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect()) MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config) - MODKIT(filtered_bams, filtered_bais) + MODKIT(filtered_bams, filtered_bais, reference_file) } -- GitLab From e3fe36a325641b84a02536e5338818854335d17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Wed, 13 Aug 2025 12:34:02 -0300 Subject: [PATCH 2/3] fix: few parameters in modkit and identation --- src/modules/modkit.nf | 44 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/modules/modkit.nf b/src/modules/modkit.nf index df6c893..16d2f20 100755 --- a/src/modules/modkit.nf +++ b/src/modules/modkit.nf @@ -14,22 +14,48 @@ process MODKIT { script: """ - echo "starting probabilities calculation" + echo "starting probabilities calculation" ## Inspect base modification probabilities - modkit sample-probs --hist -t 4 -i 10000 \ - --out-dir "./modkit_prob/" "${bam}" + modkit sample-probs --hist \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --only-mapped \ + --region MT \ + --out-dir "./modkit_prob/" "${bam}" echo "modification prob. calculated" - echo "starting modkit extract calls" ## Make Methylation TSV file - modkit extract full --bgzf --queue-size ${modkit_extract_qsize} --threads ${modkit_threads} --interval-size ${modkit_isize} --mapped-only "${bam}" "${id}_modkit_output.tsv" + modkit extract full --bgzf \ + --queue-size ${modkit_extract_qsize} \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --region MT \ + --mapped-only \ + "${bam}" "${id}_modkit_full.tsv.gz" echo "extract successful" ## Make Methylation Table (--chunk-size is a multiplier of threads, 1.5x by default) - modkit pileup --threads ${modkit_threads} --interval-size ${modkit_isize} "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" + modkit pileup --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --region MT \ + --filter-threshold C:0.75 \ + --filter-threshold A:0.75 \ + --mod-thresholds m:0.75 \ + --mod-thresholds h:0.75 \ + --mod-thresholds a:0.75 \ + "${bam}" "${id}_modkit_pileup.bed" \ + --log-filepath "${id}_modkit_pileup.log" echo "pileup successful" - ## Make Summary - modkit summary --threads ${modkit_threads} --interval-size ${modkit_isize} ${bam} > "${id}_modkit_summary.txt" + modkit summary --only-mapped \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --region MT \ + --filter-threshold C:0.75 \ + --filter-threshold A:0.75 \ + --mod-thresholds m:0.75 \ + --mod-thresholds h:0.75 \ + --mod-thresholds a:0.75 \ + ${bam} > "${id}_modkit_summary.txt" echo "summary successful" """ -} +} \ No newline at end of file -- GitLab From ee56b0cccc8efd5aaf0c60beb0b178160489da3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Fri, 15 Aug 2025 19:48:26 -0300 Subject: [PATCH 3/3] fixed identation on lines 50-57 in modkit --- src/modules/modkit.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/modules/modkit.nf b/src/modules/modkit.nf index 16d2f20..d39edde 100755 --- a/src/modules/modkit.nf +++ b/src/modules/modkit.nf @@ -47,14 +47,14 @@ process MODKIT { echo "pileup successful" ## Make Summary modkit summary --only-mapped \ - --threads ${modkit_threads} \ - --interval-size ${modkit_isize} \ - --region MT \ - --filter-threshold C:0.75 \ - --filter-threshold A:0.75 \ - --mod-thresholds m:0.75 \ - --mod-thresholds h:0.75 \ - --mod-thresholds a:0.75 \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --region MT \ + --filter-threshold C:0.75 \ + --filter-threshold A:0.75 \ + --mod-thresholds m:0.75 \ + --mod-thresholds h:0.75 \ + --mod-thresholds a:0.75 \ ${bam} > "${id}_modkit_summary.txt" echo "summary successful" """ -- GitLab