From 06fd924e504b098fa360c542e20032af10d84a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Tue, 15 Jul 2025 14:09:54 -0300 Subject: [PATCH] up: modkit parameters to improve modification data analysis - changed modkit extract full to modkit extract calls for testing (future updates will contain a new modified modkit extract full); - implemented reference FASTA file in modkit extract calls; - updated modkit.yaml with path to reference file used in modkit; - implemented region delimiter to mitochondrial DNA using --region MT; - implemented a filter-threshold of 0.9 for all bases; - implemented bgzf compression of calls.tsv to improve storage management; - implemented modkit sample-probs to analyse distribution of bases (cannonicals and modified) across the samples; - added 6mA in nextflow.config to be basecalled concurrently with 5mC_5hmC. --- README.md | 2 +- parameters/human_blood/modkit.yaml | 1 + src/main.nf | 3 ++- src/modules/modkit.nf | 30 +++++++++++++++++++++---- src/nextflow.config | 2 +- src/sub_workflows/MODKIT_AND_MULTIQC.nf | 3 ++- 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 47834be..76f1017 100755 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Many of the parameters for this step are based on dorado basecaller, see their [ ```txt --basecall_mods - + ``` ```txt diff --git a/parameters/human_blood/modkit.yaml b/parameters/human_blood/modkit.yaml index 5149ad8..3a694f0 100644 --- a/parameters/human_blood/modkit.yaml +++ b/parameters/human_blood/modkit.yaml @@ -1,4 +1,5 @@ project_name: "results_human_blood" step: 3 steps_2_and_3_input_directory: "./results/results_human_blood/" +reference_file: "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" out_dir: "results_human_blood/" \ No newline at end of file diff --git a/src/main.nf b/src/main.nf index d38bd5e..25986bb 100755 --- a/src/main.nf +++ b/src/main.nf @@ -85,6 +85,7 @@ workflow { quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") multiqc_config = Channel.fromPath(params.multiqc_config) multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") + reference_file = file(params.reference_file) } // Run steps if (params.step.toString() == "1") { @@ -94,6 +95,6 @@ workflow { } else if (params.step.toString() == "2_from_minknow") { FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh) } else if (params.step.toString()== "3") { - MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) + MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input, reference_file) } } diff --git a/src/modules/modkit.nf b/src/modules/modkit.nf index d5a1a9d..34dc67e 100755 --- a/src/modules/modkit.nf +++ b/src/modules/modkit.nf @@ -5,21 +5,43 @@ process MODKIT { input: tuple val(id), path(bam) path bai + path reference_file output: path "*" script: """ - echo "starting modkit" + echo "starting probabilities calculation" + ## Inspect base modification probabilities + modkit sample-probs --hist -t 4 -i 10000 \ + --out-dir "./modkit_prob/" "${bam}" + echo "modification prob. calculated" + + echo "starting modkit extract calls" ## Make Methylation TSV file - modkit extract full --bgzf --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" + modkit extract calls --bgzf --queue-size 1000 -t 4 -i 10000 \ + --reference "${reference_file}" \ + --filter-threshold 0.9 \ + --mapped-only \ + --region MT \ + "${bam}" "${id}_modkit_calls.tsv" echo "extract successful" + ## Make Methylation Table - modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" + modkit pileup -t 4 -i 10000 \ + --filter-threshold 0.9 \ + --region MT \ + --sample-region MT \ + "${bam}" "${id}_modkit_pileup.bed" \ + --log-filepath "${id}_modkit_pileup.log" echo "pileup successful" + ## Make Summary - modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" + modkit summary -t 4 -i 10000 \ + --filter-threshold 0.9 \ + --region MT \ + ${bam} > "${id}_modkit_summary.txt" echo "summary successful" """ } diff --git a/src/nextflow.config b/src/nextflow.config index 41d2239..a8d5c38 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -19,7 +19,7 @@ params { // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available) basecall_speed = "sup@latest" // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA). Can't use more than one modification per nucleotide. - basecall_mods = "5mC_5hmC" + basecall_mods = "5mC_5hmC,6mA" // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling) barcoding_kit = "SQK-RBK114-24" // Threshold for mapped reasds diff --git a/src/sub_workflows/MODKIT_AND_MULTIQC.nf b/src/sub_workflows/MODKIT_AND_MULTIQC.nf index 35294c7..b6d77d3 100755 --- a/src/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/src/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -13,11 +13,12 @@ workflow MODKIT_AND_MULTIQC { quality_thresholds multiqc_config multiqc_input + reference_file main: CALCULATE_COVERAGE(filtered_bams, filtered_bais) MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect()) MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config) - MODKIT(filtered_bams, filtered_bais) + MODKIT(filtered_bams, filtered_bais, reference_file) } -- GitLab