From c64fa5de224eb5c7557614e4902cc0d235904ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Tue, 24 Jun 2025 18:14:24 -0300 Subject: [PATCH 1/4] up: basecalling, nextflow.config, nextflow parameters and readability - Updated multiple variables in nextflow.config to match them across the different processes and workflows; - Added profiles in nextflow to facilitate the steps; - Added --kit-name configuration (params.barcoding_kit) to improve trimming during basecalling; - Changed descriptions across variables in nextflow.config; - Improved output directory name to include the projects' name (params.project_name); - Updated params.basecall_speed to use latest available simplex/modification model for basecalling; - Reorganized demux process to be included in the basecalling process to improve code readability; - Corrected few typos. --- .gitignore | 1 + src/main.nf | 32 ++--- src/modules/basecall.nf | 121 +++++++----------- src/modules/convert_input_from_minknow.nf | 8 +- src/modules/pycoqc.nf | 8 +- src/nextflow.config | 66 +++++++--- src/sub_workflows/BASECALLING.nf | 41 +++--- .../FILTERING_AND_QC_FROM_MINKNOW.nf | 8 +- .../FILTERING_AND_QC_FROM_STEP_1.nf | 14 +- 9 files changed, 153 insertions(+), 146 deletions(-) diff --git a/.gitignore b/.gitignore index fd17af6..12d23bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # project data +models dataset images results diff --git a/src/main.nf b/src/main.nf index 06e3012..d38bd5e 100755 --- a/src/main.nf +++ b/src/main.nf @@ -19,9 +19,9 @@ workflow { basecall read trimming option : ${params.basecall_trim} basecall quality score threshold for basecalling : ${params.qscore_thresh} basecall demultiplexing : ${params.basecall_demux} - trim barcodes during demultiplexing : ${params.trim_barcode} + trim barcodes during demultiplexing : ${params.trimmed_barcodes} submission output file prefix : ${params.prefix} - GPU device for submission : ${params.gpu_devices} + GPU device for submission : 
${params.gpu_devices} Output directory : ${params.out_dir} ================================================================= """ @@ -47,7 +47,7 @@ workflow { =============================================== """ } else { - println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE" + println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://gmapsrv.pucrs.br/gitlab/ccd-public/nanopore" System.exit(1) } // Set initial files and channels @@ -59,25 +59,25 @@ workflow { fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() } - ref = file(params.ref) - quality_score = Channel.value(params.qscore_thresh) basecall_speed = Channel.value(params.basecall_speed) basecall_mods = Channel.value(params.basecall_mods) basecall_config = Channel.value(params.basecall_config) basecall_trim = Channel.value(params.basecall_trim) - // basecall_compute = Channel.value(params.basecall_compute) - trim_barcode = Channel.value(params.trim_barcode) - devices = Channel.value(params.gpu_devices) + qscore_thresh = Channel.value(params.qscore_thresh) + barcoding_kit = Channel.value(params.barcoding_kit) + trimmed_barcodes = Channel.value(params.trimmed_barcodes) + gpu_devices = Channel.value(params.gpu_devices) + reference_file = file(params.reference_file) } else if (params.step.toString() == "2_from_step_1") { - total_bams = 
Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() + bam_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) + txt_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() mapq = Channel.value(params.mapq) - quality_score = Channel.value(params.qscore_thresh) + qscore_thresh = Channel.value(params.qscore_thresh) } else if (params.step.toString() == "2_from_minknow") { input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/") mapq = Channel.value(params.mapq) - quality_score = Channel.value(params.qscore_thresh) - } else if (params.step.toString() == "3") { + qscore_thresh = Channel.value(params.qscore_thresh) + } else if (params.step.toString() == "3") { filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") @@ -88,11 +88,11 @@ workflow { } // Run steps if (params.step.toString() == "1") { - BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref) + BASECALLING(pod5_path, fast5_path, 
basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file) } else if (params.step.toString() == "2_from_step_1") { - FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score) + FILTERING_AND_QC_FROM_STEP_1(bam_files, txt_files, mapq, qscore_thresh) } else if (params.step.toString() == "2_from_minknow") { - FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score) + FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh) } else if (params.step.toString()== "3") { MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) } diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf index 18efc8b..a59fb78 100755 --- a/src/modules/basecall.nf +++ b/src/modules/basecall.nf @@ -10,7 +10,10 @@ process FAST5_to_POD5 { script: """ - pod5 convert fast5 *.fast5 --output . --one-to-one . --threads 12 + pod5 convert fast5 *.fast5 \ + --output . \ + --one-to-one . \ + --threads 12 """ } @@ -20,112 +23,82 @@ process BASECALL { input: tuple val(id), path(pod5_dir) - val speed - val mods - val config - val trim - val qscore - val devices - path ref + val basecall_speed + val basecall_mods + val basecall_config + val basecall_trim + val qscore_thresh + val barcoding_kit + val trimmed_barcodes + val gpu_devices + path reference_file output: - path ("${id}.bam"), emit: bam - path ("${id}.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt script: """ echo "Basecalling started for: ${id}" - if [[ "${config}" == "false" ]]; then - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" + if [[ "${basecall_config}" == "None" ]]; then + if [[ "${basecall_mods}" == "None" ]]; then + dorado basecaller "${basecall_speed}" . \ + ${barcoding_kit != "None" ? 
"--kit-name ${barcoding_kit}" : ""} \ + --trim "${basecall_trim}" \ + --min-qscore "${qscore_thresh}" \ + --reference "${reference_file}" \ + --device "cuda:${gpu_devices}" > "${id}.bam" else - dorado basecaller "${speed},${mods}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" + dorado basecaller "${basecall_speed},${basecall_mods}" . \ + ${barcoding_kit != "None" ? "--kit-name ${barcoding_kit}" : ""} \ + --trim "${basecall_trim}" \ + --min-qscore "${qscore_thresh}" \ + --reference "${reference_file}" \ + --device "cuda:${gpu_devices}" > "${id}.bam" fi else - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi + dorado basecaller "${basecall_config}" . \ + ${barcoding_kit != "None" ? "--kit-name ${barcoding_kit}" : ""} \ + --trim "${basecall_trim}" \ + --min-qscore "${qscore_thresh}" \ + --reference "${reference_file}" \ + --device "cuda:${gpu_devices}" > "${id}.bam" fi echo "Basecalling completed, sorting bams..." - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" + samtools sort -@ 12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - echo "Bams sorted, generating summary with dorado..." 
- dorado summary "${id}.bam" > "${id}.txt" - echo "Process completed for: ${id}" - """ -} - -process BASECALL_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - label 'gpu' - - input: - tuple val(id), path(pod5_dir) - val speed - val mods - val config - val trim - val qscore - val trim_barcode - val devices - path ref - - output: - path ("${id}.bam"), emit: bam - path ("${id}.txt"), emit: txt - - script: - """ - echo "Demultiplexed basecalling started for: ${id}" - if [[ "${config}" == "false" ]]; then - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi - else - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi - fi - - echo "Basecalling completed, sorting bams..." - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" - rm "${id}.bam" - mv "${id}_sorted.bam" "${id}.bam" - echo "Bams sorted, demultiplexing..." - if [[ "${trim_barcode}" == "true" ]]; then + if [[ "${trimmed_barcodes}" == "True" ]]; then echo "Demultiplexing with barcode trimming..." dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else - echo "Demultiplexing without barcode trimming..." - dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" + echo "Demultiplexing and barcode trimming..." 
+ dorado demux --trim "${basecall_trim}" --output-dir "./demux_data/" "${id}.bam" fi echo "Demultiplexing completed, sorting barcode files..." cd ./demux_data/ for file in *; do - samtools sort -@ -12 "\$file" -o "${id}_\${file}" + samtools sort -@ 12 "\$file" -o "${id}_\$file" rm "\$file" done - - echo "Bams sorted, generating summary with dorado..." + cd ../ - rm "${id}.bam" mv ./demux_data/* ./ rm -r ./demux_data/ + + echo "Bams sorted, generating summary with dorado..." for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done echo "Process completed for: ${id}" + + # echo "Bams sorted, generating summary with dorado..." + # dorado summary "${id}.bam" > "${id}.txt" + # echo "Process completed for: ${id}" """ } \ No newline at end of file diff --git a/src/modules/convert_input_from_minknow.nf b/src/modules/convert_input_from_minknow.nf index 5815965..97f978e 100755 --- a/src/modules/convert_input_from_minknow.nf +++ b/src/modules/convert_input_from_minknow.nf @@ -6,8 +6,8 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { path input output: - path "*.bam", emit: bam - path "*.txt", emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt script: """ @@ -50,8 +50,8 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { path input output: - path "*.bam", emit: bam - path "*.txt", emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt script: """ diff --git a/src/modules/pycoqc.nf b/src/modules/pycoqc.nf index be1ff1b..eb023ca 100755 --- a/src/modules/pycoqc.nf +++ b/src/modules/pycoqc.nf @@ -11,7 +11,7 @@ process PYCOQC_NO_FILTER { path unfiltered_flagstat path filtered_flagstat path seq_summary - val quality_score + val qscore_thresh output: val ("${id}"), emit: id @@ -28,7 +28,7 @@ process PYCOQC_NO_FILTER { pycoQC -f "${seq_summary}" \ -v \ -a "${total_bam}" \ - --min_pass_qual "${quality_score}" \ + --min_pass_qual "${qscore_thresh}" \ -o "./${id}-Unfiltered_pycoqc.html" \ -j "./${id}-Unfiltered_pycoqc.json" 
""" @@ -45,7 +45,7 @@ process PYCOQC_FILTER { path unfiltered_flagstat path filtered_flagstat path seq_summary - val quality_score + val qscore_thresh path unfiltered_pyco_json output: @@ -61,7 +61,7 @@ process PYCOQC_FILTER { pycoQC -f "${seq_summary}" \ -v \ -a "${filtered_bam}" \ - --min_pass_qual "${quality_score}" \ + --min_pass_qual "${qscore_thresh}" \ -o "./${id}-Filtered_pycoqc.html" \ -j "./${id}-Filtered_pycoqc.json" """ diff --git a/src/nextflow.config b/src/nextflow.config index 7b511f5..4b2d94f 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -4,34 +4,36 @@ // pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) params { + // Project name (used to identify which project you're working on) + project_name = "default" // Input reference fasta file - ref = 'None' + reference_file = "None" // Step of pipeline to execute - step = 'None' + step = "None" // Output directory for pipeline results - out_dir = "output_directory/" + out_dir = "results_${params.project_name}/" // directory of basecalling data - basecall_path = 'None' + basecall_path = "None" // MAPQ filtering threshold for bam files, 0 for no filtering mapq = "10" // Quality score threshold qscore_thresh = "9" - // Desired basecall speed - basecall_speed = "hac" - // Desired basecaller modifications - basecall_mods = false + // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available) + basecall_speed = "sup@latest" + // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA; can't use more than one modification per nucleotide) + basecall_mods = "5mC_5hmC" + // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling) + barcoding_kit = "SQK-RBK114-24" // Threshold for mapped reasds min_mapped_reads_thresh = 500 - // Desired basecall configuration + // Desired basecall model version as a path (e.g. 
./models/dna_r10.4.1_e8.2_400bps_sup@v5.2.0) basecall_config = "None" - // Type of read trimming during basecalling ("all", "primers", "adapters", "none") - basecall_trim = "none" + // Type of read trimming during basecalling ("all", "primers", "adapters", "none"); You should change to "none" if you don't want to trim in the basecalling + basecall_trim = "all" // Basecalling demultiplexing basecall_demux = false - // CPU vs GPU basecalling - basecall_compute = "gpu" - // Trim barcodes (only counts if demultiplexing is enabled) - trim_barcode = "True" + // Barcodes were trimmed? (if True = demux will only separate the files; if False = demux will trim after basecalling and separate them) + trimmed_barcodes = "True" // Add prefix to all output files prefix = "None" // Which GPU devices to use for basecalling? @@ -73,3 +75,37 @@ apptainer { enabled = true pullTimeout = '60m' } + +profiles { + Basecall_Human_Blood { + params { + project_name = "humanTest" + step = 1 + basecall_path = "./data/pod5" + reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" + qscore_thresh = "9" + basecall_speed = "sup@latest" + basecall_mods = "5mC_5hmC" + barcoding_kit = "SQK-RBK114-24" + basecall_config = "None" + basecall_trim = "all" + basecall_demux = false + trim_barcode = "True" + gpu_devices = "all" + out_dir = "results_humanTest/" + } + executor.queueSize = 1 + } + + QC_Human_Blood { + params { + project_name = "humanTest" + step = "2_from_step_1" + steps_2_and_3_input_directory = "./results/results_humanTest/" + min_mapped_reads_thresh = 500 + qscore_thresh = 9 + mapq = 10 + out_dir = "results_humanTest/" + } + } +} \ No newline at end of file diff --git a/src/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf index dfd2aff..d0da756 100755 --- a/src/sub_workflows/BASECALLING.nf +++ b/src/sub_workflows/BASECALLING.nf @@ -1,31 +1,28 @@ -include { FAST5_to_POD5 ; BASECALL ; BASECALL_DEMUX } from '../modules/basecall.nf' +include { FAST5_to_POD5 ; 
BASECALL } from '../modules/basecall.nf' workflow BASECALLING { take: pod5_path fast5_path - speed - modifications - config - trim - quality_score - trim_barcode - devices - ref - + basecall_speed + basecall_mods + basecall_config + basecall_trim + qscore_thresh + barcoding_kit + trimmed_barcodes + gpu_devices + reference_file + main: FAST5_to_POD5(fast5_path) pod5_path = FAST5_to_POD5.out.mix(pod5_path) - - if (params.basecall_demux == true) { - BASECALL_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - txts = BASECALL_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } - else { - BASECALL(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } - + + BASECALL(pod5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file) + bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + + emit: + bam_files = bams + txt_files = txts } diff --git a/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf index 90dc60c..8746425 100755 --- a/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf +++ b/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf @@ -8,7 +8,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW { take: input mapq - quality_score + qscore_thresh main: if (params.is_barcoded == true) { @@ -32,7 +32,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW { FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, - quality_score, + 
qscore_thresh, ) PYCOQC_FILTER( PYCOQC_NO_FILTER.out.id, @@ -41,7 +41,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW { PYCOQC_NO_FILTER.out.unfiltered_flagstat, PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, - quality_score, + qscore_thresh, PYCOQC_NO_FILTER.out.unfiltered_pyco_json, ) MAKE_QC_REPORT( @@ -51,6 +51,6 @@ workflow FILTERING_AND_QC_FROM_MINKNOW { PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, - quality_score, + qscore_thresh, ) } diff --git a/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf index 6f3a711..7edef55 100755 --- a/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf +++ b/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf @@ -6,13 +6,13 @@ include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' workflow FILTERING_AND_QC_FROM_STEP_1 { take: - bams - txts + bam_files + txt_files mapq - quality_score + qscore_thresh main: - FILTER_BAM(bams, txts, mapq) + FILTER_BAM(bam_files, txt_files, mapq) PYCOQC_NO_FILTER( FILTER_BAM.out.id, FILTER_BAM.out.total_bam, @@ -22,7 +22,7 @@ workflow FILTERING_AND_QC_FROM_STEP_1 { FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, - quality_score, + qscore_thresh, ) PYCOQC_FILTER( PYCOQC_NO_FILTER.out.id, @@ -31,7 +31,7 @@ workflow FILTERING_AND_QC_FROM_STEP_1 { PYCOQC_NO_FILTER.out.unfiltered_flagstat, PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, - quality_score, + qscore_thresh, PYCOQC_NO_FILTER.out.unfiltered_pyco_json, ) MAKE_QC_REPORT( @@ -41,6 +41,6 @@ workflow FILTERING_AND_QC_FROM_STEP_1 { PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, - quality_score, + qscore_thresh, ) } -- GitLab From 57c707e2b587b016d2d70779cb3397de88d0347a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Fri, 27 Jun 2025 14:32:49 -0300 Subject: [PATCH 2/4] up: nextflow.config with modkit profile --- 
src/nextflow.config | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/nextflow.config b/src/nextflow.config index 4b2d94f..3399573 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -108,4 +108,14 @@ profiles { out_dir = "results_humanTest/" } } + + Modkit_Human_Blood { + params { + project_name = "humanTest" + step = 3 + steps_2_and_3_input_directory = "./results/results_humanTest/" + multiqc_config = "./references/multiqc_config.yaml" + out_dir = "results_humanTest/" + } + } } \ No newline at end of file -- GitLab From f8ce0a316cafe0179db77f1a41359adb3bf822d2 Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Mon, 30 Jun 2025 11:24:08 -0300 Subject: [PATCH 3/4] up: gpu comments + profiles usage --- src/nextflow.config | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/nextflow.config b/src/nextflow.config index 3399573..7616a1f 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -60,6 +60,8 @@ process { // Define local gpu execution withLabel: gpu { executor='local' + // --nv default flag to run CUDA applications (can be enabled by default in /etc/apptainer/apptainer.conf) + // --nvccli uses the nvidia-container-cli (see 'https://apptainer.org/docs/user/main/gpu.html#nvidia-gpus-cuda-nvidia-container-cli') containerOptions = '--nv --nvccli' } // Define the container for every process @@ -77,9 +79,9 @@ apptainer { } profiles { - Basecall_Human_Blood { + human_blood_basecall { params { - project_name = "humanTest" + project_name = "human_blood" step = 1 basecall_path = "./data/pod5" reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" @@ -92,30 +94,30 @@ profiles { basecall_demux = false trim_barcode = "True" gpu_devices = "all" - out_dir = "results_humanTest/" + out_dir = "results_human_blood/" } executor.queueSize = 1 } - QC_Human_Blood { + human_blood_qc { params { - project_name = "humanTest" + project_name = "results_human_blood" step = "2_from_step_1" - 
steps_2_and_3_input_directory = "./results/results_humanTest/" + steps_2_and_3_input_directory = "./results/results_human_blood/" min_mapped_reads_thresh = 500 qscore_thresh = 9 mapq = 10 - out_dir = "results_humanTest/" + out_dir = "results_human_blood/" } } - Modkit_Human_Blood { + human_blood_modkit { params { - project_name = "humanTest" + project_name = "results_human_blood" step = 3 - steps_2_and_3_input_directory = "./results/results_humanTest/" + steps_2_and_3_input_directory = "./results/results_human_blood/" multiqc_config = "./references/multiqc_config.yaml" - out_dir = "results_humanTest/" + out_dir = "results_human_blood/" } } } \ No newline at end of file -- GitLab From 9d836bda38c907671ec96ba5e03d272e919e57a4 Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Mon, 30 Jun 2025 12:31:01 -0300 Subject: [PATCH 4/4] fix: docs and nextflow main config fix typos and missing parameters in the documentation. remove redundant parameters in the human_blood profiles (most were the same as the default ones). --- README.md | 90 ++++++++++++++++++++++----------------------- src/nextflow.config | 29 +++------------ 2 files changed, 50 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 5b85206..622eb06 100755 --- a/README.md +++ b/README.md @@ -83,9 +83,15 @@ NextFlow pipeline used by the Developmental Cognitive Neuroscience Lab (DCNL) to ## Pipeline parameters -### Step 1: Basecalling +The default values for all parameters are set in `src/nextflow.config`. Please note that a few of them must be overridden because they depend on the procedure you need to run (e.g., `--step`). The specifics of each are described next. -Many of the parameters for this step are based on dorado basecaller, see their [documentation](https://github.com/nanoporetech/dorado) to understand it better. 
+### Global options + +```txt +--project_name + + +``` ```txt --step @@ -94,95 +100,99 @@ Many of the parameters for this step are based on dorado basecaller, see their [ ``` ```txt ---basecall_path +--out_dir - +/" in the directory you submitted the pipeline from. Default: "results_/"> ``` ```txt ---basecall_speed +--steps_2_and_3_input_directory - +". Default = "None"> ``` ```txt ---basecall_mods +--prefix - + ``` +### Step 1: Basecalling + +Many of the parameters for this step are based on dorado basecaller, see their [documentation](https://github.com/nanoporetech/dorado) to understand it better. + ```txt ---basecall_compute +--basecall_path - + ``` ```txt ---basecall_config +--basecall_speed - + ``` ```txt ---basecall_trim +--basecall_mods - + ``` ```txt ---qscore_thresh +--basecall_compute - + ``` ```txt ---demux +--basecall_config - + ``` ```txt ---trim_barcodes +--basecall_trim - + ``` ```txt ---gpu_devices +--barcoding_kit - ``` ```txt ---prefix +--qscore_thresh - + ``` ```txt ---out_dir +--basecall_demux -/" in the directory you submitted the pipeline from. Default: "output_directory"> + ``` -### Step 2: Alignment Filtering and Quality Control - ```txt ---step +--trimmed_barcodes - + ``` ```txt ---steps_2_and_3_input_directory +--gpu_devices -". Default = "None"> + + ``` ```txt @@ -205,22 +215,10 @@ Many of the parameters for this step are based on dorado basecaller, see their [ ### Step 3: Methylation Calling and MultiQC -```txt ---step - - -``` - -```txt ---steps_2_and_3_input_directory - -". Default = "None"> -``` - ```txt --multiqc_config - + ``` [top](#table-of-contents) diff --git a/src/nextflow.config b/src/nextflow.config index 7616a1f..2b0189a 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -1,7 +1,5 @@ -// CONFIGURATION FILE - -// Pipeline parameter default values, can be modified by user when calling -// pipeline on command line (e.g. 
--ont_reads_fq sample_1.fastq) +// MAIN CONFIGURATION FILE +// see src/configs for other parameters params { // Project name (used to identify which project you're working on) @@ -13,14 +11,14 @@ params { // Output directory for pipeline results out_dir = "results_${params.project_name}/" // directory of basecalling data - basecall_path = "None" + basecall_path = "./data" // MAPQ filtering threshold for bam files, 0 for no filtering mapq = "10" // Quality score threshold qscore_thresh = "9" // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available) basecall_speed = "sup@latest" - // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA; can't use more than one modification per nucleotide) + // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA). Can't use more than one modification per nucleotide. basecall_mods = "5mC_5hmC" // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling) barcoding_kit = "SQK-RBK114-24" @@ -41,7 +39,7 @@ params { // Previous results steps_2_and_3_input_directory = "None" // MultiQC config - multiqc_config = "None" + multiqc_config = "./references/multiqc_config.yaml" // Are the files from MinKNOW barcoded or not is_barcoded = true } @@ -79,44 +77,29 @@ apptainer { } profiles { + // Human Blood human_blood_basecall { params { project_name = "human_blood" step = 1 basecall_path = "./data/pod5" reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" - qscore_thresh = "9" - basecall_speed = "sup@latest" - basecall_mods = "5mC_5hmC" - barcoding_kit = "SQK-RBK114-24" - basecall_config = "None" - basecall_trim = "all" - basecall_demux = false - trim_barcode = "True" - gpu_devices = "all" out_dir = "results_human_blood/" } - executor.queueSize = 1 } - human_blood_qc { params { project_name = "results_human_blood" step = "2_from_step_1" steps_2_and_3_input_directory = "./results/results_human_blood/" - 
min_mapped_reads_thresh = 500 - qscore_thresh = 9 - mapq = 10 out_dir = "results_human_blood/" } } - human_blood_modkit { params { project_name = "results_human_blood" step = 3 steps_2_and_3_input_directory = "./results/results_human_blood/" - multiqc_config = "./references/multiqc_config.yaml" out_dir = "results_human_blood/" } } -- GitLab