From c64fa5de224eb5c7557614e4902cc0d235904ff4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= <chruscieljoao@gmail.com>
Date: Tue, 24 Jun 2025 18:14:24 -0300
Subject: [PATCH 1/4] up: basecalling, nextflow.config, nextflow parameters and
 readability

 - Updated multiple variables in nextflow.config to match them across the different processes and workflows;
 - Added profiles in nextflow to facilitate the steps;
 - Added --kit-name configuration (params.barcoding_kit) to improve trimming during basecalling;
 - Changed descriptions across variables in nextflow.config;
 - Improved output directory name to include the project's name (params.project_name);
 - Updated params.basecall_speed to use latest available simplex/modification model for basecalling;
 - Reorganized demux process to be included in the basecalling process to improve code readability;
 - Corrected a few typos.
---
 .gitignore                                    |   1 +
 src/main.nf                                   |  32 ++---
 src/modules/basecall.nf                       | 121 +++++++-----------
 src/modules/convert_input_from_minknow.nf     |   8 +-
 src/modules/pycoqc.nf                         |   8 +-
 src/nextflow.config                           |  66 +++++++---
 src/sub_workflows/BASECALLING.nf              |  41 +++---
 .../FILTERING_AND_QC_FROM_MINKNOW.nf          |   8 +-
 .../FILTERING_AND_QC_FROM_STEP_1.nf           |  14 +-
 9 files changed, 153 insertions(+), 146 deletions(-)

diff --git a/.gitignore b/.gitignore
index fd17af6..12d23bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # project
 data
+models
 dataset
 images
 results
diff --git a/src/main.nf b/src/main.nf
index 06e3012..d38bd5e 100755
--- a/src/main.nf
+++ b/src/main.nf
@@ -19,9 +19,9 @@ workflow {
         basecall read trimming option                       : ${params.basecall_trim}
         basecall quality score threshold for basecalling    : ${params.qscore_thresh}
         basecall demultiplexing                             : ${params.basecall_demux}
-        trim barcodes during demultiplexing                 : ${params.trim_barcode}
+        trim barcodes during demultiplexing                 : ${params.trimmed_barcodes}
         submission output file prefix                       : ${params.prefix}
-        GPU device for submission   						: ${params.gpu_devices}
+        GPU device for submission                           : ${params.gpu_devices}
         Output directory                                    : ${params.out_dir}
         =================================================================
         """ 
@@ -47,7 +47,7 @@ workflow {
         ===============================================
         """
     } else {
-        println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE"
+        println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://gmapsrv.pucrs.br/gitlab/ccd-public/nanopore"
         System.exit(1)
     }
     // Set initial files and channels
@@ -59,25 +59,25 @@ workflow {
             fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
             pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" +  file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
         }
-        ref = file(params.ref)
-        quality_score = Channel.value(params.qscore_thresh)
         basecall_speed = Channel.value(params.basecall_speed)
         basecall_mods = Channel.value(params.basecall_mods)
         basecall_config = Channel.value(params.basecall_config)
         basecall_trim = Channel.value(params.basecall_trim)
-        // basecall_compute = Channel.value(params.basecall_compute)
-        trim_barcode = Channel.value(params.trim_barcode)
-        devices = Channel.value(params.gpu_devices)
+        qscore_thresh = Channel.value(params.qscore_thresh)
+        barcoding_kit = Channel.value(params.barcoding_kit)
+        trimmed_barcodes = Channel.value(params.trimmed_barcodes)
+        gpu_devices = Channel.value(params.gpu_devices)
+        reference_file = file(params.reference_file)
     } else if (params.step.toString() == "2_from_step_1") {
-        total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) 
-        txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
+        bam_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) 
+        txt_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
         mapq = Channel.value(params.mapq)
-        quality_score = Channel.value(params.qscore_thresh)
+        qscore_thresh = Channel.value(params.qscore_thresh)
     } else if (params.step.toString() == "2_from_minknow") {
         input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/")
         mapq = Channel.value(params.mapq)
-        quality_score = Channel.value(params.qscore_thresh)
-    } else if (params.step.toString() == "3") {    
+        qscore_thresh = Channel.value(params.qscore_thresh)
+    } else if (params.step.toString() == "3") {
         filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) 
         filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() 
         num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*")
@@ -88,11 +88,11 @@ workflow {
     }
     // Run steps
     if (params.step.toString() == "1") {
-        BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref)
+        BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file)
     } else if (params.step.toString() == "2_from_step_1") {
-        FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score)
+        FILTERING_AND_QC_FROM_STEP_1(bam_files, txt_files, mapq, qscore_thresh)
     } else if (params.step.toString() == "2_from_minknow") {
-        FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score)
+        FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh)
     } else if (params.step.toString()== "3") {
         MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input)
     }
diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf
index 18efc8b..a59fb78 100755
--- a/src/modules/basecall.nf
+++ b/src/modules/basecall.nf
@@ -10,7 +10,10 @@ process FAST5_to_POD5 {
 
     script:
         """
-        pod5 convert fast5 *.fast5 --output . --one-to-one . --threads 12
+        pod5 convert fast5 *.fast5 \
+        --output . \
+        --one-to-one . \
+        --threads 12
         """
 }
 
@@ -20,112 +23,82 @@ process BASECALL {
 
     input:
         tuple val(id), path(pod5_dir)
-        val speed
-        val mods
-        val config
-        val trim
-        val qscore
-        val devices
-        path ref
+        val basecall_speed
+        val basecall_mods
+        val basecall_config
+        val basecall_trim
+        val qscore_thresh
+        val barcoding_kit
+        val trimmed_barcodes
+        val gpu_devices
+        path reference_file
 
     output:
-        path ("${id}.bam"), emit: bam
-        path ("${id}.txt"), emit: txt
+        path ("*.bam"), emit: bam
+        path ("*.txt"), emit: txt
 
     script:
         """
         echo "Basecalling started for: ${id}"
-        if [[ "${config}" == "false" ]]; then    
-            if [[ "${mods}" == "false" ]]; then 
-                dorado basecaller "${speed}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" 
+        if [[ "${basecall_config}" == "None" ]]; then
+            if [[ "${basecall_mods}" == "None" ]]; then
+                dorado basecaller "${basecall_speed}" . \
+                ${barcoding_kit != "None" ? "--kit-name ${barcoding_kit}" : ""} \
+                --trim "${basecall_trim}" \
+                --min-qscore "${qscore_thresh}" \
+                --reference "${reference_file}" \
+                --device "cuda:${gpu_devices}" > "${id}.bam" 
             else
-                dorado basecaller "${speed},${mods}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
+                dorado basecaller "${basecall_speed},${basecall_mods}" . \
+                ${barcoding_kit != "None" ? "--kit-name ${barcoding_kit}" : ""} \
+                --trim "${basecall_trim}" \
+                --min-qscore "${qscore_thresh}" \
+                --reference "${reference_file}" \
+                --device "cuda:${gpu_devices}" > "${id}.bam"
             fi
         else
-            if [[ "${mods}" == "false" ]]; then
-                dorado basecaller "${speed}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            else
-                dorado basecaller "${speed},${mods}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            fi
+                dorado basecaller "${basecall_config}" . \
+                ${barcoding_kit != "None" ? "--kit-name ${barcoding_kit}" : ""} \
+                --trim "${basecall_trim}" \
+                --min-qscore "${qscore_thresh}" \
+                --reference "${reference_file}" \
+                --device "cuda:${gpu_devices}" > "${id}.bam"
         fi
 
         echo "Basecalling completed, sorting bams..."
-        samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam"
+        samtools sort -@ 12 "${id}.bam" -o "${id}_sorted.bam"
         rm "${id}.bam"
         mv "${id}_sorted.bam" "${id}.bam"
 
-        echo "Bams sorted, generating summary with dorado..."
-        dorado summary "${id}.bam" > "${id}.txt"
-        echo "Process completed for: ${id}"
-        """
-}
-
-process BASECALL_DEMUX {
-    publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true
-    label 'gpu'
-
-    input:
-        tuple val(id), path(pod5_dir)
-        val speed
-        val mods
-        val config
-        val trim
-        val qscore
-        val trim_barcode
-        val devices
-        path ref
-
-    output:
-        path ("${id}.bam"), emit: bam
-        path ("${id}.txt"), emit: txt
-
-    script:
-        """
-        echo "Demultiplexed basecalling started for: ${id}"
-        if [[ "${config}" == "false" ]]; then
-            if [[ "${mods}" == "false" ]]; then
-                dorado basecaller "${speed}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            else
-                dorado basecaller "${speed},${mods}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            fi
-        else
-            if [[ "${mods}" == "false" ]]; then
-                dorado basecaller "${speed}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            else
-                dorado basecaller "${speed},${mods}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam"
-            fi
-        fi
-
-        echo "Basecalling completed, sorting bams..."
-	    samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam"
-    	rm "${id}.bam"
-    	mv "${id}_sorted.bam" "${id}.bam"
-
         echo "Bams sorted, demultiplexing..."
-        if [[ "${trim_barcode}" == "true" ]]; then
+        if [[ "${trimmed_barcodes}" == "True" ]]; then
             echo "Demultiplexing with barcode trimming..."
             dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam"
         else
-            echo "Demultiplexing without barcode trimming..."
-            dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam"
+            echo "Demultiplexing and barcode trimming..."
+            dorado demux --trim "${basecall_trim}" --output-dir "./demux_data/" "${id}.bam"
         fi
         
         echo "Demultiplexing completed, sorting barcode files..."
         cd ./demux_data/
         for file in *; do
-            samtools sort -@ -12 "\$file" -o "${id}_\${file}"
+            samtools sort -@ 12 "\$file" -o "${id}_\$file"
             rm "\$file"
         done
-        
-        echo "Bams sorted, generating summary with dorado..."
+
         cd ../
-        rm "${id}.bam"
         mv ./demux_data/* ./
         rm -r ./demux_data/
+        
+        echo "Bams sorted, generating summary with dorado..."
         for file in *.bam; do
             new_id="\${file%%.*}"
             dorado summary "\$file" > "\${new_id}.txt"
         done
         echo "Process completed for: ${id}"
+
+        # echo "Bams sorted, generating summary with dorado..."
+        # dorado summary "${id}.bam" > "${id}.txt"
+        # echo "Process completed for: ${id}"
         """
 }
\ No newline at end of file
diff --git a/src/modules/convert_input_from_minknow.nf b/src/modules/convert_input_from_minknow.nf
index 5815965..97f978e 100755
--- a/src/modules/convert_input_from_minknow.nf
+++ b/src/modules/convert_input_from_minknow.nf
@@ -6,8 +6,8 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED {
         path input
 
     output:
-        path "*.bam", emit: bam
-        path "*.txt", emit: txt
+        path ("*.bam"), emit: bam
+        path ("*.txt"), emit: txt
 
     script:
         """
@@ -50,8 +50,8 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED {
         path input
 
     output:
-        path "*.bam", emit: bam
-        path "*.txt", emit: txt
+        path ("*.bam"), emit: bam
+        path ("*.txt"), emit: txt
 
     script:
         """
diff --git a/src/modules/pycoqc.nf b/src/modules/pycoqc.nf
index be1ff1b..eb023ca 100755
--- a/src/modules/pycoqc.nf
+++ b/src/modules/pycoqc.nf
@@ -11,7 +11,7 @@ process PYCOQC_NO_FILTER {
         path unfiltered_flagstat
         path filtered_flagstat
         path seq_summary
-        val quality_score
+        val qscore_thresh
 
     output:
         val ("${id}"), emit: id
@@ -28,7 +28,7 @@ process PYCOQC_NO_FILTER {
         pycoQC -f "${seq_summary}" \
             -v \
             -a "${total_bam}" \
-            --min_pass_qual "${quality_score}" \
+            --min_pass_qual "${qscore_thresh}" \
             -o "./${id}-Unfiltered_pycoqc.html" \
             -j "./${id}-Unfiltered_pycoqc.json"
         """
@@ -45,7 +45,7 @@ process PYCOQC_FILTER {
         path unfiltered_flagstat
         path filtered_flagstat
         path seq_summary
-        val quality_score
+        val qscore_thresh
         path unfiltered_pyco_json
 
     output:
@@ -61,7 +61,7 @@ process PYCOQC_FILTER {
         pycoQC -f "${seq_summary}" \
             -v \
             -a "${filtered_bam}" \
-            --min_pass_qual "${quality_score}" \
+            --min_pass_qual "${qscore_thresh}" \
             -o "./${id}-Filtered_pycoqc.html" \
             -j "./${id}-Filtered_pycoqc.json"
         """
diff --git a/src/nextflow.config b/src/nextflow.config
index 7b511f5..4b2d94f 100755
--- a/src/nextflow.config
+++ b/src/nextflow.config
@@ -4,34 +4,36 @@
 // pipeline on command line (e.g. --ont_reads_fq sample_1.fastq)
 
 params {
+    // Project name (used to identify which project you're working on)
+    project_name = "default"
     // Input reference fasta file
-    ref = 'None' 
+    reference_file = "None" 
     // Step of pipeline to execute
-    step = 'None'
+    step = "None"
     // Output directory for pipeline results
-    out_dir = "output_directory/" 
+    out_dir = "results_${params.project_name}/" 
     // directory of basecalling data
-    basecall_path = 'None' 
+    basecall_path = "None" 
     // MAPQ filtering threshold for bam files, 0 for no filtering
     mapq = "10" 
     // Quality score threshold
     qscore_thresh = "9"
-    // Desired basecall speed 
-    basecall_speed = "hac"
-    // Desired basecaller modifications
-    basecall_mods = false
+    // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available)
+    basecall_speed = "sup@latest"
+    // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA; can't use more than one modification per nucleotide)
+    basecall_mods = "5mC_5hmC"
+    // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling)
+    barcoding_kit = "SQK-RBK114-24"
     // Threshold for mapped reasds
     min_mapped_reads_thresh = 500
-    // Desired basecall configuration
+    // Desired basecall model version as a path (e.g. ./models/dna_r10.4.1_e8.2_400bps_sup@v5.2.0)
     basecall_config = "None"
-    // Type of read trimming during basecalling ("all", "primers", "adapters", "none")
-    basecall_trim = "none"
+    // Type of read trimming during basecalling ("all", "primers", "adapters", "none"); You should change to "none" if you don't want to trim in the basecalling
+    basecall_trim = "all"
     // Basecalling demultiplexing
     basecall_demux = false
-    // CPU vs GPU basecalling
-    basecall_compute = "gpu"
-    // Trim barcodes (only counts if demultiplexing is enabled)
-    trim_barcode = "True"
+    // Barcodes were trimmed? (if True = demux will only separate the files; if False = demux will trim after basecalling and separate them)
+    trimmed_barcodes = "True"
     // Add prefix to all output files
     prefix = "None"
     // Which GPU devices to use for basecalling?
@@ -73,3 +75,37 @@ apptainer {
 	enabled = true
 	pullTimeout = '60m'
 }
+
+profiles {
+    Basecall_Human_Blood {
+        params {
+            project_name = "humanTest"
+            step = 1
+            basecall_path = "./data/pod5"
+            reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
+            qscore_thresh = "9"
+            basecall_speed = "sup@latest"
+            basecall_mods = "5mC_5hmC"
+            barcoding_kit = "SQK-RBK114-24"
+            basecall_config = "None"
+            basecall_trim = "all"
+            basecall_demux = false
+            trim_barcode = "True"
+            gpu_devices = "all"
+            out_dir = "results_humanTest/"
+        }
+        executor.queueSize = 1
+    }
+
+    QC_Human_Blood {
+        params {
+            project_name = "humanTest"
+            step = "2_from_step_1"
+            steps_2_and_3_input_directory = "./results/results_humanTest/"
+            min_mapped_reads_thresh = 500
+            qscore_thresh = 9
+            mapq = 10
+            out_dir = "results_humanTest/"
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf
index dfd2aff..d0da756 100755
--- a/src/sub_workflows/BASECALLING.nf
+++ b/src/sub_workflows/BASECALLING.nf
@@ -1,31 +1,28 @@
-include { FAST5_to_POD5 ; BASECALL ; BASECALL_DEMUX } from '../modules/basecall.nf'
+include { FAST5_to_POD5 ; BASECALL } from '../modules/basecall.nf'
 
 workflow BASECALLING {
     take:
         pod5_path
         fast5_path
-        speed
-        modifications
-        config
-        trim
-        quality_score
-        trim_barcode
-        devices
-        ref
-    
+        basecall_speed
+        basecall_mods
+        basecall_config
+        basecall_trim
+        qscore_thresh
+        barcoding_kit
+        trimmed_barcodes
+        gpu_devices
+        reference_file
+        
     main:
         FAST5_to_POD5(fast5_path)
         pod5_path = FAST5_to_POD5.out.mix(pod5_path)
-       
-        if (params.basecall_demux == true) {
-            BASECALL_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref)
-            bams = BASECALL_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
-            txts = BASECALL_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
-        }
-        else {
-            BASECALL(pod5_path, speed, modifications, config, trim, quality_score, devices, ref)
-            bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
-            txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
-        }
-
+        
+        BASECALL(pod5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit,  trimmed_barcodes, gpu_devices, reference_file)
+        bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
+        txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+        
+    emit:
+    bam_files = bams
+    txt_files = txts
 }
diff --git a/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf
index 90dc60c..8746425 100755
--- a/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf
+++ b/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf
@@ -8,7 +8,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW {
     take:
         input
         mapq
-        quality_score
+        qscore_thresh
 
     main:
         if (params.is_barcoded == true) {
@@ -32,7 +32,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW {
             FILTER_BAM.out.unfiltered_flagstat,
             FILTER_BAM.out.filtered_flagstat,
             FILTER_BAM.out.txt,
-            quality_score,
+            qscore_thresh,
         )
         PYCOQC_FILTER(
             PYCOQC_NO_FILTER.out.id,
@@ -41,7 +41,7 @@ workflow FILTERING_AND_QC_FROM_MINKNOW {
             PYCOQC_NO_FILTER.out.unfiltered_flagstat,
             PYCOQC_NO_FILTER.out.filtered_flagstat,
             PYCOQC_NO_FILTER.out.txt,
-            quality_score,
+            qscore_thresh,
             PYCOQC_NO_FILTER.out.unfiltered_pyco_json,
         )
         MAKE_QC_REPORT(
@@ -51,6 +51,6 @@ workflow FILTERING_AND_QC_FROM_MINKNOW {
             PYCOQC_FILTER.out.unfiltered_pyco_json,
             PYCOQC_FILTER.out.filtered_pyco_json,
             mapq,
-            quality_score,
+            qscore_thresh,
         )
 }
diff --git a/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf
index 6f3a711..7edef55 100755
--- a/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf
+++ b/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf
@@ -6,13 +6,13 @@ include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf'
 
 workflow FILTERING_AND_QC_FROM_STEP_1 {
     take:
-        bams
-        txts
+        bam_files
+        txt_files
         mapq
-        quality_score
+        qscore_thresh
 
     main:
-        FILTER_BAM(bams, txts, mapq)
+        FILTER_BAM(bam_files, txt_files, mapq)
         PYCOQC_NO_FILTER(
             FILTER_BAM.out.id,
             FILTER_BAM.out.total_bam,
@@ -22,7 +22,7 @@ workflow FILTERING_AND_QC_FROM_STEP_1 {
             FILTER_BAM.out.unfiltered_flagstat,
             FILTER_BAM.out.filtered_flagstat,
             FILTER_BAM.out.txt,
-            quality_score,
+            qscore_thresh,
         )
         PYCOQC_FILTER(
             PYCOQC_NO_FILTER.out.id,
@@ -31,7 +31,7 @@ workflow FILTERING_AND_QC_FROM_STEP_1 {
             PYCOQC_NO_FILTER.out.unfiltered_flagstat,
             PYCOQC_NO_FILTER.out.filtered_flagstat,
             PYCOQC_NO_FILTER.out.txt,
-            quality_score,
+            qscore_thresh,
             PYCOQC_NO_FILTER.out.unfiltered_pyco_json,
         )
         MAKE_QC_REPORT(
@@ -41,6 +41,6 @@ workflow FILTERING_AND_QC_FROM_STEP_1 {
             PYCOQC_FILTER.out.unfiltered_pyco_json,
             PYCOQC_FILTER.out.filtered_pyco_json,
             mapq,
-            quality_score,
+            qscore_thresh,
         )
 }
-- 
GitLab


From 57c707e2b587b016d2d70779cb3397de88d0347a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= <chruscieljoao@gmail.com>
Date: Fri, 27 Jun 2025 14:32:49 -0300
Subject: [PATCH 2/4] up: nextflow.config with modkit profile

---
 src/nextflow.config | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/nextflow.config b/src/nextflow.config
index 4b2d94f..3399573 100755
--- a/src/nextflow.config
+++ b/src/nextflow.config
@@ -108,4 +108,14 @@ profiles {
             out_dir = "results_humanTest/"
         }
     }
+
+    Modkit_Human_Blood {
+        params {
+            project_name = "humanTest"
+            step = 3
+            steps_2_and_3_input_directory = "./results/results_humanTest/"
+            multiqc_config = "./references/multiqc_config.yaml"
+            out_dir = "results_humanTest/"
+        }
+    }
 }
\ No newline at end of file
-- 
GitLab


From f8ce0a316cafe0179db77f1a41359adb3bf822d2 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <carlos.fagomes@gmail.com>
Date: Mon, 30 Jun 2025 11:24:08 -0300
Subject: [PATCH 3/4] up: gpu comments + profiles usage

---
 src/nextflow.config | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/nextflow.config b/src/nextflow.config
index 3399573..7616a1f 100755
--- a/src/nextflow.config
+++ b/src/nextflow.config
@@ -60,6 +60,8 @@ process {
     // Define local gpu execution
     withLabel: gpu {
         executor='local'
+        // --nv deafult flag to run CUDA applications (can be set on by deafult in /etc/apptainer/apptainer.conf)
+        // --nvccli uses the nvidia-container-cli (see 'https://apptainer.org/docs/user/main/gpu.html#nvidia-gpus-cuda-nvidia-container-cli')
         containerOptions = '--nv --nvccli'
     }
     // Define the container for every process
@@ -77,9 +79,9 @@ apptainer {
 }
 
 profiles {
-    Basecall_Human_Blood {
+    human_blood_basecall {
         params {
-            project_name = "humanTest"
+            project_name = "human_blood"
             step = 1
             basecall_path = "./data/pod5"
             reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
@@ -92,30 +94,30 @@ profiles {
             basecall_demux = false
             trim_barcode = "True"
             gpu_devices = "all"
-            out_dir = "results_humanTest/"
+            out_dir = "results_human_blood/"
         }
         executor.queueSize = 1
     }
 
-    QC_Human_Blood {
+    human_blood_qc {
         params {
-            project_name = "humanTest"
+            project_name = "results_human_blood"
             step = "2_from_step_1"
-            steps_2_and_3_input_directory = "./results/results_humanTest/"
+            steps_2_and_3_input_directory = "./results/results_human_blood/"
             min_mapped_reads_thresh = 500
             qscore_thresh = 9
             mapq = 10
-            out_dir = "results_humanTest/"
+            out_dir = "results_human_blood/"
         }
     }
 
-    Modkit_Human_Blood {
+    human_blood_modkit {
         params {
-            project_name = "humanTest"
+            project_name = "results_human_blood"
             step = 3
-            steps_2_and_3_input_directory = "./results/results_humanTest/"
+            steps_2_and_3_input_directory = "./results/results_human_blood/"
             multiqc_config = "./references/multiqc_config.yaml"
-            out_dir = "results_humanTest/"
+            out_dir = "results_human_blood/"
         }
     }
 }
\ No newline at end of file
-- 
GitLab


From 9d836bda38c907671ec96ba5e03d272e919e57a4 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <carlos.fagomes@gmail.com>
Date: Mon, 30 Jun 2025 12:31:01 -0300
Subject: [PATCH 4/4] fix: docs and nextflow main config

fix typos and missing parameters in the documentation.

remove redundant parameters in the human_blood profiles (most were
the same as the default ones).
---
 README.md           | 90 ++++++++++++++++++++++-----------------------
 src/nextflow.config | 29 +++------------
 2 files changed, 50 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index 5b85206..622eb06 100755
--- a/README.md
+++ b/README.md
@@ -83,9 +83,15 @@ NextFlow pipeline used by the Developmental Cognitive Neuroscience Lab (DCNL) to
 
 ## Pipeline parameters
 
-### Step 1: Basecalling
+The default values for all parameters are set in `src/nextflow.config`. Please note that a few of them must be overridden because they depend on the procedure you need to run (e.g., `--step`). The specifics of each are described next.
 
-Many of the parameters for this step are based on dorado basecaller, see their [documentation](https://github.com/nanoporetech/dorado) to understand it better.
+### Global options
+
+```txt
+--project_name
+
+<Type: String. A short name used to identify a project. Default: "default">
+```
 
 ```txt
 --step
@@ -94,95 +100,99 @@ Many of the parameters for this step are based on dorado basecaller, see their [
 ```
 
 ```txt
---basecall_path
+--out_dir
 
-<Type: Path. Path to base directory containing all fast5 and/or pod5 files you want to basecall. It will automatically separate samples based on naming conventions and  directory structure. example: /sequencing_run/">
+<Type: Path. Name of output directory. Output files/directories will be output to "./results/<out_dir>/" in the directory you submitted the pipeline from. Default: "results_<project_name>/">
 ```
 
 ```txt
---basecall_speed
+--steps_2_and_3_input_directory
 
-<Type: String. "fast", "hac", "sup". Default = "hac">
+<Type: Path. When performing a step other than 1, this parameter must be set to the output path of step 1. Example: "./results/<out_dir>". Default = "None">
 ```
 
 ```txt
---basecall_mods
+--prefix
 
-<Type: String. Comma separated list of base modifications you want to basecall. See dorado docs for more information. Example: "5mCG_5hmCG,5mC_5hmC,6mA". Default: "False">
+<Type: String. Adds a prefix to the beginning of your filenames, useful for keeping track of batches of data. Example: "Batch_1". Default: "None">
 ```
 
+### Step 1: Basecalling
+
+Many of the parameters for this step are based on dorado basecaller, see their [documentation](https://github.com/nanoporetech/dorado) to understand it better.
+
 ```txt
---basecall_compute
+--basecall_path
 
-<Type: String. "gpu", "cpu". Default: "gpu". Allows users to choose to basecall with  CPU or GPU. CPU basecalling is super slow and should only be used for small test datasets when GPUs are not available. Also, use --basecall_speed "fast" when basecalling with CPU to make it less slow. Default: "gpu">
+<Type: Path. Path to base directory containing all fast5 or pod5 files you want to basecall. It will automatically separate samples based on naming conventions and directory structure. Default: "./data">
 ```
 
 ```txt
---basecall_config
+--basecall_speed
 
-<Type: String: configuration name for basecalling setting. This is not necessary since dorado  is able to automatically determine the appropriate configuration. When set to "false" the basecaller will automatically pick the basecall configuration. Example: "dna_r9.4.1_e8_hac@v3.3". Default: "false">
+<Type: String. "fast", "hac", "sup"; append "@latest" to use the latest available model version. Default: "sup@latest">
 ```
 
 ```txt
---basecall_trim
+--basecall_mods
 
-<Type: String. "all", "primers", "adapters", "none". Default: "none">
+<Type: String. Comma separated list of base modifications you want to basecall. See dorado docs for more information. Please note that you can't use more than one modification per nucleotide. Options: "4mC_5mC", "5mCG_5hmCG", "5mC_5hmC", "6mA". Default: "5mC_5hmC">
 ```
 
 ```txt
---qscore_thresh
+--basecall_compute
 
-<Type: Integer. Mean quality score threshold for basecalled reads to be considered passing. Default: 9>
+<Type: String. "gpu", "cpu". Allows users to choose to basecall with CPU or GPU. CPU basecalling is super slow and should only be used for small test datasets when GPUs are not available. Also, use --basecall_speed "fast" when basecalling with CPU to make it less slow. Default: "gpu">
 ```
 
 ```txt
---demux
+--basecall_config
 
-<Type: Boolean. "True", "False". Whether you want the data to be demultiplexed setting it to "True" will perform demultiplexing. Default: "False">
+<Type: String. Configuration name for basecalling setting. This is not necessary since dorado is able to automatically determine the appropriate configuration. Default: "None">
 ```
 
 ```txt
---trim_barcodes
+--basecall_trim
 
-<Type: Boolean. "True", "False". Only relevant is --demux is set to "True". if set to "True" barcodes will be trimmed during demultiplexing and will not be present in output "fastq" files. Default: "False">
+<Type: String. Options: "all", "primers", "adapters", "none". Default: "all">
 ```
 
 ```txt
---gpu_devices
+--barcoding_kit
 
-<Type: String. Which gpu devices to use for basecalling. Only relevant when parameter "--basecall_compute" is set to "gpu". Use the "nvidia-smi" command to see available gpu devices and their current usage. Default: "all". Alternative: "0".  Second Alternative: "0,1,2".
+<Type: String. Kit name used to barcode the samples. Use "None" to skip --kit-name in basecalling. Default: "SQK-RBK114-24">
 ```
 
 ```txt
---prefix
+--qscore_thresh
 
-<Type: String. Will add a prefix to the beggining of your filenames, good when wanting to keep track of batches of data. Example: "Batch_1". Default value is "None" which does not add any prefixes>
+<Type: Integer. Mean quality score threshold for basecalled reads to be considered passing. Default: 9>
 ```
 
 ```txt
---out_dir
+--basecall_demux
 
-<Type: Path. Name of output directory. Output files/directories will be output to "./results/<out_dir>/" in the directory you submitted the pipeline from. Default: "output_directory">
+<Type: Boolean. "True", "False". Whether you want the data to be demultiplexed; setting it to "True" will perform demultiplexing. Default: false>
 ```
 
-### Step 2: Alignment Filtering and Quality Control
-
 ```txt
---step
+--trimmed_barcodes
 
-<same as before>
+<Type: Boolean. "True", "False". Only relevant if --basecall_demux is set to "True". If set to "True", barcodes will be trimmed during demultiplexing and will not be present in output "fastq" files. Default: "False">
 ```
 
 ```txt
---steps_2_and_3_input_directory
+--gpu_devices
 
-<This parameter must be set to the output path of step 1 set with the out_dir parameter. For example "./results/<out_dir>". Default = "None">
+<Type: String. Which gpu devices to use for basecalling. Only relevant when parameter "--basecall_compute" is set to "gpu". For troubleshooting, you can use the 'nvidia-smi' command to see all the available gpu devices. Options: "0", "0,1,2", ... . Default: "all">
 ```
 
+### Step 2: Alignment Filtering and Quality Control
+
 ```txt
 --qscore_thresh
 
-<Mean quality score threshold for basecalled reads to be considered passing. Should be set to the same value specified in step 1. Default: 9>
+<Type: Integer. Mean quality score threshold for basecalled reads to be considered passing. Should be set to the same value specified in step 1. Default: 9>
 ```
 
 ```txt
@@ -205,22 +215,10 @@ Many of the parameters for this step are based on dorado basecaller, see their [
 
 ### Step 3: Methylation Calling and MultiQC
 
-```txt
---step
-
-<same as before>
-```
-
-```txt
---steps_2_and_3_input_directory
-
-<Type: Path. This parameter must be set to the output path of step 1 set with the out_dir parameter. Must also be set to the same as the steps_2_and_3_input_directory for step 2. For example "./results/<out_dir>". Default = "None">
-```
-
 ```txt
 --multiqc_config
 
-<Type: Path. MultiQC configuration file. We provide a template that works well under "./references/multiqc_config.yaml" in this repository, but you are welcome to customize it as you see fit. Default: "None">
+<Type: Path. MultiQC configuration file. We provide a template that works well under "./references/multiqc_config.yaml" in this repository, but you are welcome to customize it as you see fit. Default: "./references/multiqc_config.yaml">
 ```
 
 [top](#table-of-contents)
diff --git a/src/nextflow.config b/src/nextflow.config
index 7616a1f..2b0189a 100755
--- a/src/nextflow.config
+++ b/src/nextflow.config
@@ -1,7 +1,5 @@
-// CONFIGURATION FILE
-
-// Pipeline parameter default values, can be modified by user when calling
-// pipeline on command line (e.g. --ont_reads_fq sample_1.fastq)
+// MAIN CONFIGURATION FILE
+// see src/configs for other parameters
 
 params {
     // Project name (used to identify which project you're working on)
@@ -13,14 +11,14 @@ params {
     // Output directory for pipeline results
     out_dir = "results_${params.project_name}/" 
     // directory of basecalling data
-    basecall_path = "None" 
+    basecall_path = "./data" 
     // MAPQ filtering threshold for bam files, 0 for no filtering
     mapq = "10" 
     // Quality score threshold
     qscore_thresh = "9"
     // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available)
     basecall_speed = "sup@latest"
-    // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA; can't use more than one modification per nucleotide)
+    // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA). Can't use more than one modification per nucleotide.
     basecall_mods = "5mC_5hmC"
     // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use "None" to skip --kit-name in basecalling)
     barcoding_kit = "SQK-RBK114-24"
@@ -41,7 +39,7 @@ params {
     // Previous results
     steps_2_and_3_input_directory = "None"
     // MultiQC config
-    multiqc_config = "None"
+    multiqc_config = "./references/multiqc_config.yaml"
     // Are the files from MinKNOW barcoded or not 
     is_barcoded = true
 }
@@ -79,44 +77,29 @@ apptainer {
 }
 
 profiles {
+    // Human Blood
     human_blood_basecall {
         params {
             project_name = "human_blood"
             step = 1
             basecall_path = "./data/pod5"
             reference_file = "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
-            qscore_thresh = "9"
-            basecall_speed = "sup@latest"
-            basecall_mods = "5mC_5hmC"
-            barcoding_kit = "SQK-RBK114-24"
-            basecall_config = "None"
-            basecall_trim = "all"
-            basecall_demux = false
-            trim_barcode = "True"
-            gpu_devices = "all"
             out_dir = "results_human_blood/"
         }
-        executor.queueSize = 1
     }
-
     human_blood_qc {
         params {
             project_name = "results_human_blood"
             step = "2_from_step_1"
             steps_2_and_3_input_directory = "./results/results_human_blood/"
-            min_mapped_reads_thresh = 500
-            qscore_thresh = 9
-            mapq = 10
             out_dir = "results_human_blood/"
         }
     }
-
     human_blood_modkit {
         params {
             project_name = "results_human_blood"
             step = 3
             steps_2_and_3_input_directory = "./results/results_human_blood/"
-            multiqc_config = "./references/multiqc_config.yaml"
             out_dir = "results_human_blood/"
         }
     }
-- 
GitLab