From 8274539259af3fb01ae4b92625d26977bcb79010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Tue, 27 May 2025 18:32:08 -0300 Subject: [PATCH 1/5] up: basecalling.nf code optimization - removed CPU process from basecalling since it was not needed. --- .gitignore | 1 + src/modules/basecall.nf | 104 ------------------------------- src/sub_workflows/BASECALLING.nf | 34 ++++------ 3 files changed, 12 insertions(+), 127 deletions(-) diff --git a/.gitignore b/.gitignore index 78a43a6..fd17af6 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ work *.csv *.zip *.gz +*.sif \ No newline at end of file diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf index e1da10c..7a9408f 100755 --- a/src/modules/basecall.nf +++ b/src/modules/basecall.nf @@ -14,109 +14,6 @@ process FAST5_to_POD5 { """ } -process BASECALL_CPU { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - label 'cpu' - - input: - tuple val(id), path(pod5_dir) - val speed - val mods - val config - val trim - val qscore - val devices - path ref - - output: - path ("*.bam"), emit: bam - path ("*.txt"), emit: txt - - script: - """ - if [[ "${config}" == "false" ]]; then - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - fi - else - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . -x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . 
-x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - fi - fi - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" - rm "${id}.bam" - mv "${id}_sorted.bam" "${id}.bam" - dorado summary "${id}.bam" > "${id}.txt" - """ -} - -process BASECALL_CPU_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - label 'cpu' - - input: - tuple val(id), path(pod5_dir) - val speed - val mods - val config - val trim - val qscore - val trim_barcode - val devices - path ref - - output: - path ("*.bam"), emit: bam - path ("*.txt"), emit: txt - - script: - """ - if [[ "${config}" == "false" ]]; then - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - fi - else - if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . -x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . 
-x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - - fi - fi - - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" - rm "${id}.bam" - mv "${id}_sorted.bam" "${id}.bam" - - if [[ "${trim_barcode}" == "true" ]]; then - dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" - else - dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" - fi - - cd ./demux_data/ - for file in *; do - samtools sort -@ -12 "\$file" -o "${id}_\${file}" - rm "\$file" - done - - cd ../ - rm "${id}.bam" - mv ./demux_data/* ./ - rm -r ./demux_data/ - for file in *.bam; do - new_id="\${file%%.*}" - dorado summary "\$file" > "\${new_id}.txt" - done - """ -} - process BASECALL_GPU { publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' @@ -158,7 +55,6 @@ process BASECALL_GPU { """ } - process BASECALL_GPU_DEMUX { publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' diff --git a/src/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf index 1e0a62f..bc5bd31 100755 --- a/src/sub_workflows/BASECALLING.nf +++ b/src/sub_workflows/BASECALLING.nf @@ -1,4 +1,4 @@ -include { FAST5_to_POD5 ; BASECALL_CPU ; BASECALL_CPU_DEMUX ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf' +include { FAST5_to_POD5 ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf' workflow BASECALLING { take: @@ -16,28 +16,16 @@ workflow BASECALLING { main: FAST5_to_POD5(fast5_path) pod5_path = FAST5_to_POD5.out.mix(pod5_path) - if (params.basecall_compute?.equalsIgnoreCase("cpu")) { - if (params.basecall_demux == true) { - BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> 
b.baseName }.flatten() - } - else { - BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } + + if (params.basecall_demux == true) { + BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } - else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { - if (params.basecall_demux == true) { - BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } - else { - BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } + else { + BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } + } -- GitLab From fea1474769abcd6446d2213c6189badd1ded5584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Sat, 7 Jun 2025 13:47:39 -0300 Subject: [PATCH 2/5] fixed conventions in BASECALL and BASECALL_DEMUX processes - added echo logs for each step --- src/modules/basecall.nf | 25 +++++++++++++++++++------ 
src/sub_workflows/BASECALLING.nf | 14 +++++++------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf index 7a9408f..2405f67 100755 --- a/src/modules/basecall.nf +++ b/src/modules/basecall.nf @@ -14,7 +14,7 @@ process FAST5_to_POD5 { """ } -process BASECALL_GPU { +process BASECALL { publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' @@ -34,6 +34,7 @@ process BASECALL_GPU { script: """ + echo "Basecalling started for: ${id}" if [[ "${config}" == "false" ]]; then if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" @@ -48,14 +49,18 @@ process BASECALL_GPU { fi fi + echo "Basecalling completed, sorting bams..." samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" + + echo "Bams sorted, generating summary with dorado..." dorado summary "${id}.bam" > "${id}.txt" + echo "Process completed for: ${id}" """ } -process BASECALL_GPU_DEMUX { +process BASECALL_DEMUX { publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' @@ -76,6 +81,7 @@ process BASECALL_GPU_DEMUX { script: """ + echo "Demultiplexed basecalling started for: ${id}" if [[ "${config}" == "false" ]]; then if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" @@ -90,22 +96,28 @@ process BASECALL_GPU_DEMUX { fi fi + echo "Basecalling completed, sorting bams..." samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - + + echo "Bams sorted, demultiplexing..." if [[ "${trim_barcode}" == "true" ]]; then + echo "Demultiplexing with barcode trimming..." 
dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else + echo "Demultiplexing without barcode trimming..." dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" fi - + + echo "Demultiplexing completed, sorting barcode files..." cd ./demux_data/ for file in *; do samtools sort -@ -12 "\$file" -o "${id}_\${file}" - rm "\$file" + rm "\$file" done + echo "Bams sorted, generating summary with dorado..." cd ../ rm "${id}.bam" mv ./demux_data/* ./ @@ -114,5 +126,6 @@ process BASECALL_GPU_DEMUX { new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done + echo "Process completed for: ${id}" """ -} +} \ No newline at end of file diff --git a/src/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf index bc5bd31..dfd2aff 100755 --- a/src/sub_workflows/BASECALLING.nf +++ b/src/sub_workflows/BASECALLING.nf @@ -1,4 +1,4 @@ -include { FAST5_to_POD5 ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf' +include { FAST5_to_POD5 ; BASECALL ; BASECALL_DEMUX } from '../modules/basecall.nf' workflow BASECALLING { take: @@ -18,14 +18,14 @@ workflow BASECALLING { pod5_path = FAST5_to_POD5.out.mix(pod5_path) if (params.basecall_demux == true) { - BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + BASECALL_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + txts = BASECALL_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } else { - BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] 
}.flatten().buffer(size: 2) - txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + BASECALL(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } } -- GitLab From 6fad528590822bfcbc2534570bc3f8ff18629da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Sat, 7 Jun 2025 14:57:13 -0300 Subject: [PATCH 3/5] updated dorado to v1.0.1 --- containers/debian-nanopore.def | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/containers/debian-nanopore.def b/containers/debian-nanopore.def index c48acff..e55c1c1 100644 --- a/containers/debian-nanopore.def +++ b/containers/debian-nanopore.def @@ -49,10 +49,10 @@ From: debian:12 # Install Dorado cd /opt mkdir dorado && cd dorado - wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.9.6-linux-x64.tar.gz - tar -xzf dorado-0.9.6-linux-x64.tar.gz - rm dorado-0.9.6-linux-x64.tar.gz - echo 'export PATH="/opt/dorado/dorado-0.9.6-linux-x64/bin/:$PATH"' >> "$SINGULARITY_ENVIRONMENT" + wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-1.0.1-linux-x64.tar.gz + tar -xzf dorado-1.0.1-linux-x64.tar.gz + rm dorado-1.0.1-linux-x64.tar.gz + echo 'export PATH="/opt/dorado/dorado-1.0.1-linux-x64/bin/:$PATH"' >> "$SINGULARITY_ENVIRONMENT" %test # Check if installations are on path and display their versions @@ -67,12 +67,12 @@ From: debian:12 %labels author Joao Henrique Chrusciel - version v0.4.0 + version v0.5.0 %help Software included in the container are: - dorado==0.9.6 + dorado==1.0.1 modkit==0.4.4 samtools==1.16.1 pod5==0.3.23 -- GitLab From 7498e42e1ec518e7381a5bf4ee54a7e80aeecdf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Sun, 8 Jun 2025 18:32:10 -0300 Subject: [PATCH 4/5] fixed GPU access issue by adding 
--nvccli --- src/nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nextflow.config b/src/nextflow.config index 4c73a49..7b511f5 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -58,7 +58,7 @@ process { // Define local gpu execution withLabel: gpu { executor='local' - containerOptions = '--nv' + containerOptions = '--nv --nvccli' } // Define the container for every process container = "./images/debian-nanopore.sif" -- GitLab From bd18ebaa36e043e4e7c8fb4558b8d863c1027cd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Chrusciel?= Date: Fri, 20 Jun 2025 09:19:16 -0300 Subject: [PATCH 5/5] up: README.md and basecall.nf - added the nvidia-container-toolkit dependency info in README.md - modified basecall.nf to improve output handling --- README.md | 8 +++++++- src/modules/basecall.nf | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ee239bc..5b85206 100755 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ NextFlow pipeline used by the Developmental Cognitive Neuroscience Lab (DCNL) to sudo apt install git lz4 ``` - - Install Java: install either [OpenJRE/JDK][openjava] (**recommended, see below**) or [OracleJRE/JDK][oraclejava]. to install both openjre and openjdk using Debian/Ubuntu: + - Install Java: install either [OpenJRE/JDK][openjava] (**recommended, see below**) or [OracleJRE/JDK][oraclejava]. To install both openjre and openjdk using Debian/Ubuntu: ```sh sudo apt install default-jre default-jdk @@ -30,6 +30,11 @@ NextFlow pipeline used by the Developmental Cognitive Neuroscience Lab (DCNL) to - Install [NextFlow][nextflow-docs-install] (skip Java installation) - Install [Apptainer][apptainer-docs-install-deb] + - Install [NVIDIA Container Toolkit][nvidia-container-toolkit]: This enables Apptainer to access your GPU during basecalling. 
After installing it, to know if you have GPU support inside the container you can download a test container and check it by running: + + ```sh + apptainer pull docker://nvidia/cuda:12.2.0-base-ubuntu22.04 && apptainer exec --nvccli cuda_12.2.0-base-ubuntu22.04.sif nvidia-smi + ``` 1. Check that all dependencies are accessible via your users `$PATH`: @@ -72,6 +77,7 @@ NextFlow pipeline used by the Developmental Cognitive Neuroscience Lab (DCNL) to [oraclejava]:https://www.java.com/en/download/linux_manual.jsp [nextflow-docs-install]:https://www.nextflow.io/docs/latest/install.html#install-nextflow [apptainer-docs-install-deb]:https://apptainer.org/docs/admin/main/installation.html#install-debian-packages +[nvidia-container-toolkit]:https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html [top](#table-of-contents) diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf index 2405f67..18efc8b 100755 --- a/src/modules/basecall.nf +++ b/src/modules/basecall.nf @@ -29,8 +29,8 @@ process BASECALL { path ref output: - path ("*.bam"), emit: bam - path ("*.txt"), emit: txt + path ("${id}.bam"), emit: bam + path ("${id}.txt"), emit: txt script: """ @@ -76,8 +76,8 @@ process BASECALL_DEMUX { path ref output: - path ("*.bam"), emit: bam - path ("*.txt"), emit: txt + path ("${id}_*.bam"), emit: bam + path ("${id}_*.txt"), emit: txt script: """ -- GitLab