From 87d1f3a0006b42d3c124a94537c87ebbbba79661 Mon Sep 17 00:00:00 2001 From: Gabriela Carvalho Date: Thu, 24 Apr 2025 14:21:33 -0500 Subject: [PATCH 1/5] up: style fix removal of unnecesary lines breaks --- modules/basecall.nf | 166 +++++------------- modules/calculate_coverage.nf | 63 +++---- modules/convert_input_from_minknow.nf | 19 +- modules/filter_bam.nf | 40 ++--- modules/modkit.nf | 64 +++---- modules/multiqc.nf | 15 +- modules/num_reads_report.nf | 64 +++---- modules/pycoqc.nf | 75 ++++---- sub_workflows/BASECALLING.nf | 72 +++----- .../FILTERING_AND_QC_FROM_MINKNOW.nf | 78 ++++---- sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf | 59 ++++--- sub_workflows/MODKIT_AND_MULTIQC.nf | 21 +-- workflow/main.nf | 71 ++------ 13 files changed, 318 insertions(+), 489 deletions(-) diff --git a/modules/basecall.nf b/modules/basecall.nf index ca679c1..ecf8a8d 100755 --- a/modules/basecall.nf +++ b/modules/basecall.nf @@ -1,7 +1,5 @@ process FAST5_to_POD5 { - publishDir "results/${params.out_dir}/fast5_to_pod5/${id}/", mode: "copy", overwrite: true - label 'cpu' input: @@ -10,22 +8,16 @@ process FAST5_to_POD5 { output: tuple val("${id}"), path("*.pod5") - script: """ - pod5 convert fast5 *.fast5 --output . --one-to-one . --threads 12 - """ - } process BASECALL_CPU { - - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - + publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'cpu' - + input: tuple val(id), path(pod5_dir) val speed @@ -33,53 +25,36 @@ process BASECALL_CPU { val config val trim val qscore - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - fi - fi - + fi samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - dorado summary "${id}.bam" > "${id}.txt" - """ } - - process BASECALL_CPU_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - - label 'cpu' input: @@ -90,77 +65,61 @@ process BASECALL_CPU_DEMUX { val trim val qscore val trim_barcode - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi fi - + samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - if [[ "$trim_barcode" == "true" ]]; then + if [[ "${trim_barcode}" == "true" ]]; then dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" fi cd ./demux_data/ - for file in *; do samtools sort -@ -12 "\$file" -o "${id}_\${file}" rm "\$file" done cd ../ - rm "${id}.bam" - mv ./demux_data/* ./ - rm -r ./demux_data/ - for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done - """ } - -process BASECALL_GPU { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - +process BASECALL_GPU { + publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' - + input: tuple val(id), path(pod5_dir) val speed @@ -168,52 +127,38 @@ process BASECALL_GPU { val config val trim val qscore - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi fi - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - dorado summary "${id}.bam" > "${id}.txt" - """ } process BASECALL_GPU_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - label 'gpu' input: @@ -224,67 +169,52 @@ process BASECALL_GPU_DEMUX { val trim val qscore val trim_barcode - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi fi - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" - rm "${id}.bam" - mv "${id}_sorted.bam" "${id}.bam" + samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" + rm "${id}.bam" + mv "${id}_sorted.bam" "${id}.bam" - - if [[ "$trim_barcode" == "true" ]]; then + if [[ "${trim_barcode}" == "true" ]]; then dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" fi cd ./demux_data/ - for file in *; do samtools sort -@ -12 "\$file" -o "${id}_\${file}" rm "\$file" done - cd ../ - rm "${id}.bam" - mv ./demux_data/* ./ - rm -r ./demux_data/ - for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done - """ } diff --git a/modules/calculate_coverage.nf b/modules/calculate_coverage.nf index 1e63385..f318b04 100755 --- a/modules/calculate_coverage.nf +++ b/modules/calculate_coverage.nf @@ -1,57 +1,46 @@ process CALCULATE_COVERAGE { - label 'cpu' input: - tuple val(id), path(bam) - path(bai) + tuple val(id), path(bam) + path bai output: - path("*coverage*") + path "*coverage*" script: + """ + echo -e "Chromosome\\t${id}" > ${id}.coverage.tsv + samtools depth -a ${bam} | awk ' + { + sum[\$1] += \$3; + count[\$1]++; + } + END { + for (chr in sum) { + print chr "\\t" sum[chr]/count[chr]; + } + }' >> ${id}.coverage.tsv """ - - echo -e "Chromosome\\t${id}" > ${id}.coverage.tsv - samtools depth -a ${bam} | awk ' - { - sum[\$1] += \$3; - count[\$1]++; - } - END { - for (chr in sum) { - print chr "\\t" sum[chr]/count[chr]; - } - }' >> ${id}.coverage.tsv - - - """ - } - process MERGE_COVERAGE { - publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true - + publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true label 'cpu' input: - path(coverage_files) + path coverage_files output: - path("merged_coverage.tsv"), emit: output - path("*mqc*"), emit: mqc + path ("merged_coverage.tsv"), emit: output + path ("*mqc*"), emit: mqc script: - """ - - merge_files.py ${coverage_files.join(' ')} merged_coverage.tsv - - echo "# plot_type: 'table'" >> "Chr_Coverage_mqc.tsv" - echo "# id: 'chromosome coverage custom'" >> "Chr_Coverage_mqc.tsv" - echo "# section_name: 'Average chromosome coverage per sample'" >> "Chr_Coverage_mqc.tsv" - cat merged_coverage.tsv >> "Chr_Coverage_mqc.tsv" - - """ - + """ + merge_files.py ${coverage_files.join(' ')} merged_coverage.tsv + echo "# plot_type: 'table'" >> "Chr_Coverage_mqc.tsv" + echo "# id: 'chromosome coverage custom'" >> "Chr_Coverage_mqc.tsv" + echo "# section_name: 'Average chromosome coverage per sample'" >> "Chr_Coverage_mqc.tsv" + cat merged_coverage.tsv >> "Chr_Coverage_mqc.tsv" + """ } diff --git a/modules/convert_input_from_minknow.nf b/modules/convert_input_from_minknow.nf index a906269..e404950 100755 --- a/modules/convert_input_from_minknow.nf +++ b/modules/convert_input_from_minknow.nf @@ -1,17 +1,17 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true - label 'cpu' input: - path input + path input + output: - path "*.bam", emit: bam - path "*.txt", emit: txt + path "*.bam", emit: bam + path "*.txt", emit: txt script: - """ + """ # Define the input directory path input_dir="${input.toString()}" @@ -50,11 +50,11 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true - label 'cpu' input: path input + output: path "*.bam", emit: bam path "*.txt", emit: txt @@ -73,22 +73,19 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { fi # Find all 'pass' directories, follow symlinks with -L - find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do + find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do - ## Get the name of the input directory for file naming + ## Get the name of the input directory for file naming parent_dir="\${pass_dir%/*}"; parent_dir="\${parent_dir##*/}" - # Check if the pass directory contains any BAM files bam_files=(\$(find -L "\$pass_dir" -type f -name '*.bam' | grep -v 'iltered')) if [ \${#bam_files[@]} -gt 0 ]; then # Merge BAM files using samtools output_bam="./\${parent_dir}.bam" samtools merge "./\$output_bam" "\${bam_files[@]}" - # Generate summary with dorado dorado summary "\$output_bam" > "./\${parent_dir}.txt" fi done """ } - diff --git a/modules/filter_bam.nf b/modules/filter_bam.nf index 0a7c9d3..9eac4de 100755 --- a/modules/filter_bam.nf +++ b/modules/filter_bam.nf @@ -1,43 +1,37 @@ process FILTER_BAM { - + publishDir "results/${params.out_dir}/bam_filtering/", mode: "copy", pattern: "*.ba*", overwrite: true publishDir "results/${params.out_dir}/multiqc_input/minimap2/", mode: "copy", pattern: "*.*stat", overwrite: true - - label 'cpu' input: - tuple val(id), path(bam) - path(txt) - val(mapq) + tuple val(id), path(bam) + path txt + val mapq output: - env(id), emit: id, optional: true - path("*-Unfiltered.bam"), emit: total_bam, optional: true - path("*-Unfiltered.bam.bai"), emit: total_bai, optional: true - path("*-Filtered*.bam"), emit: filtered_bam, optional: true - path("*-Filtered*.bai"), emit: filtered_bai, optional: true - path("*-Unfiltered.flagstat"), emit: unfiltered_flagstat, optional: true - path("*-Filtered*.flagstat"), emit: filtered_flagstat, optional: true - path("*_sequencing_summary*"), emit: txt, optional: true - path("*.*stat"), emit: multiqc, optional: true + env (id), emit: id, optional: true + path ("*-Unfiltered.bam"), emit: total_bam, optional: true + path ("*-Unfiltered.bam.bai"), emit: total_bai, optional: true + path ("*-Filtered*.bam"), emit: filtered_bam, optional: true + path ("*-Filtered*.bai"), emit: filtered_bai, optional: true + path ("*-Unfiltered.flagstat"), emit: unfiltered_flagstat, optional: true + path ("*-Filtered*.flagstat"), emit: filtered_flagstat, optional: true + path ("*_sequencing_summary*"), emit: txt, optional: true + path ("*.*stat"), emit: multiqc, optional: true script: """ - cp "${bam}" "${id}-Unfiltered.bam" samtools index "${id}-Unfiltered.bam" - - samtools flagstat "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat" + samtools flagstat "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat" samtools idxstats "${id}-Unfiltered.bam" > "${id}-Unfiltered.idxstat" - - samtools view -b -q $mapq -F 2304 -@ 12 $bam > "intermediate.bam" + samtools view -b -q ${mapq} -F 2304 -@ 12 ${bam} > "intermediate.bam" samtools sort -@ 12 "intermediate.bam" -o "${id}-Filtered_primary_mapq_${mapq}.bam" samtools index "${id}-Filtered_primary_mapq_${mapq}.bam" samtools flagstat "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.flagstat" samtools idxstats "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.idxstat" - var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat") ## If flagstat file says there are no mapped reads then delete bam and bai files. @@ -49,10 +43,6 @@ process FILTER_BAM { id="${id}" cp "${txt}" "${id}_sequencing_summary.txt" fi - rm "intermediate.bam" - """ - } - diff --git a/modules/modkit.nf b/modules/modkit.nf index c7f4a6b..fb436bc 100755 --- a/modules/modkit.nf +++ b/modules/modkit.nf @@ -1,38 +1,32 @@ process MODKIT { - publishDir "${params.steps_2_and_3_input_directory}/modkit/", mode: "copy", pattern: "*", overwrite: true - - - label 'cpu' - - input: - tuple val(id), path(bam) - path(bai) - - output: - path("*") - - script: - """ - - echo "starting modkit" - - ## Make Methylation TSV file - modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" - - echo "extract successful" - - ## Make Methylation Table - modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" - - echo "pileup successful" - - ## Make Summary - modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" - - echo "summary successful" - - """ - + publishDir "${params.steps_2_and_3_input_directory}/modkit/", mode: "copy", pattern: "*", overwrite: true + label 'cpu' + + input: + tuple val(id), path(bam) + path bai + + output: + path "*" + + script: + """ + echo "starting modkit" + + ## Make Methylation TSV file + modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" + + echo "extract successful" + + ## Make Methylation Table + modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" + + echo "pileup successful" + + ## Make Summary + modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" + + echo "summary successful" + """ } - diff --git a/modules/multiqc.nf b/modules/multiqc.nf index ced6882..adca7e4 100755 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -1,19 +1,18 @@ process MULTIQC { publishDir "${params.steps_2_and_3_input_directory}/multiQC_output", mode: "copy", overwrite: true - label 'cpu' input: - path(multiqc_input) - path(merged_coverage) - path(multiqc_config) - - output: - path "*" + path multiqc_input + path merged_coverage + path multiqc_config + + output: + path "*" script: """ - multiqc -c $multiqc_config -n multiQC_report.html . + multiqc -c ${multiqc_config} -n multiQC_report.html . """ } diff --git a/modules/num_reads_report.nf b/modules/num_reads_report.nf index f9f2372..24d2a51 100755 --- a/modules/num_reads_report.nf +++ b/modules/num_reads_report.nf @@ -1,99 +1,75 @@ process MAKE_QC_REPORT { - + publishDir "results/${params.out_dir}/intermediate_qc_reports/number_of_reads/", pattern: "*num_reads.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/read_length/", pattern: "*length.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/quality_score_thresholds/", pattern: "*thresholds.tsv", mode: "copy", overwrite: true - label 'cpu' input: - val(id) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(unfiltered_pyco_json) - path(filtered_pyco_json) - val(mapq) - val(qscore_thresh) + val id + path unfiltered_flagstat + path filtered_flagstat + path unfiltered_pyco_json + path filtered_pyco_json + val mapq + val qscore_thresh output: - path("${id}_num_reads.tsv"), emit: num_reads - path("${id}_read_length.tsv"), emit: read_length - path("${id}_quality_thresholds.tsv"), emit: qscore_thresh + path ("${id}_num_reads.tsv"), emit: num_reads + path ("${id}_read_length.tsv"), emit: read_length + path ("${id}_quality_thresholds.tsv"), emit: qscore_thresh script: """ num_all_reads=\$(jq '.["All Reads"].basecall.reads_number' "${unfiltered_pyco_json}") - num_pass_reads=\$(jq '.["Pass Reads"].basecall.reads_number' "${unfiltered_pyco_json}") - num_primary_alignments=\$(grep "primary mapped" "${unfiltered_flagstat}" | awk {'print \$1}') - num_primary_alignments_filtered=\$(grep "primary mapped" "${filtered_flagstat}" | awk '{print \$1}') - N50_fastq_all=\$(jq '.["All Reads"].basecall.N50' "${unfiltered_pyco_json}") - median_read_length_fastq_all=\$(jq '.["All Reads"].basecall.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_fastq_pass=\$(jq '.["Pass Reads"].basecall.N50' "${unfiltered_pyco_json}") - median_read_length_fastq_pass=\$(jq '.["Pass Reads"].basecall.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_alignment_unfiltered=\$(jq '.["All Reads"].alignment.N50' "${unfiltered_pyco_json}") - median_read_length_alignment_unfiltered=\$(jq '.["All Reads"].alignment.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_alignment_filtered=\$(jq '.["All Reads"].alignment.N50' "${filtered_pyco_json}") - median_read_length_alignment_filtered=\$(jq '.["All Reads"].alignment.len_percentiles[50]' "${filtered_pyco_json}") - echo "${id}\t\${num_all_reads}\t\${num_pass_reads}\t\${num_primary_alignments}\t\${num_primary_alignments_filtered}" > "${id}_num_reads.tsv" - echo "${id}\t\${N50_fastq_all}\t\${median_read_length_fastq_all}\t\${N50_fastq_pass}\t\${median_read_length_fastq_pass}\t\${N50_alignment_unfiltered}\t\${median_read_length_alignment_unfiltered}\t\${N50_alignment_filtered}\t\${median_read_length_alignment_filtered}" > "${id}_read_length.tsv" - echo "${id}\t${qscore_thresh}\t${mapq}" > "${id}_quality_thresholds.tsv" - """ - } process MERGE_QC_REPORT { publishDir "${params.steps_2_and_3_input_directory}/reads_report/", pattern: "*", mode: "copy", overwrite: true - label 'cpu' input: - path(num_reads) - path(read_length) - path(qscore_thresh) + path num_reads + path read_length + path qscore_thresh output: - path("*") + path "*" script: - """ - + """ echo "# plot_type: 'table'" >> "Number_of_Reads_mqc.tsv" echo "# id: 'number of reads custom'" >> "Number_of_Reads_mqc.tsv" echo "# section_name: 'Number of reads per sample'" >> "Number_of_Reads_mqc.tsv" echo "Sample_ID\tAll Reads\tPass Reads\tPrimary Alignments\tFiltered Primary Alignments (MAPQ)" >> "Number_of_Reads_mqc.tsv" - cat $num_reads >> "Number_of_Reads_mqc.tsv" - - - + cat ${num_reads} >> "Number_of_Reads_mqc.tsv" + echo "# plot_type: 'table'" >> "Read_Length_mqc.tsv" echo "# id: 'read length custom'" >> "Read_Length_mqc.tsv" echo "# section_name: 'Read lengths per sample'" >> "Read_Length_mqc.tsv" echo "Sample_ID\tN50 All Reads\tMedian Read Length All Reads\tN50 Pass Reads\tMedian Read Length Pass Reads\tN50 Primary Alignments\tMedian Read Length Primary Alignments\tN50 Filtered Primary Alignments\tMedian Read Length Filtered Primary Alignments" >> "Read_Length_mqc.tsv" - cat $read_length >> "Read_Length_mqc.tsv" - + cat ${read_length} >> "Read_Length_mqc.tsv" echo "# plot_type: 'table'" >> "Quality_Thresholds_mqc.tsv" echo "# id: 'quality threholds'" >> "Quality_Thresholds_mqc.tsv" echo "# section_name: 'Quality thresholds for each sample'" >> "Quality_Thresholds_mqc.tsv" echo "Sample_ID\tRead Mean Base Quality Score Threshold (PHRED)\tMapping Quality Threshold (MAPQ)" >> "Quality_Thresholds_mqc.tsv" - cat $qscore_thresh >> "Quality_Thresholds_mqc.tsv" - + cat ${qscore_thresh} >> "Quality_Thresholds_mqc.tsv" """ - } diff --git a/modules/pycoqc.nf b/modules/pycoqc.nf index ace2eab..7fcfa01 100755 --- a/modules/pycoqc.nf +++ b/modules/pycoqc.nf @@ -1,81 +1,70 @@ process PYCOQC_NO_FILTER { publishDir "results/${params.out_dir}/pycoqc_no_filter/", mode: 'copy', overwrite: true, pattern: "*-Unfiltered_pycoqc*" - label 'cpu' input: - val(id) - path(total_bam) - path(total_bai) - path(filtered_bam) - path(filtered_bai) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(seq_summary) - val(quality_score) + val id + path total_bam + path total_bai + path filtered_bam + path filtered_bai + path unfiltered_flagstat + path filtered_flagstat + path seq_summary + val quality_score output: - val("${id}"), emit: id - path("${filtered_bam}"), emit: filtered_bam - path("${filtered_bai}"), emit: filtered_bai - path("${unfiltered_flagstat}"), emit: unfiltered_flagstat - path("${filtered_flagstat}"), emit: filtered_flagstat - path("${seq_summary}"), emit: txt - path("*.html"), emit: unfiltered_pyco_html - path("*.json"), emit: unfiltered_pyco_json - - + val ("${id}"), emit: id + path ("${filtered_bam}"), emit: filtered_bam + path ("${filtered_bai}"), emit: filtered_bai + path ("${unfiltered_flagstat}"), emit: unfiltered_flagstat + path ("${filtered_flagstat}"), emit: filtered_flagstat + path ("${seq_summary}"), emit: txt + path ("*.html"), emit: unfiltered_pyco_html + path ("*.json"), emit: unfiltered_pyco_json + script: """ - - pycoQC -f "${seq_summary}" \ -v \ -a "${total_bam}" \ --min_pass_qual "${quality_score}" \ -o "./${id}-Unfiltered_pycoqc.html" \ -j "./${id}-Unfiltered_pycoqc.json" - """ } process PYCOQC_FILTER { - publishDir "results/${params.out_dir}/pycoqc_filtered/", mode: 'copy', overwrite: true, pattern: "*-Filtered_pycoqc*" - label 'cpu' input: - val(id) - path(filtered_bam) - path(filtered_bai) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(seq_summary) - val(quality_score) - path(unfiltered_pyco_json) - + val id + path filtered_bam + path filtered_bai + path unfiltered_flagstat + path filtered_flagstat + path seq_summary + val quality_score + path unfiltered_pyco_json output: - val("${id}"), emit: id - path("${unfiltered_flagstat}"), emit: unfiltered_flagstat - path("${filtered_flagstat}"), emit: filtered_flagstat - path("${unfiltered_pyco_json}"), emit: unfiltered_pyco_json - path("*-Filtered*.html"), emit: filtered_pyco_html - path("*-Filtered*.json"), emit: filtered_pyco_json - + val ("${id}"), emit: id + path ("${unfiltered_flagstat}"), emit: unfiltered_flagstat + path ("${filtered_flagstat}"), emit: filtered_flagstat + path ("${unfiltered_pyco_json}"), emit: unfiltered_pyco_json + path ("*-Filtered*.html"), emit: filtered_pyco_html + path ("*-Filtered*.json"), emit: filtered_pyco_json script: """ - pycoQC -f "${seq_summary}" \ -v \ -a "${filtered_bam}" \ --min_pass_qual "${quality_score}" \ -o "./${id}-Filtered_pycoqc.html" \ -j "./${id}-Filtered_pycoqc.json" - """ } diff --git a/sub_workflows/BASECALLING.nf b/sub_workflows/BASECALLING.nf index fcc11dd..cc07e68 100755 --- a/sub_workflows/BASECALLING.nf +++ b/sub_workflows/BASECALLING.nf @@ -1,65 +1,43 @@ -// Import Modules - -include {FAST5_to_POD5; BASECALL_CPU; BASECALL_CPU_DEMUX; BASECALL_GPU; BASECALL_GPU_DEMUX} from '../modules/basecall.nf' +include { FAST5_to_POD5 ; BASECALL_CPU ; BASECALL_CPU_DEMUX ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf' workflow BASECALLING { - take: - pod5_path - fast5_path - speed - modifications - config - trim - quality_score - trim_barcode - devices - ref - + pod5_path + fast5_path + speed + modifications + config + trim + quality_score + trim_barcode + devices + ref main: - FAST5_to_POD5(fast5_path) pod5_path = FAST5_to_POD5.out.mix(pod5_path) - - if (params.basecall_compute?.equalsIgnoreCase("cpu")) { - if (params.basecall_demux == true) { - BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - - bams = BASECALL_CPU_DEMUX.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_CPU_DEMUX.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - - } else { - + bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - - bams = BASECALL_CPU.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_CPU.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - - + bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } - - } else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { - + } + else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { if (params.basecall_demux == true) { - BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - - bams = BASECALL_GPU_DEMUX.out.bam.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - txts = BASECALL_GPU_DEMUX.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - - } else { - + bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - - bams = BASECALL_GPU.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_GPU.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - + bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() } - } - } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf index e68e76c..2a007cc 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf @@ -1,38 +1,56 @@ // Import Modules - -include {PYCOQC_NO_FILTER ; PYCOQC_FILTER} from '../modules/pycoqc.nf' -include {FILTER_BAM} from '../modules/filter_bam.nf' -include {MAKE_QC_REPORT} from '../modules/num_reads_report.nf' -include {CONVERT_INPUT_FROM_MINKNOW_BARCODED; CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED} from '../modules/convert_input_from_minknow.nf' +include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf' +include { FILTER_BAM } from '../modules/filter_bam.nf' +include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' +include { CONVERT_INPUT_FROM_MINKNOW_BARCODED ; CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED } from '../modules/convert_input_from_minknow.nf' workflow FILTERING_AND_QC_FROM_MINKNOW { - take: - input - mapq - quality_score - + input + mapq + quality_score main: - if (params.is_barcoded == true) { - CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out - } else if (params.is_barcoded == false) { - CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out + CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out } - - FILTER_BAM(converted_input.bam.flatten().map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2), - converted_input.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten(), mapq) - - PYCOQC_NO_FILTER(FILTER_BAM.out.id, FILTER_BAM.out.total_bam, FILTER_BAM.out.total_bai, FILTER_BAM.out.filtered_bam, - FILTER_BAM.out.filtered_bai, FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, quality_score) - - PYCOQC_FILTER(PYCOQC_NO_FILTER.out.id, PYCOQC_NO_FILTER.out.filtered_bam, PYCOQC_NO_FILTER.out.filtered_bai, PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, quality_score, PYCOQC_NO_FILTER.out.unfiltered_pyco_json) - - MAKE_QC_REPORT(PYCOQC_FILTER.out.id, PYCOQC_FILTER.out.unfiltered_flagstat, PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, quality_score) - - + else if (params.is_barcoded == false) { + CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out + } + FILTER_BAM( + converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2), + converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(), + mapq, + ) + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf index abdaa47..bf14a51 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf @@ -1,30 +1,45 @@ // Import Modules -include {PYCOQC_NO_FILTER ; PYCOQC_FILTER} from '../modules/pycoqc.nf' -include {FILTER_BAM} from '../modules/filter_bam.nf' -include {MAKE_QC_REPORT} from '../modules/num_reads_report.nf' - +include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf' +include { FILTER_BAM } from '../modules/filter_bam.nf' +include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' workflow FILTERING_AND_QC_FROM_STEP_1 { - take: - bams - txts - mapq - quality_score + bams + txts + mapq + quality_score main: - FILTER_BAM(bams, txts, mapq) - - PYCOQC_NO_FILTER(FILTER_BAM.out.id, FILTER_BAM.out.total_bam, FILTER_BAM.out.total_bai, FILTER_BAM.out.filtered_bam, FILTER_BAM.out.filtered_bai, - FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, quality_score) - - PYCOQC_FILTER(PYCOQC_NO_FILTER.out.id, PYCOQC_NO_FILTER.out.filtered_bam, PYCOQC_NO_FILTER.out.filtered_bai, PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, quality_score, PYCOQC_NO_FILTER.out.unfiltered_pyco_json) - - - MAKE_QC_REPORT(PYCOQC_FILTER.out.id, PYCOQC_FILTER.out.unfiltered_flagstat, PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, quality_score) - - + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/MODKIT_AND_MULTIQC.nf b/sub_workflows/MODKIT_AND_MULTIQC.nf index 3fa7df6..346e2dd 100755 --- a/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -1,12 +1,11 @@ // Import Modules -include {MULTIQC} from '../modules/multiqc.nf' -include {MODKIT} from '../modules/modkit.nf' -include {MERGE_QC_REPORT} from '../modules//num_reads_report.nf' -include {CALCULATE_COVERAGE; MERGE_COVERAGE} from "../modules/calculate_coverage.nf" +include { MULTIQC } from '../modules/multiqc.nf' +include { MODKIT } from '../modules/modkit.nf' +include { MERGE_QC_REPORT } from '../modules//num_reads_report.nf' +include { CALCULATE_COVERAGE ; MERGE_COVERAGE } from "../modules/calculate_coverage.nf" workflow MODKIT_AND_MULTIQC { - take: filtered_bams filtered_bais @@ -16,18 +15,10 @@ workflow MODKIT_AND_MULTIQC { multiqc_config multiqc_input - - main: - - CALCULATE_COVERAGE(filtered_bams, filtered_bais) - - MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) - + CALCULATE_COVERAGE(filtered_bams, filtered_bais) + MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect()) - MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config) - MODKIT(filtered_bams, filtered_bais) - } diff --git a/workflow/main.nf b/workflow/main.nf index be87908..77d8171 100755 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,8 +1,8 @@ // Make this pipeline a nextflow 2 implementation nextflow.enable.dsl=2 - if (params.step.toString() == "1") { + log.info """ STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT - Bernardo Aguzzoli Heberle ====================================================================================================================================================================================== @@ -16,8 +16,7 @@ basecall quality score threshold for basecalling : basecall demultiplexing : ${params.basecall_demux} trim barcodes during demultiplexing : ${params.trim_barcode} submission output file prefix : ${params.prefix} -GPU device for submission : ${params.gpu_devices} - +GPU device for submission : ${params.gpu_devices} Output directory : ${params.out_dir} ====================================================================================================================================================================================== @@ -42,9 +41,7 @@ Are input bam files barcoded? (Only relevant to "step_2_from_minknow" option) """ - - } else if (params.step.toString() == "3") { - +} else if (params.step.toString() == "3") { log.info """ STEP 3 - METHYLATION CALLING AND MULTIQC REPORT - Bernardo Aguzzoli Heberle @@ -57,17 +54,10 @@ MultiQC configuration file (provided, but may need to be altered for different u ====================================================================================================================================================================================== """ - - - - } else { - +} else { println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE" System.exit(1) - - } - - +} // Import Workflows include {BASECALLING} from '../sub_workflows/BASECALLING' @@ -75,58 +65,41 @@ include {FILTERING_AND_QC_FROM_STEP_1} from '../sub_workflows/FILTERING_AND_QC_F include {FILTERING_AND_QC_FROM_MINKNOW} from '../sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf' include {MODKIT_AND_MULTIQC} from '../sub_workflows/MODKIT_AND_MULTIQC.nf' - // Define initial files and channels - if (params.step.toString() == "1") { - if (params.prefix == "None") { - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - - } else { - + }else{ fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - } - ref = file(params.ref) - quality_score = Channel.value(params.qscore_thresh) - basecall_speed = Channel.value(params.basecall_speed) - basecall_mods = Channel.value(params.basecall_mods) - basecall_config = Channel.value(params.basecall_config) - basecall_trim = Channel.value(params.basecall_trim) - basecall_compute = Channel.value(params.basecall_compute) - trim_barcode = Channel.value(params.trim_barcode) - devices = Channel.value(params.gpu_devices) - +ref = file(params.ref) +quality_score = Channel.value(params.qscore_thresh) +basecall_speed = Channel.value(params.basecall_speed) +basecall_mods = Channel.value(params.basecall_mods) +basecall_config = Channel.value(params.basecall_config) +basecall_trim = Channel.value(params.basecall_trim) +basecall_compute = Channel.value(params.basecall_compute) +trim_barcode = Channel.value(params.trim_barcode) +devices = Channel.value(params.gpu_devices) } else if (params.step.toString() == "2_from_step_1") { - total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) - } else if (params.step.toString() == "2_from_minknow") { - input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/") mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) - } -else if (params.step.toString() == "3") { - - +else if (params.step.toString() == "3") { filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") @@ -134,35 +107,25 @@ else if (params.step.toString() == "3") { quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") multiqc_config = Channel.fromPath(params.multiqc_config) multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") - - } - - workflow { if (params.step.toString() == "1") { - BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref) } else if (params.step.toString() == "2_from_step_1") { - FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score) - } else if (params.step.toString() == "2_from_minknow") { - FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score) - } else if (params.step.toString()== "3") { - MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) - } } + -- GitLab From c15c34c351403f9d533e88c2cfc63f03bb3884f5 Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Fri, 25 Apr 2025 10:19:26 -0300 Subject: [PATCH 2/5] fix: nextflow indents and line breaks --- workflow/main.nf | 141 +++++++++++++++++++---------------------------- 1 file changed, 56 insertions(+), 85 deletions(-) diff --git a/workflow/main.nf b/workflow/main.nf index 77d8171..6b45ca5 100755 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -2,57 +2,43 @@ nextflow.enable.dsl=2 if (params.step.toString() == "1") { - -log.info """ - STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT - Bernardo Aguzzoli Heberle -====================================================================================================================================================================================== - -path containing samples and files to be basecalled (basecall only) : ${params.basecall_path} -basecall speed (basecall only) : ${params.basecall_speed} -basecall modifications (basecall only) : ${params.basecall_mods} -basecall config (If "false" the basecaller will automatically pick one) : ${params.basecall_config} -basecall read trimming option : ${params.basecall_trim} -basecall quality score threshold for basecalling : ${params.qscore_thresh} -basecall demultiplexing : ${params.basecall_demux} -trim barcodes during demultiplexing : ${params.trim_barcode} -submission output file prefix : ${params.prefix} -GPU device for submission : ${params.gpu_devices} -Output directory : ${params.out_dir} - -====================================================================================================================================================================================== - + log.info """ +================================================================= +STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT +================================================================= +basecall files path containing : ${params.basecall_path} +basecall speed (basecall only) : ${params.basecall_speed} +basecall modifications (basecall only) : ${params.basecall_mods} +basecall config : ${params.basecall_config} +basecall read trimming option : ${params.basecall_trim} +basecall quality score threshold for basecalling : ${params.qscore_thresh} +basecall demultiplexing : ${params.basecall_demux} +trim barcodes during demultiplexing : ${params.trim_barcode} +submission output file prefix : ${params.prefix} +GPU device for submission : ${params.gpu_devices} +Output directory : ${params.out_dir} +================================================================= """ - } else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") { - -log.info """ - STEP 2 - FILTERING AND QUALITY CONTROL - Bernardo Aguzzoli Heberle -====================================================================================================================================================================================== - -Input directory - should be the Output Directory from step 1 : ${params.steps_2_and_3_input_directory} - -Basecall quality score threshold for basecalling (make sure it is the same as in step 1) : ${params.qscore_thresh} - -MAPQ filtering threshold : ${params.mapq} - -Minimum number of mapped reads per sample/barcode for a file to be included in analysis : ${params.min_mapped_reads_thresh} - -Are input bam files barcoded? (Only relevant to "step_2_from_minknow" option) : ${params.is_barcoded} -====================================================================================================================================================================================== - +} else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") { + log.info """ +====================================== +STEP 2 - FILTERING AND QUALITY CONTROL +====================================== +Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory} +Basecall quality score threshold : ${params.qscore_thresh} +MAPQ filtering threshold : ${params.mapq} +Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh} +BAM files barcoded? : ${params.is_barcoded} +====================================== """ - } else if (params.step.toString() == "3") { - -log.info """ - STEP 3 - METHYLATION CALLING AND MULTIQC REPORT - Bernardo Aguzzoli Heberle -======================================================================================================================================================================================= - -Input directory - should be the Output Directory from step 1 and Input Directory from step 2 : ${params.steps_2_and_3_input_directory} - -MultiQC configuration file (provided, but may need to be altered for different use cases) : ${params.multiqc_config} - -====================================================================================================================================================================================== - + log.info """ +=============================================== +STEP 3 - METHYLATION CALLING AND MULTIQC REPORT +=============================================== +Input directory (input dir from step 2) : ${params.steps_2_and_3_input_directory} +MultiQC configuration file : ${params.multiqc_config} +=============================================== """ } else { println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE" @@ -68,38 +54,31 @@ include {MODKIT_AND_MULTIQC} from '../sub_workflows/MODKIT_AND_MULTIQC.nf' // Define initial files and channels if (params.step.toString() == "1") { if (params.prefix == "None") { - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - }else{ - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - } - -ref = file(params.ref) -quality_score = Channel.value(params.qscore_thresh) -basecall_speed = Channel.value(params.basecall_speed) -basecall_mods = Channel.value(params.basecall_mods) -basecall_config = Channel.value(params.basecall_config) -basecall_trim = Channel.value(params.basecall_trim) -basecall_compute = Channel.value(params.basecall_compute) -trim_barcode = Channel.value(params.trim_barcode) -devices = Channel.value(params.gpu_devices) -} - -else if (params.step.toString() == "2_from_step_1") { + fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + } else { + fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + } + ref = file(params.ref) + quality_score = Channel.value(params.qscore_thresh) + basecall_speed = Channel.value(params.basecall_speed) + basecall_mods = Channel.value(params.basecall_mods) + basecall_config = Channel.value(params.basecall_config) + basecall_trim = Channel.value(params.basecall_trim) + basecall_compute = Channel.value(params.basecall_compute) + trim_barcode = Channel.value(params.trim_barcode) + devices = Channel.value(params.gpu_devices) +} else if (params.step.toString() == "2_from_step_1") { total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) -} - -else if (params.step.toString() == "2_from_minknow") { +} else if (params.step.toString() == "2_from_minknow") { input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/") mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) -} - -else if (params.step.toString() == "3") { +} else if (params.step.toString() == "3") { filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") @@ -109,23 +88,15 @@ else if (params.step.toString() == "3") { multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") } +// Main logic workflow { - if (params.step.toString() == "1") { BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref) - } - - else if (params.step.toString() == "2_from_step_1") { + } else if (params.step.toString() == "2_from_step_1") { FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score) - } - - else if (params.step.toString() == "2_from_minknow") { + } else if (params.step.toString() == "2_from_minknow") { FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score) - } - - else if (params.step.toString()== "3") { + } else if (params.step.toString()== "3") { MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) } - } - -- GitLab From cbeac104507d36142c22840eb1676dc51a36cb50 Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Fri, 25 Apr 2025 10:22:45 -0300 Subject: [PATCH 3/5] fix: nextflow line breaks and comments --- workflow/nextflow.config | 43 +++------------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 9405aab..3fe4140 100755 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -1,107 +1,70 @@ // CONFIGURATION FILE -// Pipeline parameter default values, can be modified by user when calling pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) ## +// Pipeline parameter default values, can be modified by user when calling +// pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) // Input reference fasta file params.ref = 'None' - // Step of pipeline to execute params.step = 'None' - // Output directory for pipeline results params.out_dir = "output_directory/" - // directory of basecalling data params.basecall_path = 'None' - // MAPQ filtering threshold for bam files, 0 for no filtering params.mapq = "10" - // Quality score threshold params.qscore_thresh = "9" - // Desired basecall speed params.basecall_speed = "hac" - // Desired basecaller modifications params.basecall_mods = false - // Threshold for mapped reasds params.min_mapped_reads_thresh = 500 - // Desired basecall configuration params.basecall_config = "None" - // Type of read trimming during basecalling ("all", "primers", "adapters", "none") params.basecall_trim = "none" - // Basecalling demultiplexing params.basecall_demux = false - // CPU vs GPU basecalling params.basecall_compute = "gpu" - // Trim barcodes (only counts if demultiplexing is enabled) params.trim_barcode = "True" - // Add prefix to all output files params.prefix = "None" - // Which GPU devices to use for basecalling? params.gpu_devices = "all" - // Previous results params.steps_2_and_3_input_directory = "None" - // MultiQC config params.multiqc_config = "None" - // Are the files from MinKNOW barcoded or not params.is_barcoded = true - - - +// Set queue size for the executor if (params.step == 1) { - queue_size = 1 - } else { - queue_size = 5 - } - process { - // Define local cpu execution - withLabel: cpu { executor='local' } - // Define local gpu execution withLabel: gpu { - executor='local' containerOptions = '--nv' } - // Define the singularity container for every process, will pull container from the cloud container = "library://joaochrusciel/nanopore/ont_methylation:2024-10-18" - } - executor { - name = 'local' queueSize = queue_size - } - - apptainer { - enabled = true - } -- GitLab From 1ff43043d1e25968d9588d9981135cca295abbba Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Fri, 25 Apr 2025 10:26:09 -0300 Subject: [PATCH 4/5] fix: subflows indent and spacing --- sub_workflows/BASECALLING.nf | 70 +++++++------- .../FILTERING_AND_QC_FROM_MINKNOW.nf | 92 +++++++++---------- sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf | 71 +++++++------- sub_workflows/MODKIT_AND_MULTIQC.nf | 1 - 4 files changed, 117 insertions(+), 117 deletions(-) diff --git a/sub_workflows/BASECALLING.nf b/sub_workflows/BASECALLING.nf index cc07e68..1e0a62f 100755 --- a/sub_workflows/BASECALLING.nf +++ b/sub_workflows/BASECALLING.nf @@ -2,42 +2,42 @@ include { FAST5_to_POD5 ; BASECALL_CPU ; BASECALL_CPU_DEMUX ; BASECALL_GPU ; BAS workflow BASECALLING { take: - pod5_path - fast5_path - speed - modifications - config - trim - quality_score - trim_barcode - devices - ref + pod5_path + fast5_path + speed + modifications + config + trim + quality_score + trim_barcode + devices + ref + main: - - FAST5_to_POD5(fast5_path) - pod5_path = FAST5_to_POD5.out.mix(pod5_path) - if (params.basecall_compute?.equalsIgnoreCase("cpu")) { - if (params.basecall_demux == true) { - BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } - else { - BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - } - } - else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { - if (params.basecall_demux == true) { - BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + FAST5_to_POD5(fast5_path) + pod5_path = FAST5_to_POD5.out.mix(pod5_path) + if (params.basecall_compute?.equalsIgnoreCase("cpu")) { + if (params.basecall_demux == true) { + BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { + BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } } - else { - BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { + if (params.basecall_demux == true) { + BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { + BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } } - } } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf index 2a007cc..90dc60c 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf @@ -6,51 +6,51 @@ include { CONVERT_INPUT_FROM_MINKNOW_BARCODED ; CONVERT_INPUT_FROM_MINKNOW_NOT_B workflow FILTERING_AND_QC_FROM_MINKNOW { take: - input - mapq - quality_score + input + mapq + quality_score + main: - if (params.is_barcoded == true) { - CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out - } - else if (params.is_barcoded == false) { - CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out - } - FILTER_BAM( - converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2), - converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(), - mapq, - ) - PYCOQC_NO_FILTER( - FILTER_BAM.out.id, - FILTER_BAM.out.total_bam, - FILTER_BAM.out.total_bai, - FILTER_BAM.out.filtered_bam, - FILTER_BAM.out.filtered_bai, - FILTER_BAM.out.unfiltered_flagstat, - FILTER_BAM.out.filtered_flagstat, - FILTER_BAM.out.txt, - quality_score, - ) - PYCOQC_FILTER( - PYCOQC_NO_FILTER.out.id, - PYCOQC_NO_FILTER.out.filtered_bam, - PYCOQC_NO_FILTER.out.filtered_bai, - PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, - PYCOQC_NO_FILTER.out.txt, - quality_score, - PYCOQC_NO_FILTER.out.unfiltered_pyco_json, - ) - MAKE_QC_REPORT( - PYCOQC_FILTER.out.id, - PYCOQC_FILTER.out.unfiltered_flagstat, - PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, - PYCOQC_FILTER.out.filtered_pyco_json, - mapq, - quality_score, - ) + if (params.is_barcoded == true) { + CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out + } else if (params.is_barcoded == false) { + CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out + } + FILTER_BAM( + converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2), + converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(), + mapq, + ) + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf index bf14a51..6f3a711 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf @@ -3,43 +3,44 @@ include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf' include { FILTER_BAM } from '../modules/filter_bam.nf' include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' + workflow FILTERING_AND_QC_FROM_STEP_1 { take: - bams - txts - mapq - quality_score + bams + txts + mapq + quality_score main: - FILTER_BAM(bams, txts, mapq) - PYCOQC_NO_FILTER( - FILTER_BAM.out.id, - FILTER_BAM.out.total_bam, - FILTER_BAM.out.total_bai, - FILTER_BAM.out.filtered_bam, - FILTER_BAM.out.filtered_bai, - FILTER_BAM.out.unfiltered_flagstat, - FILTER_BAM.out.filtered_flagstat, - FILTER_BAM.out.txt, - quality_score, - ) - PYCOQC_FILTER( - PYCOQC_NO_FILTER.out.id, - PYCOQC_NO_FILTER.out.filtered_bam, - PYCOQC_NO_FILTER.out.filtered_bai, - PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, - PYCOQC_NO_FILTER.out.txt, - quality_score, - PYCOQC_NO_FILTER.out.unfiltered_pyco_json, - ) - MAKE_QC_REPORT( - PYCOQC_FILTER.out.id, - PYCOQC_FILTER.out.unfiltered_flagstat, - PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, - PYCOQC_FILTER.out.filtered_pyco_json, - mapq, - quality_score, - ) + FILTER_BAM(bams, txts, mapq) + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/MODKIT_AND_MULTIQC.nf b/sub_workflows/MODKIT_AND_MULTIQC.nf index 346e2dd..35294c7 100755 --- a/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -1,5 +1,4 @@ // Import Modules - include { MULTIQC } from '../modules/multiqc.nf' include { MODKIT } from '../modules/modkit.nf' include { MERGE_QC_REPORT } from '../modules//num_reads_report.nf' -- GitLab From 1dfebfb2c31069ef49a4903232781335653c9a3a Mon Sep 17 00:00:00 2001 From: Carlos Gomes Date: Fri, 25 Apr 2025 10:39:32 -0300 Subject: [PATCH 5/5] fix: module flows and script, fix indent and spacing --- modules/basecall.nf | 10 +++--- modules/calculate_coverage.nf | 1 - modules/convert_input_from_minknow.nf | 49 +++++++++++---------------- modules/filter_bam.nf | 19 +++++------ modules/modkit.nf | 47 +++++++++++-------------- modules/multiqc.nf | 1 - modules/num_reads_report.nf | 2 -- modules/pycoqc.nf | 2 -- 8 files changed, 54 insertions(+), 77 deletions(-) diff --git a/modules/basecall.nf b/modules/basecall.nf index ecf8a8d..e1da10c 100755 --- a/modules/basecall.nf +++ b/modules/basecall.nf @@ -53,6 +53,7 @@ process BASECALL_CPU { dorado summary "${id}.bam" > "${id}.txt" """ } + process BASECALL_CPU_DEMUX { publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'cpu' @@ -142,13 +143,14 @@ process BASECALL_GPU { else dorado basecaller "${speed},${mods}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi - else + else if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi fi + samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" @@ -186,7 +188,6 @@ process BASECALL_GPU_DEMUX { fi else if [[ "${mods}" == "false" ]]; then - dorado basecaller "${speed}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" @@ -205,9 +206,10 @@ process BASECALL_GPU_DEMUX { cd ./demux_data/ for file in *; do - samtools sort -@ -12 "\$file" -o "${id}_\${file}" - rm "\$file" + samtools sort -@ -12 "\$file" -o "${id}_\${file}" + rm "\$file" done + cd ../ rm "${id}.bam" mv ./demux_data/* ./ diff --git a/modules/calculate_coverage.nf b/modules/calculate_coverage.nf index f318b04..d880204 100755 --- a/modules/calculate_coverage.nf +++ b/modules/calculate_coverage.nf @@ -24,7 +24,6 @@ process CALCULATE_COVERAGE { """ } process MERGE_COVERAGE { - publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true label 'cpu' diff --git a/modules/convert_input_from_minknow.nf b/modules/convert_input_from_minknow.nf index e404950..5815965 100755 --- a/modules/convert_input_from_minknow.nf +++ b/modules/convert_input_from_minknow.nf @@ -1,20 +1,18 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { - publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true label 'cpu' input: - path input + path input output: - path "*.bam", emit: bam - path "*.txt", emit: txt + path "*.bam", emit: bam + path "*.txt", emit: txt script: - """ + """ # Define the input directory path input_dir="${input.toString()}" - # Check if the input directory exists if [ -d "\${input_dir}" ]; then echo "Input directory exists." @@ -22,22 +20,19 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { echo "Input directory does not exist." exit 1 fi - # Find all 'pass' directories, follow symlinks with -L - find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do - # Find all 'barcode' directories within each 'pass' directory + find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do + # Find all 'barcode' directories within each 'pass' directory find -L "\$pass_dir" -type d -name '*barcode*' -print0 -o -type d -name '*unclassified*' -print0 | while IFS= read -r -d '' barcode_dir; do - # Get the name of the directory two levels above the barcode directory + # Get the name of the directory two levels above the barcode directory parent_dir_two_above=\$(basename \$(dirname \$(dirname "\$barcode_dir"))) - # Check if the barcode directory contains any BAM files - bam_files=(\$(find -L "\$barcode_dir" -type f -name '*.bam' | grep -v 'iltered')) - if [ \${#bam_files[@]} -gt 0 ]; then + bam_files=(\$(find -L "\$barcode_dir" -type f -name '*.bam' | grep -v 'iltered')) + if [ \${#bam_files[@]} -gt 0 ]; then # Merge BAM files using samtools barcode_name=\$(basename "\$barcode_dir") output_bam="./\${parent_dir_two_above}_\${barcode_name}.bam" samtools merge "./\$output_bam" "\${bam_files[@]}" - # Generate summary with dorado dorado summary "\$output_bam" > "./\${parent_dir_two_above}_\${barcode_name}.txt" fi @@ -48,7 +43,6 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { - publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true label 'cpu' @@ -63,7 +57,6 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { """ # Define the input directory path input_dir="${input.toString()}" - # Check if the input directory exists if [ -d "\${input_dir}" ]; then echo "Input directory exists." @@ -71,21 +64,19 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { echo "Input directory does not exist." exit 1 fi - # Find all 'pass' directories, follow symlinks with -L find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do - - ## Get the name of the input directory for file naming - parent_dir="\${pass_dir%/*}"; parent_dir="\${parent_dir##*/}" - # Check if the pass directory contains any BAM files - bam_files=(\$(find -L "\$pass_dir" -type f -name '*.bam' | grep -v 'iltered')) - if [ \${#bam_files[@]} -gt 0 ]; then - # Merge BAM files using samtools - output_bam="./\${parent_dir}.bam" - samtools merge "./\$output_bam" "\${bam_files[@]}" - # Generate summary with dorado - dorado summary "\$output_bam" > "./\${parent_dir}.txt" - fi + ## Get the name of the input directory for file naming + parent_dir="\${pass_dir%/*}"; parent_dir="\${parent_dir##*/}" + # Check if the pass directory contains any BAM files + bam_files=(\$(find -L "\$pass_dir" -type f -name '*.bam' | grep -v 'iltered')) + if [ \${#bam_files[@]} -gt 0 ]; then + # Merge BAM files using samtools + output_bam="./\${parent_dir}.bam" + samtools merge "./\$output_bam" "\${bam_files[@]}" + # Generate summary with dorado + dorado summary "\$output_bam" > "./\${parent_dir}.txt" + fi done """ } diff --git a/modules/filter_bam.nf b/modules/filter_bam.nf index 9eac4de..86d54e1 100755 --- a/modules/filter_bam.nf +++ b/modules/filter_bam.nf @@ -1,5 +1,4 @@ process FILTER_BAM { - publishDir "results/${params.out_dir}/bam_filtering/", mode: "copy", pattern: "*.ba*", overwrite: true publishDir "results/${params.out_dir}/multiqc_input/minimap2/", mode: "copy", pattern: "*.*stat", overwrite: true label 'cpu' @@ -31,17 +30,15 @@ process FILTER_BAM { samtools index "${id}-Filtered_primary_mapq_${mapq}.bam" samtools flagstat "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.flagstat" samtools idxstats "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.idxstat" - - var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat") - - ## If flagstat file says there are no mapped reads then delete bam and bai files. + var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat") + ## If flagstat file says there are no mapped reads then delete bam and bai files. ## Since we passed optional on the output statement lines the script should still run fine even - ## When the files are deleted. However, I never used optional before so I am not 100% sure - if [ \$var -lt "${params.min_mapped_reads_thresh}" ]; then - rm ${id}-* - else - id="${id}" - cp "${txt}" "${id}_sequencing_summary.txt" + ## When the files are deleted. However, I never used optional before so I am not 100% sure + if [ \$var -lt "${params.min_mapped_reads_thresh}" ]; then + rm ${id}-* + else + id="${id}" + cp "${txt}" "${id}_sequencing_summary.txt" fi rm "intermediate.bam" """ diff --git a/modules/modkit.nf b/modules/modkit.nf index fb436bc..16b98bc 100755 --- a/modules/modkit.nf +++ b/modules/modkit.nf @@ -1,32 +1,25 @@ process MODKIT { + publishDir "${params.steps_2_and_3_input_directory}/modkit/", mode: "copy", pattern: "*", overwrite: true + label 'cpu' - publishDir "${params.steps_2_and_3_input_directory}/modkit/", mode: "copy", pattern: "*", overwrite: true - label 'cpu' + input: + tuple val(id), path(bam) + path bai - input: - tuple val(id), path(bam) - path bai + output: + path "*" - output: - path "*" - - script: - """ - echo "starting modkit" - - ## Make Methylation TSV file - modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" - - echo "extract successful" - - ## Make Methylation Table - modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" - - echo "pileup successful" - - ## Make Summary - modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" - - echo "summary successful" - """ + script: + """ + echo "starting modkit" + ## Make Methylation TSV file + modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" + echo "extract successful" + ## Make Methylation Table + modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" + echo "pileup successful" + ## Make Summary + modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" + echo "summary successful" + """ } diff --git a/modules/multiqc.nf b/modules/multiqc.nf index adca7e4..c2181eb 100755 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -1,5 +1,4 @@ process MULTIQC { - publishDir "${params.steps_2_and_3_input_directory}/multiQC_output", mode: "copy", overwrite: true label 'cpu' diff --git a/modules/num_reads_report.nf b/modules/num_reads_report.nf index 24d2a51..7210826 100755 --- a/modules/num_reads_report.nf +++ b/modules/num_reads_report.nf @@ -1,5 +1,4 @@ process MAKE_QC_REPORT { - publishDir "results/${params.out_dir}/intermediate_qc_reports/number_of_reads/", pattern: "*num_reads.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/read_length/", pattern: "*length.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/quality_score_thresholds/", pattern: "*thresholds.tsv", mode: "copy", overwrite: true @@ -40,7 +39,6 @@ process MAKE_QC_REPORT { } process MERGE_QC_REPORT { - publishDir "${params.steps_2_and_3_input_directory}/reads_report/", pattern: "*", mode: "copy", overwrite: true label 'cpu' diff --git a/modules/pycoqc.nf b/modules/pycoqc.nf index 7fcfa01..be1ff1b 100755 --- a/modules/pycoqc.nf +++ b/modules/pycoqc.nf @@ -1,5 +1,4 @@ process PYCOQC_NO_FILTER { - publishDir "results/${params.out_dir}/pycoqc_no_filter/", mode: 'copy', overwrite: true, pattern: "*-Unfiltered_pycoqc*" label 'cpu' @@ -36,7 +35,6 @@ process PYCOQC_NO_FILTER { } process PYCOQC_FILTER { - publishDir "results/${params.out_dir}/pycoqc_filtered/", mode: 'copy', overwrite: true, pattern: "*-Filtered_pycoqc*" label 'cpu' -- GitLab