diff --git a/modules/basecall.nf b/modules/basecall.nf index ca679c1ca371cd7d4acd5667c955cce2635bba71..e1da10c5d6c2545cd378dad47e9250126cb33b7f 100755 --- a/modules/basecall.nf +++ b/modules/basecall.nf @@ -1,7 +1,5 @@ process FAST5_to_POD5 { - publishDir "results/${params.out_dir}/fast5_to_pod5/${id}/", mode: "copy", overwrite: true - label 'cpu' input: @@ -10,22 +8,16 @@ process FAST5_to_POD5 { output: tuple val("${id}"), path("*.pod5") - script: """ - pod5 convert fast5 *.fast5 --output . --one-to-one . --threads 12 - """ - } process BASECALL_CPU { - - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - + publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'cpu' - + input: tuple val(id), path(pod5_dir) val speed @@ -33,53 +25,37 @@ process BASECALL_CPU { val config val trim val qscore - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . -x cpu --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - fi - fi - + fi samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - dorado summary "${id}.bam" > "${id}.txt" - """ } - process BASECALL_CPU_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - - label 'cpu' input: @@ -90,77 +66,61 @@ process BASECALL_CPU_DEMUX { val trim val qscore val trim_barcode - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt - + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . -x cpu --trim "none" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . -x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . -x cpu --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" > "${id}.bam" fi fi - + samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - if [[ "$trim_barcode" == "true" ]]; then + if [[ "${trim_barcode}" == "true" ]]; then dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" fi cd ./demux_data/ - for file in *; do samtools sort -@ -12 "\$file" -o "${id}_\${file}" rm "\$file" done cd ../ - rm "${id}.bam" - mv ./demux_data/* ./ - rm -r ./demux_data/ - for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done - """ } - -process BASECALL_GPU { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - +process BASECALL_GPU { + publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true label 'gpu' - + input: tuple val(id), path(pod5_dir) val speed @@ -168,52 +128,39 @@ process BASECALL_GPU { val config val trim val qscore - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt - + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "${trim}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "${trim}" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi fi samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" rm "${id}.bam" mv "${id}_sorted.bam" "${id}.bam" - dorado summary "${id}.bam" > "${id}.txt" - """ } process BASECALL_GPU_DEMUX { - publishDir "results/${params.out_dir}/basecalling_output/", mode: "copy", overwrite: true - label 'gpu' input: @@ -224,67 +171,52 @@ process BASECALL_GPU_DEMUX { val trim val qscore val trim_barcode - val devices - path ref + val devices + path ref output: - path("*.bam"), emit: bam - path("*.txt"), emit: txt + path ("*.bam"), emit: bam + path ("*.txt"), emit: txt - script: + script: """ - - if [[ "$config" == "false" ]]; then - - if [[ "$mods" == "false" ]]; then + if [[ "${config}" == "false" ]]; then + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" else dorado basecaller "${speed},${mods}" . --trim "none" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" fi - - else - - if [[ "$mods" == "false" ]]; then - + else + if [[ "${mods}" == "false" ]]; then dorado basecaller "${speed}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - else - dorado basecaller "${speed},${mods}" . --trim "none" --config "${config}" --min-qscore "${qscore}" --reference "${ref}" --device "cuda:${devices}" > "${id}.bam" - fi fi - samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" - rm "${id}.bam" - mv "${id}_sorted.bam" "${id}.bam" + samtools sort -@ -12 "${id}.bam" -o "${id}_sorted.bam" + rm "${id}.bam" + mv "${id}_sorted.bam" "${id}.bam" - - if [[ "$trim_barcode" == "true" ]]; then + if [[ "${trim_barcode}" == "true" ]]; then dorado demux --output-dir "./demux_data/" --no-classify "${id}.bam" else dorado demux --no-trim --output-dir "./demux_data/" --no-classify "${id}.bam" fi cd ./demux_data/ - for file in *; do - samtools sort -@ -12 "\$file" -o "${id}_\${file}" - rm "\$file" + samtools sort -@ -12 "\$file" -o "${id}_\${file}" + rm "\$file" done - + cd ../ - rm "${id}.bam" - mv ./demux_data/* ./ - rm -r ./demux_data/ - for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done - """ } diff --git a/modules/calculate_coverage.nf b/modules/calculate_coverage.nf index 1e63385db7838b816a0391087fa85b22bc8efdbd..d8802045f1989dc5005f870d41fc0b2074c9460e 100755 --- a/modules/calculate_coverage.nf +++ b/modules/calculate_coverage.nf @@ -1,57 +1,45 @@ process CALCULATE_COVERAGE { - label 'cpu' input: - tuple val(id), path(bam) - path(bai) + tuple val(id), path(bam) + path bai output: - path("*coverage*") + path "*coverage*" script: + """ + echo -e "Chromosome\\t${id}" > ${id}.coverage.tsv + samtools depth -a ${bam} | awk ' + { + sum[\$1] += \$3; + count[\$1]++; + } + END { + for (chr in sum) { + print chr "\\t" sum[chr]/count[chr]; + } + }' >> ${id}.coverage.tsv """ - - echo -e "Chromosome\\t${id}" > ${id}.coverage.tsv - samtools depth -a ${bam} | awk ' - { - sum[\$1] += \$3; - count[\$1]++; - } - END { - for (chr in sum) { - print chr "\\t" sum[chr]/count[chr]; - } - }' >> ${id}.coverage.tsv - - - """ - } - process MERGE_COVERAGE { - - publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true - + publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true label 'cpu' input: - path(coverage_files) + path coverage_files output: - path("merged_coverage.tsv"), emit: output - path("*mqc*"), emit: mqc + path ("merged_coverage.tsv"), emit: output + path ("*mqc*"), emit: mqc script: - """ - - merge_files.py ${coverage_files.join(' ')} merged_coverage.tsv - - echo "# plot_type: 'table'" >> "Chr_Coverage_mqc.tsv" - echo "# id: 'chromosome coverage custom'" >> "Chr_Coverage_mqc.tsv" - echo "# section_name: 'Average chromosome coverage per sample'" >> "Chr_Coverage_mqc.tsv" - cat merged_coverage.tsv >> "Chr_Coverage_mqc.tsv" - - """ - + """ + merge_files.py ${coverage_files.join(' ')} merged_coverage.tsv + echo "# plot_type: 'table'" >> "Chr_Coverage_mqc.tsv" + echo "# id: 'chromosome coverage custom'" >> "Chr_Coverage_mqc.tsv" + echo "# section_name: 'Average chromosome coverage per sample'" >> "Chr_Coverage_mqc.tsv" + cat merged_coverage.tsv >> "Chr_Coverage_mqc.tsv" + """ } diff --git a/modules/convert_input_from_minknow.nf b/modules/convert_input_from_minknow.nf index a906269f485b58eb9a719eb7e7824a4d76c585fd..5815965594d03fa71da782f250df72fb5fe5880d 100755 --- a/modules/convert_input_from_minknow.nf +++ b/modules/convert_input_from_minknow.nf @@ -1,11 +1,10 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { - publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true - label 'cpu' input: path input + output: path "*.bam", emit: bam path "*.txt", emit: txt @@ -14,7 +13,6 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { """ # Define the input directory path input_dir="${input.toString()}" - # Check if the input directory exists if [ -d "\${input_dir}" ]; then echo "Input directory exists." @@ -22,22 +20,19 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { echo "Input directory does not exist." exit 1 fi - # Find all 'pass' directories, follow symlinks with -L - find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do - # Find all 'barcode' directories within each 'pass' directory + find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do + # Find all 'barcode' directories within each 'pass' directory find -L "\$pass_dir" -type d -name '*barcode*' -print0 -o -type d -name '*unclassified*' -print0 | while IFS= read -r -d '' barcode_dir; do - # Get the name of the directory two levels above the barcode directory + # Get the name of the directory two levels above the barcode directory parent_dir_two_above=\$(basename \$(dirname \$(dirname "\$barcode_dir"))) - # Check if the barcode directory contains any BAM files - bam_files=(\$(find -L "\$barcode_dir" -type f -name '*.bam' | grep -v 'iltered')) - if [ \${#bam_files[@]} -gt 0 ]; then + bam_files=(\$(find -L "\$barcode_dir" -type f -name '*.bam' | grep -v 'iltered')) + if [ \${#bam_files[@]} -gt 0 ]; then # Merge BAM files using samtools barcode_name=\$(basename "\$barcode_dir") output_bam="./\${parent_dir_two_above}_\${barcode_name}.bam" samtools merge "./\$output_bam" "\${bam_files[@]}" - # Generate summary with dorado dorado summary "\$output_bam" > "./\${parent_dir_two_above}_\${barcode_name}.txt" fi @@ -48,13 +43,12 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED { process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { - publishDir "results/${params.out_dir}/minknow_converted_input", mode: "copy", overwrite: true - label 'cpu' input: path input + output: path "*.bam", emit: bam path "*.txt", emit: txt @@ -63,7 +57,6 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { """ # Define the input directory path input_dir="${input.toString()}" - # Check if the input directory exists if [ -d "\${input_dir}" ]; then echo "Input directory exists." @@ -71,24 +64,19 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED { echo "Input directory does not exist." exit 1 fi - # Find all 'pass' directories, follow symlinks with -L - find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do - - ## Get the name of the input directory for file naming - parent_dir="\${pass_dir%/*}"; parent_dir="\${parent_dir##*/}" - - # Check if the pass directory contains any BAM files - bam_files=(\$(find -L "\$pass_dir" -type f -name '*.bam' | grep -v 'iltered')) - if [ \${#bam_files[@]} -gt 0 ]; then - # Merge BAM files using samtools - output_bam="./\${parent_dir}.bam" - samtools merge "./\$output_bam" "\${bam_files[@]}" - - # Generate summary with dorado - dorado summary "\$output_bam" > "./\${parent_dir}.txt" - fi + find -L "\${input_dir}" -type d \\( -name 'pass' -o -name '*pass*' \\) -print0 | while IFS= read -r -d '' pass_dir; do + ## Get the name of the input directory for file naming + parent_dir="\${pass_dir%/*}"; parent_dir="\${parent_dir##*/}" + # Check if the pass directory contains any BAM files + bam_files=(\$(find -L "\$pass_dir" -type f -name '*.bam' | grep -v 'iltered')) + if [ \${#bam_files[@]} -gt 0 ]; then + # Merge BAM files using samtools + output_bam="./\${parent_dir}.bam" + samtools merge "./\$output_bam" "\${bam_files[@]}" + # Generate summary with dorado + dorado summary "\$output_bam" > "./\${parent_dir}.txt" + fi done """ } - diff --git a/modules/filter_bam.nf b/modules/filter_bam.nf index 0a7c9d38c0176d1247e3c77b963f19377e29a4d2..86d54e196fae064742f8b9af1ad961a2d436454d 100755 --- a/modules/filter_bam.nf +++ b/modules/filter_bam.nf @@ -1,58 +1,45 @@ process FILTER_BAM { - publishDir "results/${params.out_dir}/bam_filtering/", mode: "copy", pattern: "*.ba*", overwrite: true publishDir "results/${params.out_dir}/multiqc_input/minimap2/", mode: "copy", pattern: "*.*stat", overwrite: true - - label 'cpu' input: - tuple val(id), path(bam) - path(txt) - val(mapq) + tuple val(id), path(bam) + path txt + val mapq output: - env(id), emit: id, optional: true - path("*-Unfiltered.bam"), emit: total_bam, optional: true - path("*-Unfiltered.bam.bai"), emit: total_bai, optional: true - path("*-Filtered*.bam"), emit: filtered_bam, optional: true - path("*-Filtered*.bai"), emit: filtered_bai, optional: true - path("*-Unfiltered.flagstat"), emit: unfiltered_flagstat, optional: true - path("*-Filtered*.flagstat"), emit: filtered_flagstat, optional: true - path("*_sequencing_summary*"), emit: txt, optional: true - path("*.*stat"), emit: multiqc, optional: true + env (id), emit: id, optional: true + path ("*-Unfiltered.bam"), emit: total_bam, optional: true + path ("*-Unfiltered.bam.bai"), emit: total_bai, optional: true + path ("*-Filtered*.bam"), emit: filtered_bam, optional: true + path ("*-Filtered*.bai"), emit: filtered_bai, optional: true + path ("*-Unfiltered.flagstat"), emit: unfiltered_flagstat, optional: true + path ("*-Filtered*.flagstat"), emit: filtered_flagstat, optional: true + path ("*_sequencing_summary*"), emit: txt, optional: true + path ("*.*stat"), emit: multiqc, optional: true script: """ - cp "${bam}" "${id}-Unfiltered.bam" samtools index "${id}-Unfiltered.bam" - - samtools flagstat "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat" + samtools flagstat "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat" samtools idxstats "${id}-Unfiltered.bam" > "${id}-Unfiltered.idxstat" - - samtools view -b -q $mapq -F 2304 -@ 12 $bam > "intermediate.bam" + samtools view -b -q ${mapq} -F 2304 -@ 12 ${bam} > "intermediate.bam" samtools sort -@ 12 "intermediate.bam" -o "${id}-Filtered_primary_mapq_${mapq}.bam" samtools index "${id}-Filtered_primary_mapq_${mapq}.bam" samtools flagstat "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.flagstat" samtools idxstats "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.idxstat" - - - var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat") - - ## If flagstat file says there are no mapped reads then delete bam and bai files. + var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat") + ## If flagstat file says there are no mapped reads then delete bam and bai files. ## Since we passed optional on the output statement lines the script should still run fine even - ## When the files are deleted. However, I never used optional before so I am not 100% sure - if [ \$var -lt "${params.min_mapped_reads_thresh}" ]; then - rm ${id}-* - else - id="${id}" - cp "${txt}" "${id}_sequencing_summary.txt" + ## When the files are deleted. However, I never used optional before so I am not 100% sure + if [ \$var -lt "${params.min_mapped_reads_thresh}" ]; then + rm ${id}-* + else + id="${id}" + cp "${txt}" "${id}_sequencing_summary.txt" fi - rm "intermediate.bam" - """ - } - diff --git a/modules/modkit.nf b/modules/modkit.nf index c7f4a6b7627fce3a9d9387bf7d1bc45c53625f68..16b98bcaca90ab335048408a7ea7aaa8666a249f 100755 --- a/modules/modkit.nf +++ b/modules/modkit.nf @@ -1,38 +1,25 @@ process MODKIT { - publishDir "${params.steps_2_and_3_input_directory}/modkit/", mode: "copy", pattern: "*", overwrite: true - - label 'cpu' input: - tuple val(id), path(bam) - path(bai) + tuple val(id), path(bam) + path bai output: - path("*") + path "*" script: """ - - echo "starting modkit" - + echo "starting modkit" ## Make Methylation TSV file modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv" - - echo "extract successful" - + echo "extract successful" ## Make Methylation Table modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" - - echo "pileup successful" - + echo "pileup successful" ## Make Summary modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt" - - echo "summary successful" - + echo "summary successful" """ - } - diff --git a/modules/multiqc.nf b/modules/multiqc.nf index ced6882049803f4e54d6677ebf1e7d4d3b08a564..c2181eb875f0f5af522bb9fcbd02c7e7f94acc3e 100755 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -1,19 +1,17 @@ process MULTIQC { - publishDir "${params.steps_2_and_3_input_directory}/multiQC_output", mode: "copy", overwrite: true - label 'cpu' input: - path(multiqc_input) - path(merged_coverage) - path(multiqc_config) - - output: - path "*" + path multiqc_input + path merged_coverage + path multiqc_config + + output: + path "*" script: """ - multiqc -c $multiqc_config -n multiQC_report.html . + multiqc -c ${multiqc_config} -n multiQC_report.html . """ } diff --git a/modules/num_reads_report.nf b/modules/num_reads_report.nf index f9f23722e21d1d19f7d2427baf901adf35ab38e0..7210826ee16b1a1f37c872f48324e4802e08dfc2 100755 --- a/modules/num_reads_report.nf +++ b/modules/num_reads_report.nf @@ -1,99 +1,73 @@ process MAKE_QC_REPORT { - publishDir "results/${params.out_dir}/intermediate_qc_reports/number_of_reads/", pattern: "*num_reads.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/read_length/", pattern: "*length.tsv", mode: "copy", overwrite: true publishDir "results/${params.out_dir}/intermediate_qc_reports/quality_score_thresholds/", pattern: "*thresholds.tsv", mode: "copy", overwrite: true - label 'cpu' input: - val(id) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(unfiltered_pyco_json) - path(filtered_pyco_json) - val(mapq) - val(qscore_thresh) + val id + path unfiltered_flagstat + path filtered_flagstat + path unfiltered_pyco_json + path filtered_pyco_json + val mapq + val qscore_thresh output: - path("${id}_num_reads.tsv"), emit: num_reads - path("${id}_read_length.tsv"), emit: read_length - path("${id}_quality_thresholds.tsv"), emit: qscore_thresh + path ("${id}_num_reads.tsv"), emit: num_reads + path ("${id}_read_length.tsv"), emit: read_length + path ("${id}_quality_thresholds.tsv"), emit: qscore_thresh script: """ num_all_reads=\$(jq '.["All Reads"].basecall.reads_number' "${unfiltered_pyco_json}") - num_pass_reads=\$(jq '.["Pass Reads"].basecall.reads_number' "${unfiltered_pyco_json}") - num_primary_alignments=\$(grep "primary mapped" "${unfiltered_flagstat}" | awk {'print \$1}') - num_primary_alignments_filtered=\$(grep "primary mapped" "${filtered_flagstat}" | awk '{print \$1}') - N50_fastq_all=\$(jq '.["All Reads"].basecall.N50' "${unfiltered_pyco_json}") - median_read_length_fastq_all=\$(jq '.["All Reads"].basecall.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_fastq_pass=\$(jq '.["Pass Reads"].basecall.N50' "${unfiltered_pyco_json}") - median_read_length_fastq_pass=\$(jq '.["Pass Reads"].basecall.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_alignment_unfiltered=\$(jq '.["All Reads"].alignment.N50' "${unfiltered_pyco_json}") - median_read_length_alignment_unfiltered=\$(jq '.["All Reads"].alignment.len_percentiles[50]' "${unfiltered_pyco_json}") - N50_alignment_filtered=\$(jq '.["All Reads"].alignment.N50' "${filtered_pyco_json}") - median_read_length_alignment_filtered=\$(jq '.["All Reads"].alignment.len_percentiles[50]' "${filtered_pyco_json}") - echo "${id}\t\${num_all_reads}\t\${num_pass_reads}\t\${num_primary_alignments}\t\${num_primary_alignments_filtered}" > "${id}_num_reads.tsv" - echo "${id}\t\${N50_fastq_all}\t\${median_read_length_fastq_all}\t\${N50_fastq_pass}\t\${median_read_length_fastq_pass}\t\${N50_alignment_unfiltered}\t\${median_read_length_alignment_unfiltered}\t\${N50_alignment_filtered}\t\${median_read_length_alignment_filtered}" > "${id}_read_length.tsv" - echo "${id}\t${qscore_thresh}\t${mapq}" > "${id}_quality_thresholds.tsv" - """ - } process MERGE_QC_REPORT { - publishDir "${params.steps_2_and_3_input_directory}/reads_report/", pattern: "*", mode: "copy", overwrite: true - label 'cpu' input: - path(num_reads) - path(read_length) - path(qscore_thresh) + path num_reads + path read_length + path qscore_thresh output: - path("*") + path "*" script: - """ - + """ echo "# plot_type: 'table'" >> "Number_of_Reads_mqc.tsv" echo "# id: 'number of reads custom'" >> "Number_of_Reads_mqc.tsv" echo "# section_name: 'Number of reads per sample'" >> "Number_of_Reads_mqc.tsv" echo "Sample_ID\tAll Reads\tPass Reads\tPrimary Alignments\tFiltered Primary Alignments (MAPQ)" >> "Number_of_Reads_mqc.tsv" - cat $num_reads >> "Number_of_Reads_mqc.tsv" - - - + cat ${num_reads} >> "Number_of_Reads_mqc.tsv" + echo "# plot_type: 'table'" >> "Read_Length_mqc.tsv" echo "# id: 'read length custom'" >> "Read_Length_mqc.tsv" echo "# section_name: 'Read lengths per sample'" >> "Read_Length_mqc.tsv" echo "Sample_ID\tN50 All Reads\tMedian Read Length All Reads\tN50 Pass Reads\tMedian Read Length Pass Reads\tN50 Primary Alignments\tMedian Read Length Primary Alignments\tN50 Filtered Primary Alignments\tMedian Read Length Filtered Primary Alignments" >> "Read_Length_mqc.tsv" - cat $read_length >> "Read_Length_mqc.tsv" - + cat ${read_length} >> "Read_Length_mqc.tsv" echo "# plot_type: 'table'" >> "Quality_Thresholds_mqc.tsv" echo "# id: 'quality threholds'" >> "Quality_Thresholds_mqc.tsv" echo "# section_name: 'Quality thresholds for each sample'" >> "Quality_Thresholds_mqc.tsv" echo "Sample_ID\tRead Mean Base Quality Score Threshold (PHRED)\tMapping Quality Threshold (MAPQ)" >> "Quality_Thresholds_mqc.tsv" - cat $qscore_thresh >> "Quality_Thresholds_mqc.tsv" - + cat ${qscore_thresh} >> "Quality_Thresholds_mqc.tsv" """ - } diff --git a/modules/pycoqc.nf b/modules/pycoqc.nf index ace2eab5672484da97a3d900c7a39fc17d60e526..be1ff1b9c7e5fe764cd6588ccfe4c7c1316a4720 100755 --- a/modules/pycoqc.nf +++ b/modules/pycoqc.nf @@ -1,81 +1,68 @@ process PYCOQC_NO_FILTER { - publishDir "results/${params.out_dir}/pycoqc_no_filter/", mode: 'copy', overwrite: true, pattern: "*-Unfiltered_pycoqc*" - label 'cpu' input: - val(id) - path(total_bam) - path(total_bai) - path(filtered_bam) - path(filtered_bai) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(seq_summary) - val(quality_score) + val id + path total_bam + path total_bai + path filtered_bam + path filtered_bai + path unfiltered_flagstat + path filtered_flagstat + path seq_summary + val quality_score output: - val("${id}"), emit: id - path("${filtered_bam}"), emit: filtered_bam - path("${filtered_bai}"), emit: filtered_bai - path("${unfiltered_flagstat}"), emit: unfiltered_flagstat - path("${filtered_flagstat}"), emit: filtered_flagstat - path("${seq_summary}"), emit: txt - path("*.html"), emit: unfiltered_pyco_html - path("*.json"), emit: unfiltered_pyco_json - - + val ("${id}"), emit: id + path ("${filtered_bam}"), emit: filtered_bam + path ("${filtered_bai}"), emit: filtered_bai + path ("${unfiltered_flagstat}"), emit: unfiltered_flagstat + path ("${filtered_flagstat}"), emit: filtered_flagstat + path ("${seq_summary}"), emit: txt + path ("*.html"), emit: unfiltered_pyco_html + path ("*.json"), emit: unfiltered_pyco_json + script: """ - - pycoQC -f "${seq_summary}" \ -v \ -a "${total_bam}" \ --min_pass_qual "${quality_score}" \ -o "./${id}-Unfiltered_pycoqc.html" \ -j "./${id}-Unfiltered_pycoqc.json" - """ } process PYCOQC_FILTER { - - publishDir "results/${params.out_dir}/pycoqc_filtered/", mode: 'copy', overwrite: true, pattern: "*-Filtered_pycoqc*" - label 'cpu' input: - val(id) - path(filtered_bam) - path(filtered_bai) - path(unfiltered_flagstat) - path(filtered_flagstat) - path(seq_summary) - val(quality_score) - path(unfiltered_pyco_json) - + val id + path filtered_bam + path filtered_bai + path unfiltered_flagstat + path filtered_flagstat + path seq_summary + val quality_score + path unfiltered_pyco_json output: - val("${id}"), emit: id - path("${unfiltered_flagstat}"), emit: unfiltered_flagstat - path("${filtered_flagstat}"), emit: filtered_flagstat - path("${unfiltered_pyco_json}"), emit: unfiltered_pyco_json - path("*-Filtered*.html"), emit: filtered_pyco_html - path("*-Filtered*.json"), emit: filtered_pyco_json - + val ("${id}"), emit: id + path ("${unfiltered_flagstat}"), emit: unfiltered_flagstat + path ("${filtered_flagstat}"), emit: filtered_flagstat + path ("${unfiltered_pyco_json}"), emit: unfiltered_pyco_json + path ("*-Filtered*.html"), emit: filtered_pyco_html + path ("*-Filtered*.json"), emit: filtered_pyco_json script: """ - pycoQC -f "${seq_summary}" \ -v \ -a "${filtered_bam}" \ --min_pass_qual "${quality_score}" \ -o "./${id}-Filtered_pycoqc.html" \ -j "./${id}-Filtered_pycoqc.json" - """ } diff --git a/sub_workflows/BASECALLING.nf b/sub_workflows/BASECALLING.nf index fcc11dd2ec4a201df832a62996e22993915bae14..1e0a62fb33dd1c0474f5d3c909eeabc1e86790c5 100755 --- a/sub_workflows/BASECALLING.nf +++ b/sub_workflows/BASECALLING.nf @@ -1,9 +1,6 @@ -// Import Modules - -include {FAST5_to_POD5; BASECALL_CPU; BASECALL_CPU_DEMUX; BASECALL_GPU; BASECALL_GPU_DEMUX} from '../modules/basecall.nf' +include { FAST5_to_POD5 ; BASECALL_CPU ; BASECALL_CPU_DEMUX ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf' workflow BASECALLING { - take: pod5_path fast5_path @@ -13,53 +10,34 @@ workflow BASECALLING { trim quality_score trim_barcode - devices - ref - - main: - - - FAST5_to_POD5(fast5_path) - pod5_path = FAST5_to_POD5.out.mix(pod5_path) - - - if (params.basecall_compute?.equalsIgnoreCase("cpu")) { - - if (params.basecall_demux == true) { - - BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - - bams = BASECALL_CPU_DEMUX.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_CPU_DEMUX.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - - } else { - - BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - - bams = BASECALL_CPU.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_CPU.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - + devices + ref + main: + FAST5_to_POD5(fast5_path) + pod5_path = FAST5_to_POD5.out.mix(pod5_path) + if (params.basecall_compute?.equalsIgnoreCase("cpu")) { + if (params.basecall_demux == true) { + BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { + BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } } - - } else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { - - if (params.basecall_demux == true) { - - BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) - - bams = BASECALL_GPU_DEMUX.out.bam.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - txts = BASECALL_GPU_DEMUX.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - - } else { - - BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) - - bams = BASECALL_GPU.out.bam.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = BASECALL_GPU.out.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - + else if (params.basecall_compute?.equalsIgnoreCase("gpu")) { + if (params.basecall_demux == true) { + BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref) + bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } + else { + BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref) + bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) + txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() + } } - - } - } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf index e68e76caad4a43e1f405d10bf427d09415951228..90dc60c6d9260db7d5b5d7666177d211a0067493 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf @@ -1,38 +1,56 @@ // Import Modules - -include {PYCOQC_NO_FILTER ; PYCOQC_FILTER} from '../modules/pycoqc.nf' -include {FILTER_BAM} from '../modules/filter_bam.nf' -include {MAKE_QC_REPORT} from '../modules/num_reads_report.nf' -include {CONVERT_INPUT_FROM_MINKNOW_BARCODED; CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED} from '../modules/convert_input_from_minknow.nf' +include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf' +include { FILTER_BAM } from '../modules/filter_bam.nf' +include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' +include { CONVERT_INPUT_FROM_MINKNOW_BARCODED ; CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED } from '../modules/convert_input_from_minknow.nf' workflow FILTERING_AND_QC_FROM_MINKNOW { - take: input mapq quality_score main: - - if (params.is_barcoded == true) { - CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out - } else if (params.is_barcoded == false) { - CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) - converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out - } - - FILTER_BAM(converted_input.bam.flatten().map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2), - converted_input.txt.toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten(), mapq) - - PYCOQC_NO_FILTER(FILTER_BAM.out.id, FILTER_BAM.out.total_bam, FILTER_BAM.out.total_bai, FILTER_BAM.out.filtered_bam, - FILTER_BAM.out.filtered_bai, FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, quality_score) - - PYCOQC_FILTER(PYCOQC_NO_FILTER.out.id, PYCOQC_NO_FILTER.out.filtered_bam, PYCOQC_NO_FILTER.out.filtered_bai, PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, quality_score, PYCOQC_NO_FILTER.out.unfiltered_pyco_json) - - MAKE_QC_REPORT(PYCOQC_FILTER.out.id, PYCOQC_FILTER.out.unfiltered_flagstat, PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, quality_score) - - + if (params.is_barcoded == true) { + CONVERT_INPUT_FROM_MINKNOW_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out + } else if (params.is_barcoded == false) { + CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input) + converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out + } + FILTER_BAM( + converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2), + converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(), + mapq, + ) + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf index abdaa47662f59ae9c80dab19981aa0a677237806..6f3a7113be41b75386ae141a2aab654777e74532 100755 --- a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf +++ b/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf @@ -1,11 +1,10 @@ // Import Modules -include {PYCOQC_NO_FILTER ; PYCOQC_FILTER} from '../modules/pycoqc.nf' -include {FILTER_BAM} from '../modules/filter_bam.nf' -include {MAKE_QC_REPORT} from '../modules/num_reads_report.nf' +include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf' +include { FILTER_BAM } from '../modules/filter_bam.nf' +include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf' workflow FILTERING_AND_QC_FROM_STEP_1 { - take: bams txts @@ -13,18 +12,35 @@ workflow FILTERING_AND_QC_FROM_STEP_1 { quality_score main: - - FILTER_BAM(bams, txts, mapq) - - PYCOQC_NO_FILTER(FILTER_BAM.out.id, FILTER_BAM.out.total_bam, FILTER_BAM.out.total_bai, FILTER_BAM.out.filtered_bam, FILTER_BAM.out.filtered_bai, - FILTER_BAM.out.unfiltered_flagstat, FILTER_BAM.out.filtered_flagstat, FILTER_BAM.out.txt, quality_score) - - PYCOQC_FILTER(PYCOQC_NO_FILTER.out.id, PYCOQC_NO_FILTER.out.filtered_bam, PYCOQC_NO_FILTER.out.filtered_bai, PYCOQC_NO_FILTER.out.unfiltered_flagstat, - PYCOQC_NO_FILTER.out.filtered_flagstat, PYCOQC_NO_FILTER.out.txt, quality_score, PYCOQC_NO_FILTER.out.unfiltered_pyco_json) - - - MAKE_QC_REPORT(PYCOQC_FILTER.out.id, PYCOQC_FILTER.out.unfiltered_flagstat, PYCOQC_FILTER.out.filtered_flagstat, - PYCOQC_FILTER.out.unfiltered_pyco_json, PYCOQC_FILTER.out.filtered_pyco_json, mapq, quality_score) - - + FILTER_BAM(bams, txts, mapq) + PYCOQC_NO_FILTER( + FILTER_BAM.out.id, + FILTER_BAM.out.total_bam, + FILTER_BAM.out.total_bai, + FILTER_BAM.out.filtered_bam, + FILTER_BAM.out.filtered_bai, + FILTER_BAM.out.unfiltered_flagstat, + FILTER_BAM.out.filtered_flagstat, + FILTER_BAM.out.txt, + quality_score, + ) + PYCOQC_FILTER( + PYCOQC_NO_FILTER.out.id, + PYCOQC_NO_FILTER.out.filtered_bam, + PYCOQC_NO_FILTER.out.filtered_bai, + PYCOQC_NO_FILTER.out.unfiltered_flagstat, + PYCOQC_NO_FILTER.out.filtered_flagstat, + PYCOQC_NO_FILTER.out.txt, + quality_score, + PYCOQC_NO_FILTER.out.unfiltered_pyco_json, + ) + MAKE_QC_REPORT( + PYCOQC_FILTER.out.id, + PYCOQC_FILTER.out.unfiltered_flagstat, + PYCOQC_FILTER.out.filtered_flagstat, + PYCOQC_FILTER.out.unfiltered_pyco_json, + PYCOQC_FILTER.out.filtered_pyco_json, + mapq, + quality_score, + ) } diff --git a/sub_workflows/MODKIT_AND_MULTIQC.nf b/sub_workflows/MODKIT_AND_MULTIQC.nf index 3fa7df684b77a19f96dc754a1584f8e479e6be13..35294c78f20ee2aef614636017fef98b5f09af10 100755 --- a/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -1,12 +1,10 @@ // Import Modules - -include {MULTIQC} from '../modules/multiqc.nf' -include {MODKIT} from '../modules/modkit.nf' -include {MERGE_QC_REPORT} from '../modules//num_reads_report.nf' -include {CALCULATE_COVERAGE; MERGE_COVERAGE} from "../modules/calculate_coverage.nf" +include { MULTIQC } from '../modules/multiqc.nf' +include { MODKIT } from '../modules/modkit.nf' +include { MERGE_QC_REPORT } from '../modules//num_reads_report.nf' +include { CALCULATE_COVERAGE ; MERGE_COVERAGE } from "../modules/calculate_coverage.nf" workflow MODKIT_AND_MULTIQC { - take: filtered_bams filtered_bais @@ -16,18 +14,10 @@ workflow MODKIT_AND_MULTIQC { multiqc_config multiqc_input - - main: - - CALCULATE_COVERAGE(filtered_bams, filtered_bais) - - MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) - + CALCULATE_COVERAGE(filtered_bams, filtered_bais) + MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect()) - MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config) - MODKIT(filtered_bams, filtered_bais) - } diff --git a/workflow/main.nf b/workflow/main.nf index be8790855739d55c90f436599b90097ad922fa34..6b45ca519e429009579891ee0ac54220b26ab312 100755 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,73 +1,49 @@ // Make this pipeline a nextflow 2 implementation nextflow.enable.dsl=2 - if (params.step.toString() == "1") { -log.info """ - STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT - Bernardo Aguzzoli Heberle -====================================================================================================================================================================================== - -path containing samples and files to be basecalled (basecall only) : ${params.basecall_path} -basecall speed (basecall only) : ${params.basecall_speed} -basecall modifications (basecall only) : ${params.basecall_mods} -basecall config (If "false" the basecaller will automatically pick one) : ${params.basecall_config} -basecall read trimming option : ${params.basecall_trim} -basecall quality score threshold for basecalling : ${params.qscore_thresh} -basecall demultiplexing : ${params.basecall_demux} -trim barcodes during demultiplexing : ${params.trim_barcode} -submission output file prefix : ${params.prefix} -GPU device for submission : ${params.gpu_devices} - -Output directory : ${params.out_dir} - -====================================================================================================================================================================================== - + log.info """ +================================================================= +STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT +================================================================= +basecall files path containing : ${params.basecall_path} +basecall speed (basecall only) : ${params.basecall_speed} +basecall modifications (basecall only) : ${params.basecall_mods} +basecall config : ${params.basecall_config} +basecall read trimming option : ${params.basecall_trim} +basecall quality score threshold for basecalling : ${params.qscore_thresh} +basecall demultiplexing : ${params.basecall_demux} +trim barcodes during demultiplexing : ${params.trim_barcode} +submission output file prefix : ${params.prefix} +GPU device for submission : ${params.gpu_devices} +Output directory : ${params.out_dir} +================================================================= """ - } else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") { - -log.info """ - STEP 2 - FILTERING AND QUALITY CONTROL - Bernardo Aguzzoli Heberle -====================================================================================================================================================================================== - -Input directory - should be the Output Directory from step 1 : ${params.steps_2_and_3_input_directory} - -Basecall quality score threshold for basecalling (make sure it is the same as in step 1) : ${params.qscore_thresh} - -MAPQ filtering threshold : ${params.mapq} - -Minimum number of mapped reads per sample/barcode for a file to be included in analysis : ${params.min_mapped_reads_thresh} - -Are input bam files barcoded? (Only relevant to "step_2_from_minknow" option) : ${params.is_barcoded} -====================================================================================================================================================================================== - +} else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") { + log.info """ +====================================== +STEP 2 - FILTERING AND QUALITY CONTROL +====================================== +Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory} +Basecall quality score threshold : ${params.qscore_thresh} +MAPQ filtering threshold : ${params.mapq} +Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh} +BAM files barcoded? : ${params.is_barcoded} +====================================== """ - - - } else if (params.step.toString() == "3") { - - -log.info """ - STEP 3 - METHYLATION CALLING AND MULTIQC REPORT - Bernardo Aguzzoli Heberle -======================================================================================================================================================================================= - -Input directory - should be the Output Directory from step 1 and Input Directory from step 2 : ${params.steps_2_and_3_input_directory} - -MultiQC configuration file (provided, but may need to be altered for different use cases) : ${params.multiqc_config} - -====================================================================================================================================================================================== - +} else if (params.step.toString() == "3") { + log.info """ +=============================================== +STEP 3 - METHYLATION CALLING AND MULTIQC REPORT +=============================================== +Input directory (input dir from step 2) : ${params.steps_2_and_3_input_directory} +MultiQC configuration file : ${params.multiqc_config} +=============================================== """ - - - - } else { - +} else { println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE" System.exit(1) - - } - - +} // Import Workflows include {BASECALLING} from '../sub_workflows/BASECALLING' @@ -75,25 +51,15 @@ include {FILTERING_AND_QC_FROM_STEP_1} from '../sub_workflows/FILTERING_AND_QC_F include {FILTERING_AND_QC_FROM_MINKNOW} from '../sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf' include {MODKIT_AND_MULTIQC} from '../sub_workflows/MODKIT_AND_MULTIQC.nf' - // Define initial files and channels - if (params.step.toString() == "1") { - if (params.prefix == "None") { - - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - + fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() } else { - - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - - } - + fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + } ref = file(params.ref) quality_score = Channel.value(params.qscore_thresh) basecall_speed = Channel.value(params.basecall_speed) @@ -103,30 +69,16 @@ if (params.step.toString() == "1") { basecall_compute = Channel.value(params.basecall_compute) trim_barcode = Channel.value(params.trim_barcode) devices = Channel.value(params.gpu_devices) - -} - -else if (params.step.toString() == "2_from_step_1") { - +} else if (params.step.toString() == "2_from_step_1") { total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) - -} - -else if (params.step.toString() == "2_from_minknow") { - +} else if (params.step.toString() == "2_from_minknow") { input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/") mapq = Channel.value(params.mapq) quality_score = Channel.value(params.qscore_thresh) - -} - -else if (params.step.toString() == "3") { - - +} else if (params.step.toString() == "3") { filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") @@ -134,35 +86,17 @@ else if (params.step.toString() == "3") { quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") multiqc_config = Channel.fromPath(params.multiqc_config) multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") - - } - - +// Main logic workflow { - if (params.step.toString() == "1") { - BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref) - } - - else if (params.step.toString() == "2_from_step_1") { - + } else if (params.step.toString() == "2_from_step_1") { FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score) - - } - - else if (params.step.toString() == "2_from_minknow") { - + } else if (params.step.toString() == "2_from_minknow") { FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score) - - } - - else if (params.step.toString()== "3") { - + } else if (params.step.toString()== "3") { MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input) - } - } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 55d24823d571739ae15bb7a2ea171400a51ec190..3b104aad4ae276d2ccf00a6a157a83e55fa3b0be 100755 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -1,109 +1,72 @@ // CONFIGURATION FILE -// Pipeline parameter default values, can be modified by user when calling pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) ## +// Pipeline parameter default values, can be modified by user when calling +// pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) // Input reference fasta file params.ref = 'None' - // Step of pipeline to execute params.step = 'None' - // Output directory for pipeline results params.out_dir = "output_directory/" - // directory of basecalling data params.basecall_path = 'None' - // MAPQ filtering threshold for bam files, 0 for no filtering params.mapq = "10" - // Quality score threshold params.qscore_thresh = "9" - // Desired basecall speed params.basecall_speed = "hac" - // Desired basecaller modifications params.basecall_mods = false - // Threshold for mapped reasds params.min_mapped_reads_thresh = 500 - // Desired basecall configuration params.basecall_config = "None" - // Type of read trimming during basecalling ("all", "primers", "adapters", "none") params.basecall_trim = "none" - // Basecalling demultiplexing params.basecall_demux = false - // CPU vs GPU basecalling params.basecall_compute = "gpu" - // Trim barcodes (only counts if demultiplexing is enabled) params.trim_barcode = "True" - // Add prefix to all output files params.prefix = "None" - // Which GPU devices to use for basecalling? params.gpu_devices = "all" - // Previous results params.steps_2_and_3_input_directory = "None" - // MultiQC config params.multiqc_config = "None" - // Are the files from MinKNOW barcoded or not params.is_barcoded = true - - - +// Set queue size for the executor if (params.step == 1) { - queue_size = 1 - } else { - queue_size = 5 - } - process { - // Define local cpu execution - withLabel: cpu { executor='local' } - // Define local gpu execution withLabel: gpu { - executor='local' containerOptions = '--nv' } - // Define the singularity container for every process //container = "library://joaochrusciel/nanopore/ont_methylation:2024-10-18" container = "./images/debian-nanopore.sif" - } - executor { - name = 'local' queueSize = queue_size - } - - apptainer { - enabled = true pullTimeout = '60m' - }