diff --git a/README.md b/README.md
index e62a135e9f011d52512839b699c863c780eca26a..ee239bc89d5fb674d3abf3ea3fbdc01bd406f096 100755
--- a/README.md
+++ b/README.md
@@ -254,7 +254,7 @@ The following examples assume your current directory is the root directory of th
 1. STEP 1: GPU basecalling without demultiplexing
 ```sh
-   nextflow ./workflow/main.nf \
+   nextflow ./src/main.nf \
       --basecall_path "$BASECALL_PATH" \
      --basecall_speed "hac" \
      --step 1 \
@@ -274,7 +274,7 @@ The following examples assume your current directory is the root directory of th
 1. STEP 2A: Alignment Filtering and Quality Control from STEP 1
 ```sh
-   nextflow ./workflow/main.nf \
+   nextflow ./src/main.nf \
      --steps_2_and_3_input_directory "./results/$OUTPUT_DIR_NAME/" \
      --min_mapped_reads_thresh 500 \
      --qscore_thresh 9 \
@@ -286,7 +286,7 @@ The following examples assume your current directory is the root directory of th
 1. STEP 2B (MinKNOW): Alignment Filtering and Quality Control from MinKNOW basecalling and alignment (bam files were generated by MinKNOW)
 ```sh
-   nextflow ./workflow/main.nf \
+   nextflow ./src/main.nf \
      --steps_2_and_3_input_directory "./results/$OUTPUT_DIR_NAME/" \
      --min_mapped_reads_thresh 500 \
      --is_barcoded "True" \
@@ -299,7 +299,7 @@ The following examples assume your current directory is the root directory of th
 1. STEP 3: Methylation calling and MultiQC report
 ```sh
-   nextflow ./workflow/main.nf \
+   nextflow ./src/main.nf \
      --steps_2_and_3_input_directory "./results/$OUTPUT_DIR_NAME/" \
      --multiqc_config "./references/multiqc_config.yaml" \
      --step 3 \
diff --git a/assets/NanoporePipeline.drawio b/assets/NanoporePipeline.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..26974a0d261865d05a20d44dd537976b3187adb8
--- /dev/null
+++ b/assets/NanoporePipeline.drawio
@@ -0,0 +1,196 @@
diff --git a/assets/NanoporePipeline.png b/assets/NanoporePipeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..d58ad4e5f0008f9a81e54bbbdf21b0ae61a8197f
Binary files /dev/null and b/assets/NanoporePipeline.png differ
diff --git a/assets/ResumoCodigo.pdf b/assets/ResumoCodigo.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..76c39aa2dedff5679c2d8b076970bcb43f151e5c
Binary files /dev/null and b/assets/ResumoCodigo.pdf differ
diff --git a/workflow/bin/merge_files.py b/src/bin/merge_files.py
similarity index 100%
rename from workflow/bin/merge_files.py
rename to src/bin/merge_files.py
diff --git a/src/configs/queue-basecalling.config b/src/configs/queue-basecalling.config
new file mode 100644
index 0000000000000000000000000000000000000000..d4e6df1ae52a63d6ad0aac0e4d6b26b55960f8d4
--- /dev/null
+++ b/src/configs/queue-basecalling.config
@@ -0,0 +1 @@
+params.queue_size = 1
\ No newline at end of file
diff --git a/src/configs/queue-default.config b/src/configs/queue-default.config
new file mode 100644
index 0000000000000000000000000000000000000000..90330102f5cd6bbae4206dab8a9429f65dd27677
--- /dev/null
+++ b/src/configs/queue-default.config
@@ -0,0 +1 @@
+params.queue_size = 5
\ No newline at end of file
diff --git a/src/main.nf b/src/main.nf
new file mode 100755
index 0000000000000000000000000000000000000000..06e3012936f7326cf82f1f293c385b296d3c4265
--- /dev/null
+++ b/src/main.nf
@@ -0,0 +1,99 @@
+// Import sub-workflows
+include {BASECALLING} from './sub_workflows/BASECALLING'
+include {FILTERING_AND_QC_FROM_STEP_1} from './sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf'
+include {FILTERING_AND_QC_FROM_MINKNOW} from './sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf'
+include {MODKIT_AND_MULTIQC} from './sub_workflows/MODKIT_AND_MULTIQC.nf'
+
+// Main workflow logic
+workflow {
+    // Log execution parameters
+    if (params.step.toString() == "1") {
+        log.info """
+        =================================================================
+        STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT
+        =================================================================
+        basecall files path containing : ${params.basecall_path}
+        basecall speed (basecall only) : ${params.basecall_speed}
+        basecall modifications (basecall only) : ${params.basecall_mods}
+        basecall config : ${params.basecall_config}
+        basecall read trimming option : ${params.basecall_trim}
+        basecall quality score threshold for basecalling : ${params.qscore_thresh}
+        basecall demultiplexing : ${params.basecall_demux}
+        trim barcodes during demultiplexing : ${params.trim_barcode}
+        submission output file prefix : ${params.prefix}
+        GPU device for submission : ${params.gpu_devices}
+        Output directory : ${params.out_dir}
+        =================================================================
+        """
+    } else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") {
+        log.info """
+        ======================================
+        STEP 2 - FILTERING AND QUALITY CONTROL
+        ======================================
+        Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory}
+        Basecall quality score threshold : ${params.qscore_thresh}
+        MAPQ filtering threshold : ${params.mapq}
+        Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh}
+        BAM files barcoded? : ${params.is_barcoded}
+        ======================================
+        """
+    } else if (params.step.toString() == "3") {
+        log.info """
+        ===============================================
+        STEP 3 - METHYLATION CALLING AND MULTIQC REPORT
+        ===============================================
+        Input directory (input dir from step 2) : ${params.steps_2_and_3_input_directory}
+        MultiQC configuration file : ${params.multiqc_config}
+        ===============================================
+        """
+    } else {
+        println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE"
+        System.exit(1)
+    }
+
+    // Set initial files and channels
+    if (params.step.toString() == "1") {
+        if (params.prefix == "None") {
+            fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
+            pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
+        } else {
+            fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
+            pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
+        }
+        ref = file(params.ref)
+        quality_score = Channel.value(params.qscore_thresh)
+        basecall_speed = Channel.value(params.basecall_speed)
+        basecall_mods = Channel.value(params.basecall_mods)
+        basecall_config = Channel.value(params.basecall_config)
+        basecall_trim = Channel.value(params.basecall_trim)
+        // basecall_compute = Channel.value(params.basecall_compute)
+        trim_barcode = Channel.value(params.trim_barcode)
+        devices = Channel.value(params.gpu_devices)
+    } else if (params.step.toString() == "2_from_step_1") {
+        total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2)
+        txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
+        mapq = Channel.value(params.mapq)
+        quality_score = Channel.value(params.qscore_thresh)
+    } else if (params.step.toString() == "2_from_minknow") {
+        input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/")
+        mapq = Channel.value(params.mapq)
+        quality_score = Channel.value(params.qscore_thresh)
+    } else if (params.step.toString() == "3") {
+        filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2)
+        filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
+        num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*")
+        read_length = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/read_length/*")
+        quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*")
+        multiqc_config = Channel.fromPath(params.multiqc_config)
+        multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file")
+    }
+
+    // Run steps
+    if (params.step.toString() == "1") {
+        BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref)
+    } else if (params.step.toString() == "2_from_step_1") {
+        FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score)
+    } else if (params.step.toString() == "2_from_minknow") {
+        FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score)
+    } else if (params.step.toString() == "3") {
+        MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input)
+    }
+}
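The step 1 channels above group chunked read files by a composite key: the directory three levels above each .pod5/.fast5, the first underscore-delimited token of the file name, and the two tokens before the chunk counter. A minimal sketch of that key construction, using a hypothetical MinKNOW-style path (directory and file names are illustrative only):

```nextflow
// Hypothetical pod5 path, for illustration only.
def f = file('/data/sample_01/20240101_1200_X1_FAB12345_abcdef12/pod5/FAB12345_pass_abcdef12_0.pod5')

def key = f.parent.toString().split("/")[-3] + "_" +      // "sample_01"      (directory three levels above the read file)
          f.simpleName.split('_')[0] + "_" +              // "FAB12345"       (first token of the file name)
          f.simpleName.split('_')[-3..-2].join("_")       // "pass_abcdef12"  (two tokens before the chunk counter)

println key   // sample_01_FAB12345_pass_abcdef12
```

`groupTuple()` then bundles every chunk that shares this key into a single basecalling job; when `--prefix` is set, the key is instead built from the prefix plus the directory two levels above the read file.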
diff --git a/modules/basecall.nf b/src/modules/basecall.nf
similarity index 100%
rename from modules/basecall.nf
rename to src/modules/basecall.nf
diff --git a/modules/calculate_coverage.nf b/src/modules/calculate_coverage.nf
similarity index 100%
rename from modules/calculate_coverage.nf
rename to src/modules/calculate_coverage.nf
diff --git a/modules/convert_input_from_minknow.nf b/src/modules/convert_input_from_minknow.nf
similarity index 100%
rename from modules/convert_input_from_minknow.nf
rename to src/modules/convert_input_from_minknow.nf
diff --git a/modules/filter_bam.nf b/src/modules/filter_bam.nf
similarity index 100%
rename from modules/filter_bam.nf
rename to src/modules/filter_bam.nf
diff --git a/modules/modkit.nf b/src/modules/modkit.nf
similarity index 61%
rename from modules/modkit.nf
rename to src/modules/modkit.nf
index 16b98bcaca90ab335048408a7ea7aaa8666a249f..db6d58b9b42410f9df4e5fcefc8ebfccb4d11e97 100755
--- a/modules/modkit.nf
+++ b/src/modules/modkit.nf
@@ -7,19 +7,23 @@ process MODKIT {
         path bai
 
     output:
-        path "*"
-
+        path ("${id}_modkit_output.tsv"), emit: tsv
+        path ("${id}_modkit_pileup.bed"), emit: bed
+        path ("${id}_modkit_pileup.log"), emit: logfile
+        path ("${id}_modkit_summary.txt"), emit: summary
+
     script:
         """
-        echo "starting modkit"
+        set -euo pipefail
+        echo "starting modkit: ${id}"
 
         ## Make Methylation TSV file
         modkit extract full --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv"
-        echo "extract successful"
+        echo "extract successful: ${id}"
 
         ## Make Methylation Table
        modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log"
-        echo "pileup successful"
+        echo "pileup successful: ${id}"
 
         ## Make Summary
         modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt"
-        echo "summary successful"
+        echo "summary successful: ${id}"
         """
 }
diff --git a/modules/multiqc.nf b/src/modules/multiqc.nf
similarity index 51%
rename from modules/multiqc.nf
rename to src/modules/multiqc.nf
index c2181eb875f0f5af522bb9fcbd02c7e7f94acc3e..cc2a196ff008e5e101061804402c105e5d79864b 100755
--- a/modules/multiqc.nf
+++ b/src/modules/multiqc.nf
@@ -1,5 +1,5 @@
 process MULTIQC {
-    publishDir "${params.steps_2_and_3_input_directory}/multiQC_output", mode: "copy", overwrite: true
+    publishDir "${params.steps_2_and_3_input_directory}/multiQC_output/", mode: "copy", overwrite: true
     label 'cpu'
 
     input:
@@ -8,10 +8,14 @@ process MULTIQC {
         path multiqc_config
 
     output:
-        path "*"
-
+        path ("multiQC_report.html"), emit: report_html
+        path ("multiqc_data/"), emit: report_data_dir
+
     script:
-        """
+        """
+        set -euo pipefail
+        echo "Starting MultiQC"
         multiqc -c ${multiqc_config} -n multiQC_report.html .
+        echo "MultiQC finished successfully."
         """
 }
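Replacing the catch-all `path "*"` outputs with named emits makes MODKIT's and MULTIQC's individual files addressable from workflow code. Nothing downstream is rewired by this diff; the sketch below (the workflow name and `view` calls are hypothetical) only illustrates how the new emit names could be consumed:

```nextflow
// Sketch only — assumes it lives under src/, next to main.nf.
include { MODKIT } from './modules/modkit.nf'

workflow INSPECT_MODKIT_OUTPUTS {
    take:
        filtered_bams   // (id, bam) pairs, as prepared for step 3
        filtered_bais

    main:
        MODKIT(filtered_bams, filtered_bais)
        MODKIT.out.bed.view { "methylation pileup: $it" }   // per-sample pileup table
        MODKIT.out.summary.collect().view()                 // all modkit summaries, gathered once
}
```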
""" } diff --git a/modules/num_reads_report.nf b/src/modules/num_reads_report.nf similarity index 89% rename from modules/num_reads_report.nf rename to src/modules/num_reads_report.nf index 7210826ee16b1a1f37c872f48324e4802e08dfc2..0cc94c3055db82ea66d08a466fd6853b6defb529 100755 --- a/modules/num_reads_report.nf +++ b/src/modules/num_reads_report.nf @@ -1,7 +1,5 @@ process MAKE_QC_REPORT { - publishDir "results/${params.out_dir}/intermediate_qc_reports/number_of_reads/", pattern: "*num_reads.tsv", mode: "copy", overwrite: true - publishDir "results/${params.out_dir}/intermediate_qc_reports/read_length/", pattern: "*length.tsv", mode: "copy", overwrite: true - publishDir "results/${params.out_dir}/intermediate_qc_reports/quality_score_thresholds/", pattern: "*thresholds.tsv", mode: "copy", overwrite: true + publishDir "results/${params.out_dir}/intermediate_qc_reports", pattern: "*.tsv", mode: "copy", overwrite: true label 'cpu' input: @@ -20,6 +18,7 @@ process MAKE_QC_REPORT { script: """ + set -euo pipefail num_all_reads=\$(jq '.["All Reads"].basecall.reads_number' "${unfiltered_pyco_json}") num_pass_reads=\$(jq '.["Pass Reads"].basecall.reads_number' "${unfiltered_pyco_json}") num_primary_alignments=\$(grep "primary mapped" "${unfiltered_flagstat}" | awk {'print \$1}') @@ -39,7 +38,7 @@ process MAKE_QC_REPORT { } process MERGE_QC_REPORT { - publishDir "${params.steps_2_and_3_input_directory}/reads_report/", pattern: "*", mode: "copy", overwrite: true + publishDir "${params.steps_2_and_3_input_directory}/reads_report/", pattern: "*.tsv", mode: "copy", overwrite: true label 'cpu' input: @@ -48,10 +47,13 @@ process MERGE_QC_REPORT { path qscore_thresh output: - path "*" + + path "*.tsv" script: - """ + + """ + set -euo pipefail echo "# plot_type: 'table'" >> "Number_of_Reads_mqc.tsv" echo "# id: 'number of reads custom'" >> "Number_of_Reads_mqc.tsv" echo "# section_name: 'Number of reads per sample'" >> "Number_of_Reads_mqc.tsv" diff --git a/modules/pycoqc.nf b/src/modules/pycoqc.nf similarity index 100% rename from modules/pycoqc.nf rename to src/modules/pycoqc.nf diff --git a/src/nextflow.config b/src/nextflow.config new file mode 100755 index 0000000000000000000000000000000000000000..4c73a4929bcd3e803111cc62fc3a6ea3f2e5b546 --- /dev/null +++ b/src/nextflow.config @@ -0,0 +1,75 @@ +// CONFIGURATION FILE + +// Pipeline parameter default values, can be modified by user when calling +// pipeline on command line (e.g. --ont_reads_fq sample_1.fastq) + +params { + // Input reference fasta file + ref = 'None' + // Step of pipeline to execute + step = 'None' + // Output directory for pipeline results + out_dir = "output_directory/" + // directory of basecalling data + basecall_path = 'None' + // MAPQ filtering threshold for bam files, 0 for no filtering + mapq = "10" + // Quality score threshold + qscore_thresh = "9" + // Desired basecall speed + basecall_speed = "hac" + // Desired basecaller modifications + basecall_mods = false + // Threshold for mapped reasds + min_mapped_reads_thresh = 500 + // Desired basecall configuration + basecall_config = "None" + // Type of read trimming during basecalling ("all", "primers", "adapters", "none") + basecall_trim = "none" + // Basecalling demultiplexing + basecall_demux = false + // CPU vs GPU basecalling + basecall_compute = "gpu" + // Trim barcodes (only counts if demultiplexing is enabled) + trim_barcode = "True" + // Add prefix to all output files + prefix = "None" + // Which GPU devices to use for basecalling? 
+ gpu_devices = "all" + // Previous results + steps_2_and_3_input_directory = "None" + // MultiQC config + multiqc_config = "None" + // Are the files from MinKNOW barcoded or not + is_barcoded = true +} + +// queue_size depends on the step +includeConfig ({ + if (params.step == 1) { return './configs/queue-basecalling.config' } + else { return './configs/queue-default.config' } +}()) + +process { + // Define local cpu execution + withLabel: cpu { + executor='local' + } + // Define local gpu execution + withLabel: gpu { + executor='local' + containerOptions = '--nv' + } + // Define the container for every process + container = "./images/debian-nanopore.sif" +} + +executor { + name = 'local' + queueSize = params.queue_size +} + +apptainer { + enabled = true + pullTimeout = '60m' +} diff --git a/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf similarity index 100% rename from sub_workflows/BASECALLING.nf rename to src/sub_workflows/BASECALLING.nf diff --git a/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf similarity index 100% rename from sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf rename to src/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf diff --git a/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf similarity index 100% rename from sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf rename to src/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf diff --git a/sub_workflows/MODKIT_AND_MULTIQC.nf b/src/sub_workflows/MODKIT_AND_MULTIQC.nf similarity index 100% rename from sub_workflows/MODKIT_AND_MULTIQC.nf rename to src/sub_workflows/MODKIT_AND_MULTIQC.nf diff --git a/workflow/main.nf b/workflow/main.nf deleted file mode 100755 index 6b45ca519e429009579891ee0ac54220b26ab312..0000000000000000000000000000000000000000 --- a/workflow/main.nf +++ /dev/null @@ -1,102 +0,0 @@ -// Make this pipeline a nextflow 2 implementation -nextflow.enable.dsl=2 - -if (params.step.toString() == "1") { - log.info """ -================================================================= -STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT -================================================================= -basecall files path containing : ${params.basecall_path} -basecall speed (basecall only) : ${params.basecall_speed} -basecall modifications (basecall only) : ${params.basecall_mods} -basecall config : ${params.basecall_config} -basecall read trimming option : ${params.basecall_trim} -basecall quality score threshold for basecalling : ${params.qscore_thresh} -basecall demultiplexing : ${params.basecall_demux} -trim barcodes during demultiplexing : ${params.trim_barcode} -submission output file prefix : ${params.prefix} -GPU device for submission : ${params.gpu_devices} -Output directory : ${params.out_dir} -================================================================= -""" -} else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") { - log.info """ -====================================== -STEP 2 - FILTERING AND QUALITY CONTROL -====================================== -Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory} -Basecall quality score threshold : ${params.qscore_thresh} -MAPQ filtering threshold : ${params.mapq} -Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh} -BAM files barcoded? 
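The queue size in src/nextflow.config above is resolved by an immediately invoked closure passed to `includeConfig`, which pulls in one of the two one-line files under src/configs/. As a sketch, the selection is equivalent to the ternary below (written with the same string comparison that main.nf applies to `params.step`):

```nextflow
// Sketch only — the shipped config keeps the two values in separate files under src/configs/.
params.queue_size = (params.step.toString() == "1") ? 1 : 5   // 1 while the GPU basecaller runs, 5 for the CPU-bound steps
```

Keeping the values in standalone config files also leaves them easy to override with an additional `-c my.config` on the command line if needed.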
diff --git a/workflow/main.nf b/workflow/main.nf
deleted file mode 100755
index 6b45ca519e429009579891ee0ac54220b26ab312..0000000000000000000000000000000000000000
--- a/workflow/main.nf
+++ /dev/null
@@ -1,102 +0,0 @@
-// Make this pipeline a nextflow 2 implementation
-nextflow.enable.dsl=2
-
-if (params.step.toString() == "1") {
-    log.info """
-=================================================================
-STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT
-=================================================================
-basecall files path containing : ${params.basecall_path}
-basecall speed (basecall only) : ${params.basecall_speed}
-basecall modifications (basecall only) : ${params.basecall_mods}
-basecall config : ${params.basecall_config}
-basecall read trimming option : ${params.basecall_trim}
-basecall quality score threshold for basecalling : ${params.qscore_thresh}
-basecall demultiplexing : ${params.basecall_demux}
-trim barcodes during demultiplexing : ${params.trim_barcode}
-submission output file prefix : ${params.prefix}
-GPU device for submission : ${params.gpu_devices}
-Output directory : ${params.out_dir}
-=================================================================
-"""
-} else if (params.step.toString() == "2_from_step_1" || params.step.toString() == "2_from_minknow") {
-    log.info """
-======================================
-STEP 2 - FILTERING AND QUALITY CONTROL
-======================================
-Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory}
-Basecall quality score threshold : ${params.qscore_thresh}
-MAPQ filtering threshold : ${params.mapq}
-Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh}
-BAM files barcoded? : ${params.is_barcoded}
-======================================
-"""
-} else if (params.step.toString() == "3") {
-    log.info """
-===============================================
-STEP 3 - METHYLATION CALLING AND MULTIQC REPORT
-===============================================
-Input directory (input dir from step 2) : ${params.steps_2_and_3_input_directory}
-MultiQC configuration file : ${params.multiqc_config}
-===============================================
-"""
-} else {
-    println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://github.com/bernardo-heberle/DCNL_NANOPORE_PIPELINE"
-    System.exit(1)
-}
-
-// Import Workflows
-include {BASECALLING} from '../sub_workflows/BASECALLING'
-include {FILTERING_AND_QC_FROM_STEP_1} from '../sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf'
-include {FILTERING_AND_QC_FROM_MINKNOW} from '../sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf'
-include {MODKIT_AND_MULTIQC} from '../sub_workflows/MODKIT_AND_MULTIQC.nf'
-
-// Define initial files and channels
-if (params.step.toString() == "1") {
-    if (params.prefix == "None") {
-        fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
-        pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
-    } else {
-        fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
-        pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
-    }
-    ref = file(params.ref)
-    quality_score = Channel.value(params.qscore_thresh)
-    basecall_speed = Channel.value(params.basecall_speed)
-    basecall_mods = Channel.value(params.basecall_mods)
-    basecall_config = Channel.value(params.basecall_config)
-    basecall_trim = Channel.value(params.basecall_trim)
-    basecall_compute = Channel.value(params.basecall_compute)
-    trim_barcode = Channel.value(params.trim_barcode)
-    devices = Channel.value(params.gpu_devices)
-} else if (params.step.toString() == "2_from_step_1") {
-    total_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2)
-    txts = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
-    mapq = Channel.value(params.mapq)
-    quality_score = Channel.value(params.qscore_thresh)
-} else if (params.step.toString() == "2_from_minknow") {
-    input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/")
-    mapq = Channel.value(params.mapq)
-    quality_score = Channel.value(params.qscore_thresh)
-} else if (params.step.toString() == "3") {
-    filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2)
-    filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
-    num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*")
-    read_length = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/read_length/*")
-    quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*")
-    multiqc_config = Channel.fromPath(params.multiqc_config)
-    multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file")
-}
-
-// Main logic
-workflow {
-    if (params.step.toString() == "1") {
-        BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, quality_score, trim_barcode, devices, ref)
-    } else if (params.step.toString() == "2_from_step_1") {
-        FILTERING_AND_QC_FROM_STEP_1(total_bams, txts, mapq, quality_score)
-    } else if (params.step.toString() == "2_from_minknow") {
-        FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, quality_score)
-    } else if (params.step.toString()== "3") {
-        MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input)
-    }
-}
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
deleted file mode 100755
index 3b104aad4ae276d2ccf00a6a157a83e55fa3b0be..0000000000000000000000000000000000000000
--- a/workflow/nextflow.config
+++ /dev/null
@@ -1,72 +0,0 @@
-// CONFIGURATION FILE
-
-// Pipeline parameter default values, can be modified by user when calling
-// pipeline on command line (e.g. --ont_reads_fq sample_1.fastq)
-
-// Input reference fasta file
-params.ref = 'None'
-// Step of pipeline to execute
-params.step = 'None'
-// Output directory for pipeline results
-params.out_dir = "output_directory/"
-// directory of basecalling data
-params.basecall_path = 'None'
-// MAPQ filtering threshold for bam files, 0 for no filtering
-params.mapq = "10"
-// Quality score threshold
-params.qscore_thresh = "9"
-// Desired basecall speed
-params.basecall_speed = "hac"
-// Desired basecaller modifications
-params.basecall_mods = false
-// Threshold for mapped reasds
-params.min_mapped_reads_thresh = 500
-// Desired basecall configuration
-params.basecall_config = "None"
-// Type of read trimming during basecalling ("all", "primers", "adapters", "none")
-params.basecall_trim = "none"
-// Basecalling demultiplexing
-params.basecall_demux = false
-// CPU vs GPU basecalling
-params.basecall_compute = "gpu"
-// Trim barcodes (only counts if demultiplexing is enabled)
-params.trim_barcode = "True"
-// Add prefix to all output files
-params.prefix = "None"
-// Which GPU devices to use for basecalling?
-params.gpu_devices = "all"
-// Previous results
-params.steps_2_and_3_input_directory = "None"
-// MultiQC config
-params.multiqc_config = "None"
-// Are the files from MinKNOW barcoded or not
-params.is_barcoded = true
-
-// Set queue size for the executor
-if (params.step == 1) {
-    queue_size = 1
-} else {
-    queue_size = 5
-}
-
-process {
-    // Define local cpu execution
-    withLabel: cpu {
-        executor='local'
-    }
-    // Define local gpu execution
-    withLabel: gpu {
-        executor='local'
-        containerOptions = '--nv'
-    }
-    // Define the singularity container for every process
-    //container = "library://joaochrusciel/nanopore/ont_methylation:2024-10-18"
-    container = "./images/debian-nanopore.sif"
-}
-executor {
-    name = 'local'
-    queueSize = queue_size
-}
-apptainer {
-    enabled = true
-    pullTimeout = '60m'
-}
diff --git a/workflow/sub_workflows/BASECALLING.nf b/workflow/sub_workflows/BASECALLING.nf
new file mode 100755
index 0000000000000000000000000000000000000000..1e0a62fb33dd1c0474f5d3c909eeabc1e86790c5
--- /dev/null
+++ b/workflow/sub_workflows/BASECALLING.nf
@@ -0,0 +1,43 @@
+include { FAST5_to_POD5 ; BASECALL_CPU ; BASECALL_CPU_DEMUX ; BASECALL_GPU ; BASECALL_GPU_DEMUX } from '../modules/basecall.nf'
+
+workflow BASECALLING {
+    take:
+        pod5_path
+        fast5_path
+        speed
+        modifications
+        config
+        trim
+        quality_score
+        trim_barcode
+        devices
+        ref
+
+    main:
+        FAST5_to_POD5(fast5_path)
+        pod5_path = FAST5_to_POD5.out.mix(pod5_path)
+
+        if (params.basecall_compute?.equalsIgnoreCase("cpu")) {
+            if (params.basecall_demux == true) {
+                BASECALL_CPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref)
+                bams = BASECALL_CPU_DEMUX.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
+                txts = BASECALL_CPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+            }
+            else {
+                BASECALL_CPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref)
+                bams = BASECALL_CPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
+                txts = BASECALL_CPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+            }
+        }
+        else if (params.basecall_compute?.equalsIgnoreCase("gpu")) {
+            if (params.basecall_demux == true) {
+                BASECALL_GPU_DEMUX(pod5_path, speed, modifications, config, trim, quality_score, trim_barcode, devices, ref)
+                bams = BASECALL_GPU_DEMUX.out.bam.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+                txts = BASECALL_GPU_DEMUX.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+            }
+            else {
+                BASECALL_GPU(pod5_path, speed, modifications, config, trim, quality_score, devices, ref)
+                bams = BASECALL_GPU.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
+                txts = BASECALL_GPU.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
+            }
+        }
+}
diff --git a/workflow/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf b/workflow/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf
new file mode 100755
index 0000000000000000000000000000000000000000..90dc60c6d9260db7d5b5d7666177d211a0067493
--- /dev/null
+++ b/workflow/sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf
@@ -0,0 +1,56 @@
+// Import Modules
+include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf'
+include { FILTER_BAM } from '../modules/filter_bam.nf'
+include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf'
+include { CONVERT_INPUT_FROM_MINKNOW_BARCODED ; CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED } from '../modules/convert_input_from_minknow.nf'
+
+workflow FILTERING_AND_QC_FROM_MINKNOW {
+    take:
+        input
+        mapq
+        quality_score
+
+    main:
+        if (params.is_barcoded == true) {
+            CONVERT_INPUT_FROM_MINKNOW_BARCODED(input)
+            converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out
+        } else if (params.is_barcoded == false) {
+            CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input)
+            converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out
+        }
+
+        FILTER_BAM(
+            converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2),
+            converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(),
+            mapq,
+        )
+
+        PYCOQC_NO_FILTER(
+            FILTER_BAM.out.id,
+            FILTER_BAM.out.total_bam,
+            FILTER_BAM.out.total_bai,
+            FILTER_BAM.out.filtered_bam,
+            FILTER_BAM.out.filtered_bai,
+            FILTER_BAM.out.unfiltered_flagstat,
+            FILTER_BAM.out.filtered_flagstat,
+            FILTER_BAM.out.txt,
+            quality_score,
+        )
+
+        PYCOQC_FILTER(
+            PYCOQC_NO_FILTER.out.id,
+            PYCOQC_NO_FILTER.out.filtered_bam,
+            PYCOQC_NO_FILTER.out.filtered_bai,
+            PYCOQC_NO_FILTER.out.unfiltered_flagstat,
+            PYCOQC_NO_FILTER.out.filtered_flagstat,
+            PYCOQC_NO_FILTER.out.txt,
+            quality_score,
+            PYCOQC_NO_FILTER.out.unfiltered_pyco_json,
+        )
+
+        MAKE_QC_REPORT(
+            PYCOQC_FILTER.out.id,
+            PYCOQC_FILTER.out.unfiltered_flagstat,
+            PYCOQC_FILTER.out.filtered_flagstat,
+            PYCOQC_FILTER.out.unfiltered_pyco_json,
+            PYCOQC_FILTER.out.filtered_pyco_json,
+            mapq,
+            quality_score,
+        )
+}
diff --git a/workflow/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf b/workflow/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf
new file mode 100755
index 0000000000000000000000000000000000000000..6f3a7113be41b75386ae141a2aab654777e74532
--- /dev/null
+++ b/workflow/sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf
@@ -0,0 +1,46 @@
+// Import Modules
+include { PYCOQC_NO_FILTER ; PYCOQC_FILTER } from '../modules/pycoqc.nf'
+include { FILTER_BAM } from '../modules/filter_bam.nf'
+include { MAKE_QC_REPORT } from '../modules/num_reads_report.nf'
+
+workflow FILTERING_AND_QC_FROM_STEP_1 {
+    take:
+        bams
+        txts
+        mapq
+        quality_score
+
+    main:
+        FILTER_BAM(bams, txts, mapq)
+
+        PYCOQC_NO_FILTER(
+            FILTER_BAM.out.id,
+            FILTER_BAM.out.total_bam,
+            FILTER_BAM.out.total_bai,
+            FILTER_BAM.out.filtered_bam,
+            FILTER_BAM.out.filtered_bai,
+            FILTER_BAM.out.unfiltered_flagstat,
+            FILTER_BAM.out.filtered_flagstat,
+            FILTER_BAM.out.txt,
+            quality_score,
+        )
+
+        PYCOQC_FILTER(
+            PYCOQC_NO_FILTER.out.id,
+            PYCOQC_NO_FILTER.out.filtered_bam,
+            PYCOQC_NO_FILTER.out.filtered_bai,
+            PYCOQC_NO_FILTER.out.unfiltered_flagstat,
+            PYCOQC_NO_FILTER.out.filtered_flagstat,
+            PYCOQC_NO_FILTER.out.txt,
+            quality_score,
+            PYCOQC_NO_FILTER.out.unfiltered_pyco_json,
+        )
+
+        MAKE_QC_REPORT(
+            PYCOQC_FILTER.out.id,
+            PYCOQC_FILTER.out.unfiltered_flagstat,
+            PYCOQC_FILTER.out.filtered_flagstat,
+            PYCOQC_FILTER.out.unfiltered_pyco_json,
+            PYCOQC_FILTER.out.filtered_pyco_json,
+            mapq,
+            quality_score,
+        )
+}
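A note on the recurring `toSortedList { ... }.flatten().buffer(size: 2)` chain used in main.nf and in the filtering sub-workflows above: it sorts the `(id, file)` tuples by sample ID, flattens them, and re-pairs them, so the BAM channel ends up in the same order as the companion channels sorted by `baseName`. A minimal sketch with hypothetical sample names:

```nextflow
// Sketch only — sample IDs and file names are illustrative.
Channel
    .of(['sample_B', file('sample_B-Filtered.bam')],
        ['sample_A', file('sample_A-Filtered.bam')])
    .toSortedList { a, b -> a[0] <=> b[0] }   // sort the (id, bam) tuples by sample ID
    .flatten()                                // sample_A, bamA, sample_B, bamB
    .buffer(size: 2)                          // re-pair: [sample_A, bamA], then [sample_B, bamB]
    .view()
```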
diff --git a/workflow/sub_workflows/MODKIT_AND_MULTIQC.nf b/workflow/sub_workflows/MODKIT_AND_MULTIQC.nf
new file mode 100755
index 0000000000000000000000000000000000000000..35294c78f20ee2aef614636017fef98b5f09af10
--- /dev/null
+++ b/workflow/sub_workflows/MODKIT_AND_MULTIQC.nf
@@ -0,0 +1,23 @@
+// Import Modules
+include { MULTIQC } from '../modules/multiqc.nf'
+include { MODKIT } from '../modules/modkit.nf'
+include { MERGE_QC_REPORT } from '../modules/num_reads_report.nf'
+include { CALCULATE_COVERAGE ; MERGE_COVERAGE } from "../modules/calculate_coverage.nf"
+
+workflow MODKIT_AND_MULTIQC {
+    take:
+        filtered_bams
+        filtered_bais
+        num_reads
+        read_length
+        quality_thresholds
+        multiqc_config
+        multiqc_input
+
+    main:
+        CALCULATE_COVERAGE(filtered_bams, filtered_bais)
+        MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect())
+        MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect())
+        MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config)
+        MODKIT(filtered_bams, filtered_bais)
+}