diff --git a/containers/debian-nanopore.def b/containers/debian-nanopore.def index d8d4fa482ce7f29783655c33b443d0cd6ecfbe15..bc1f4a94a022d67f4e32009d59da71ee24a031bd 100644 --- a/containers/debian-nanopore.def +++ b/containers/debian-nanopore.def @@ -8,7 +8,6 @@ From: debian:12 locales \ wget \ git \ - jq="{{ JQ_VERSION }}" \ python3="{{ PYTHON3_VERSION }}" \ python3-pip \ cython3 \ @@ -16,6 +15,11 @@ From: debian:12 bedtools="{{ BEDTOOLS_VERSION }}" \ && apt-get clean && rm -rf /var/lib/apt/lists/* + # Install latest jq from GitHub + JQ_URL="https://github.com/jqlang/jq/releases/download/jq-{{ JQ_VERSION }}/jq-linux-amd64" + wget -O /usr/local/bin/jq "$JQ_URL" + chmod +x /usr/local/bin/jq + # Set timezone and language for container ## ln -fs /usr/share/zoneinfo/America/Sao_Paulo /etc/localtime locale-gen en_US.UTF-8 @@ -26,10 +30,15 @@ From: debian:12 # Install python packages # using no-cache-dir so we dont keep a copy of the downloaded package # and break-system-packages to override PEP 668 (which blocks pip installs) - pip install --no-cache-dir --break-system-packages \ - pod5=={{ POD5_VERSION }} \ - multiqc=={{ MULTIQC_VERSION }} \ - plotly=={{ PLOTLY_VERSION }} + # 1) NumPy/pandas compatíveis com pycoQC + python3 -m pip install --no-cache-dir --break-system-packages \ + "numpy<2.0.0" "pandas<2.2.0" + + # 2) demais libs + python3 -m pip install --no-cache-dir --break-system-packages \ + pod5=={{ POD5_VERSION }} \ + multiqc=={{ MULTIQC_VERSION }} \ + plotly=={{ PLOTLY_VERSION }} # Clone and install pycoQC from duceppemo fork # (change plotly version to match MultiQC plotly version >= 5.18) diff --git a/containers/versions.txt b/containers/versions.txt index 3805f83381a4b9583b08ae7a35402e486f0e7058..84029179a795f53b1bd5c048f1420f974a5238ea 100644 --- a/containers/versions.txt +++ b/containers/versions.txt @@ -1,5 +1,5 @@ -DEBIAN_NANOPORE_VERSION="v0.5.2" -JQ_VERSION="1.6-2.1+deb12u1" +DEBIAN_NANOPORE_VERSION="v0.6.2" +JQ_VERSION="1.7.1" SAMTOOLS_VERSION="1.16.1-1" BEDTOOLS_VERSION="2.30.0+dfsg-3" PYTHON3_VERSION="3.11.2-1+b1" @@ -7,5 +7,5 @@ POD5_VERSION="0.3.23" MULTIQC_VERSION="1.28" PLOTLY_VERSION="5.18.0" PYCOQC_VERION="3.0.0" -MODKIT_VERSION="v0.4.3" -DORADO_VERSION="0.9.1" +MODKIT_VERSION="v0.6.0" +DORADO_VERSION="1.3.0" diff --git a/parameters/human_blood/basecall.yaml b/parameters/human_blood/basecall.yaml index 4e476093e00f11bfe1706200156d3eba6d787120..ce8ff82fe2cfc99fcd169c3869312e51131357f4 100644 --- a/parameters/human_blood/basecall.yaml +++ b/parameters/human_blood/basecall.yaml @@ -1,5 +1,5 @@ project_name: "human_blood" step: 1 -basecall_path: "./data/pod5" +basecall_path: "./data" reference_file: "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" out_dir: "results_human_blood/" \ No newline at end of file diff --git a/parameters/human_blood/modkit.yaml b/parameters/human_blood/modkit.yaml index 5149ad894bc1204d3e42e2a5579b0d4274dcaa6e..3a694f0d54cf2ad28e2bcf93b1ed0bc5fd91fab1 100644 --- a/parameters/human_blood/modkit.yaml +++ b/parameters/human_blood/modkit.yaml @@ -1,4 +1,5 @@ project_name: "results_human_blood" step: 3 steps_2_and_3_input_directory: "./results/results_human_blood/" +reference_file: "./references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" out_dir: "results_human_blood/" \ No newline at end of file diff --git a/src/main.nf b/src/main.nf index c9db6e6d4effe126035ba9e570df45f693fc152d..4c75d3105d41663f01ec71ce75a46fdf3c540459 100755 --- a/src/main.nf +++ b/src/main.nf @@ -1,107 +1,210 @@ -// Import sub-workflows +// import sub-workflows include {BASECALLING} from './sub_workflows/BASECALLING' include {FILTERING_AND_QC_FROM_STEP_1} from './sub_workflows/FILTERING_AND_QC_FROM_STEP_1.nf' include {FILTERING_AND_QC_FROM_MINKNOW} from './sub_workflows/FILTERING_AND_QC_FROM_MINKNOW.nf' include {MODKIT_AND_MULTIQC} from './sub_workflows/MODKIT_AND_MULTIQC.nf' -// Main workflow logic +// main workflow logic workflow { - // Log execution parameters + // log execution parameters if (params.step == 1) { log.info """ ================================================================= - STEP 1 - OXFORD NANOPORE DNA SEQUENCING BASECALLING AND ALIGNMENT + STEP 1 - BASECALLING & ALIGNMENT ================================================================= - basecall files path containing : ${params.basecall_path} - basecall speed (basecall only) : ${params.basecall_speed} - basecall modifications (basecall only) : ${params.basecall_mods} - basecall config : ${params.basecall_config} - basecall read trimming option : ${params.basecall_trim} - basecall quality score threshold for basecalling : ${params.qscore_thresh} - basecall demultiplexing : ${params.basecall_demux} - trim barcodes during demultiplexing : ${params.trimmed_barcodes} - submission output file prefix : ${params.prefix} - GPU device for submission : ${params.gpu_devices} - Output directory : ${params.out_dir} + basecall input path : ${params.basecall_path} + input file types : fast5 / pod5 (auto-detected) + output prefix : ${params.prefix} + basecalling speed : ${params.basecall_speed} + basecall config : ${params.basecall_config} + basecall modifications : ${params.basecall_mods} + read trimming : ${params.basecall_trim} + quality threshold (basecalling) : ${params.qscore_thresh} + demultiplexing enabled : ${params.basecall_demux} + trimmed barcodes : ${params.trimmed_barcodes} + barcoding kit : ${params.barcoding_kit} + GPU devices : ${params.gpu_devices} + pod5 threads : ${params.pod5_threads} + samtools threads : ${params.samtools_threads} + reference fasta : ${params.reference_file} + output directory : ${params.out_dir} ================================================================= - """ + """ } else if (params.step == "2_from_step_1" || params.step == "2_from_minknow") { log.info """ ====================================== - STEP 2 - FILTERING AND QUALITY CONTROL + STEP 2 - FILTERING & QUALITY CONTROL ====================================== - Input directory (output dir from step 1) : ${params.steps_2_and_3_input_directory} - Basecall quality score threshold : ${params.qscore_thresh} - MAPQ filtering threshold : ${params.mapq} - Min number of mapped reads per sample/barcode : ${params.min_mapped_reads_thresh} - BAM files barcoded? : ${params.is_barcoded} + mode / entry point : ${params.step} + input directory (from step 1 / MINKNOW): ${params.steps_2_and_3_input_directory} + basecall quality threshold : ${params.qscore_thresh} + MAPQ filter threshold : ${params.mapq} + min mapped reads per sample/barcode : ${params.min_mapped_reads_thresh} + BAMs are barcoded : ${params.is_barcoded} + samtools threads : ${params.samtools_threads} + expected output dirs : bam_filtering, intermediate_qc_reports, multiqc_input ====================================== """ } else if (params.step == 3) { log.info """ =============================================== - STEP 3 - METHYLATION CALLING AND MULTIQC REPORT + STEP 3 - METHYLATION CALLING & MULTIQC =============================================== - Input directory (input dir from step 2) : ${params.steps_2_and_3_input_directory} - MultiQC configuration file : ${params.multiqc_config} + input directory : ${params.steps_2_and_3_input_directory} + multiqc config : ${params.multiqc_config} + multiqc input path : ${params.steps_2_and_3_input_directory}/multiqc_input + modkit threads : ${params.modkit_threads} + modkit insert size (isize) : ${params.modkit_isize} + modkit extract qsize : ${params.modkit_extract_qsize} + reference fasta : ${params.reference_file} + samtools threads : ${params.samtools_threads} + expected inputs : bam_filtering/*-Filtered*.bam + intermediate_qc_reports =============================================== """ } else { println "ERROR: You must set parameter --step to '1' or '2_from_step_1' or '2_from_minknow' or '3'. Please refer to documentation at: https://gmapsrv.pucrs.br/gitlab/ccd-public/nanopore" System.exit(1) } - // Set initial files and channels - - // always used - samtools_threads = Channel.value(params.samtools_threads) - // step conditionals + // set initial files and channels + // commonly used interchannels + samtools_threads_ch = channel.value(params.samtools_threads) + // step-specific conditionals if (params.step == 1) { if (params.prefix == null) { - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + fast5_path = channel.fromPath("${params.basecall_path}/**.fast5") + .map {file -> + def parent_parts = file.parent.toString().split('/') + def run_id = parent_parts[-3] + def sample = file.simpleName.split('_')[0] + def date_bits = file.simpleName.split('_')[-3..-2].join('_') + tuple("${run_id}_${sample}_${date_bits}", file) + } + .groupTuple() + pod5_path = channel.fromPath("${params.basecall_path}/**.pod5") + .map {file -> + def parent_parts = file.parent.toString().split('/') + def run_id = parent_parts[-3] + def sample = file.simpleName.split('_')[0] + def date_bits = file.simpleName.split('_')[-3..-2].join('_') + tuple("${run_id}_${sample}_${date_bits}", file) + } + .groupTuple() + } else { + fast5_path = channel.fromPath("${params.basecall_path}/**.fast5") + .map {file -> + def parent_parts = file.parent.toString().split('/') + def run_id = parent_parts[-2] + def sample = file.simpleName.split('_')[0] + def date_bits = file.simpleName.split('_')[-3..-2].join('_') + tuple("${params.prefix}_${run_id}_${sample}_${date_bits}", file) + } + .groupTuple() + pod5_path = channel.fromPath("${params.basecall_path}/**.pod5") + .map {file -> + def parent_parts = file.parent.toString().split('/') + def run_id = parent_parts[-2] + def sample = file.simpleName.split('_')[0] + def date_bits = file.simpleName.split('_')[-3..-2].join('_') + tuple("${params.prefix}_${run_id}_${sample}_${date_bits}", file) + } + .groupTuple() + } + // define basecall_arg_val to select speed/mods/config + def basecall_arg_val + if (params.basecall_config != null) { + basecall_arg_val = params.basecall_config + } else if (params.basecall_mods != null) { + basecall_arg_val = "${params.basecall_speed},${params.basecall_mods}" } else { - fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() - pod5_path = Channel.fromPath("${params.basecall_path}/**.pod5").map{file -> tuple("${params.prefix}_" + file.parent.toString().split("/")[-2] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple() + basecall_arg_val = params.basecall_speed } - basecall_speed = params.basecall_speed - basecall_mods = params.basecall_mods - basecall_config = params.basecall_config - basecall_trim = Channel.value(params.basecall_trim) - qscore_thresh = Channel.value(params.qscore_thresh) - barcoding_kit = Channel.value(params.barcoding_kit) - trimmed_barcodes = Channel.value(params.trimmed_barcodes) - gpu_devices = Channel.value(params.gpu_devices) - reference_file = file(params.reference_file) - pod5_threads = Channel.value(params.pod5_threads) + // parameter channels + basecall_arg_ch = channel.value(basecall_arg_val) + basecall_trim_ch = channel.value(params.basecall_trim) + qscore_thresh_ch = channel.value(params.qscore_thresh) + barcoding_kit_ch = channel.value(params.barcoding_kit) + trimmed_barcodes_ch = channel.value(params.trimmed_barcodes) + gpu_devices_ch = channel.value(params.gpu_devices) + reference_file = file(params.reference_file) + pod5_threads_ch = channel.value(params.pod5_threads) } else if (params.step == "2_from_step_1") { - bam_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - txt_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - mapq = Channel.value(params.mapq) - qscore_thresh = Channel.value(params.qscore_thresh) + bam_files = channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam") + .map {file -> tuple(file.baseName, file) } + .toSortedList { a, b -> a[0] <=> b[0] } + .flatten() + .buffer(size: 2) + txt_files = channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt") + .toSortedList { a, b -> a.baseName <=> b.baseName } + .flatten() + mapq_ch = channel.value(params.mapq) + qscore_thresh_ch = channel.value(params.qscore_thresh) } else if (params.step == "2_from_minknow") { - input_dir = Channel.fromPath("${params.steps_2_and_3_input_directory}/") - mapq = Channel.value(params.mapq) - qscore_thresh = Channel.value(params.qscore_thresh) + input_dir = channel.fromPath("${params.steps_2_and_3_input_directory}/") + mapq_ch = channel.value(params.mapq) + qscore_thresh_ch = channel.value(params.qscore_thresh) } else if (params.step == 3) { - filtered_bams = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2) - filtered_bais = Channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten() - num_reads = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") - read_length = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/read_length/*") - quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") - multiqc_config = Channel.fromPath(params.multiqc_config) - multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") - modkit_threads = Channel.value(params.modkit_threads) - modkit_isize = Channel.value(params.modkit_isize) - modkit_extract_qsize = Channel.value(params.modkit_extract_qsize) + filtered_bams = channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam") + .map { file -> tuple(file.baseName, file) } + .toSortedList { a, b -> a[0] <=> b[0] } + .flatten() + .buffer(size: 2) + filtered_bais = channel.fromPath("${params.steps_2_and_3_input_directory}/bam_filtering/*-Filtered*.bam.bai") + .toSortedList { a, b -> a.baseName <=> b.baseName } + .flatten() + num_reads = channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/number_of_reads/*") + read_length = channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/read_length/*") + quality_thresholds = channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*") + multiqc_config = channel.fromPath(params.multiqc_config) + multiqc_input = channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file") + reference_file = file(params.reference_file) + modkit_threads_ch = channel.value(params.modkit_threads) + modkit_isize_ch = channel.value(params.modkit_isize) + modkit_qsize_ch = channel.value(params.modkit_extract_qsize) } - // Run steps + // run steps if (params.step == 1) { - BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file, pod5_threads, samtools_threads) + BASECALLING( + pod5_path, + fast5_path, + basecall_arg_ch, + basecall_trim_ch, + qscore_thresh_ch, + barcoding_kit_ch, + trimmed_barcodes_ch, + gpu_devices_ch, + reference_file, + pod5_threads_ch, + samtools_threads_ch + ) } else if (params.step == "2_from_step_1") { - FILTERING_AND_QC_FROM_STEP_1(bam_files, txt_files, mapq, qscore_thresh, samtools_threads) + FILTERING_AND_QC_FROM_STEP_1( + bam_files, + txt_files, + mapq_ch, + qscore_thresh_ch, + samtools_threads_ch + ) } else if (params.step == "2_from_minknow") { - FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh, samtools_threads) + FILTERING_AND_QC_FROM_MINKNOW( + input_dir, + mapq_ch, + qscore_thresh_ch, + samtools_threads_ch + ) } else if (params.step == 3) { - MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input, samtools_threads, modkit_threads, modkit_isize, modkit_extract_qsize) + MODKIT_AND_MULTIQC( + filtered_bams, + filtered_bais, + num_reads, + read_length, + quality_thresholds, + multiqc_config, + multiqc_input, + reference_file, + samtools_threads_ch, + modkit_threads_ch, + modkit_isize_ch, + modkit_qsize_ch + ) } -} +} \ No newline at end of file diff --git a/src/modules/basecall.nf b/src/modules/basecall.nf index 9ea9c169d4eb17fb17214aa3979122e100de2ef4..7e7fc6c63c3bc2e02e6015c872fd0f4eda0cb859 100755 --- a/src/modules/basecall.nf +++ b/src/modules/basecall.nf @@ -34,18 +34,19 @@ process BASECALL { val samtools_threads output: - path ("*.bam"), emit: bam - path ("*.txt"), emit: txt + path("*.bam"), emit: bam + path("*.txt"), emit: txt script: """ echo "Basecalling started for: ${id}" + dorado basecaller "${basecall_arg}" . \ - ${barcoding_kit != null ? "--kit-name ${barcoding_kit}" : ""} \ - --trim "${basecall_trim}" \ - --min-qscore "${qscore_thresh}" \ - --reference "${reference_file}" \ - --device "cuda:${gpu_devices}" > "${id}.bam" + ${barcoding_kit != null ? "--kit-name ${barcoding_kit}" : ""} \ + --trim "${basecall_trim}" \ + --min-qscore "${qscore_thresh}" \ + --reference "${reference_file}" \ + --device "cuda:${gpu_devices}" > "${id}.bam" echo "Basecalling completed, sorting bams..." samtools sort -@ ${samtools_threads} "${id}.bam" -o "${id}_sorted.bam" @@ -63,24 +64,22 @@ process BASECALL { echo "Demultiplexing completed, sorting barcode files..." cd ./demux_data/ - for file in *; do - samtools sort -@ ${samtools_threads} "\$file" -o "${id}_\$file" - rm "\$file" + + # Find all BAMs and sort them + find . -type f -name "*.bam" -print0 | while IFS= read -r -d '' file; do + base="\$(basename "\$file")" + echo "Sorting demux BAM: \$file" + samtools sort -@ ${samtools_threads} "\$file" -o "../${id}_\${base}" done - cd ../ - mv ./demux_data/* ./ - rm -r ./demux_data/ - + cd .. + rm -rf ./demux_data/ + echo "Bams sorted, generating summary with dorado..." for file in *.bam; do new_id="\${file%%.*}" dorado summary "\$file" > "\${new_id}.txt" done echo "Process completed for: ${id}" - - # echo "Bams sorted, generating summary with dorado..." - # dorado summary "${id}.bam" > "${id}.txt" - # echo "Process completed for: ${id}" """ } \ No newline at end of file diff --git a/src/modules/calculate_coverage.nf b/src/modules/calculate_coverage.nf index 577ea6f782e84065418db40460d8422d2c9c7d8c..274cfa6420eed4641f621fa55671530ed2f57106 100755 --- a/src/modules/calculate_coverage.nf +++ b/src/modules/calculate_coverage.nf @@ -24,23 +24,30 @@ process CALCULATE_COVERAGE { }' >> ${id}.coverage.tsv """ } + process MERGE_COVERAGE { - publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", mode: "copy", pattern: "*", overwrite: true + publishDir "${params.steps_2_and_3_input_directory}/calculate_coverage/", + mode: "copy", + pattern: "*", + overwrite: true + label 'cpu' input: path coverage_files + path merge_script output: - path ("merged_coverage.tsv"), emit: output - path ("*mqc*"), emit: mqc + path("merged_coverage.tsv"), emit: output + path("Chr_Coverage_mqc.tsv"), emit: mqc script: """ - merge_files.py ${coverage_files.join(' ')} merged_coverage.tsv - echo "# plot_type: 'table'" >> "Chr_Coverage_mqc.tsv" + set -euo pipefail + "${merge_script}" ${coverage_files.join(' ')} merged_coverage.tsv + echo "# plot_type: 'table'" > "Chr_Coverage_mqc.tsv" echo "# id: 'chromosome coverage custom'" >> "Chr_Coverage_mqc.tsv" echo "# section_name: 'Average chromosome coverage per sample'" >> "Chr_Coverage_mqc.tsv" cat merged_coverage.tsv >> "Chr_Coverage_mqc.tsv" """ -} +} \ No newline at end of file diff --git a/src/modules/collapse_strands.nf b/src/modules/collapse_strands.nf new file mode 100644 index 0000000000000000000000000000000000000000..f1dee7ced3dd2326b5bf1e48f2f9accab6df1c9b --- /dev/null +++ b/src/modules/collapse_strands.nf @@ -0,0 +1,50 @@ +process COLLAPSE_STRANDS { + tag "${id}" + label 'cpu' + + publishDir "${params.steps_2_and_3_input_directory}/collapsed/", + mode: 'copy', overwrite: true + + input: + tuple val(id), path(pileup_bed) + path reference + + output: + tuple val(id), path("${id}_pileup_collapsed.bed") + + script: + """ + set -euo pipefail + samtools faidx "${reference}" + awk -v FS='\\t' -v OFS='\\t' -v REF="${reference}" ' + BEGIN { + # minus-strand dinucleotide -> plus-strand equivalent + map["TG"]="CA"; map["GT"]="AC"; + map["AG"]="CT"; map["TC"]="GA"; + } + # mtDNA selection (supports MT/chrM/chrMT) + (\$1=="MT" || \$1=="chrM" || \$1=="chrMT") { + if (\$6 == "-") { + cmd = sprintf("samtools faidx %s %s:%d-%d", + REF, \$1, \$2, \$2+1) + cmd | getline hdr; + cmd | getline seq; + close(cmd); + \$2 -= 1; + \$3 -= 1; + if (seq in map) + \$4 = map[seq]; + \$6 = "+"; + } + print; + } + ' "${pileup_bed}" \ + | sort -k1,1 -k2,2n \ + | bedtools groupby -i - \ + -g 1,2,3,4,10 \ + -c 11,12,13 \ + -o sum,sum,sum \ + | awk 'BEGIN{OFS="\\t"} { \$14=(\$11/\$10)*100; print }' \ + > "${id}_pileup_collapsed.bed" + """ +} \ No newline at end of file diff --git a/src/modules/modkit.nf b/src/modules/modkit.nf index d9549373aec63153ac06499c28b6cb393f5075df..5a8d6b6e19811cf1376f30cbaca8f3b86d30b986 100755 --- a/src/modules/modkit.nf +++ b/src/modules/modkit.nf @@ -5,24 +5,58 @@ process MODKIT { input: tuple val(id), path(bam) path bai + path reference_file val modkit_threads val modkit_isize val modkit_extract_qsize output: - path "*" + tuple val(id), path("${id}_modkit_pileup.bed"), emit: pileup + path "*", emit: allfiles script: """ - echo "starting modkit" + echo "[$id] starting base modification probability sampling (sample-probs)" + ## Inspect base modification probabilities + modkit sample-probs \ + --hist \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --out-dir "./modkit_prob/" "${bam}" + echo "[$id] sample-probs completed" + + echo "[$id] starting modkit extract calls (read-level table)" ## Make Methylation TSV file - modkit extract full --bgzf --queue-size ${modkit_extract_qsize} --threads ${modkit_threads} --interval-size ${modkit_isize} --mapped-only "${bam}" "${id}_modkit_output.tsv" - echo "extract successful" - ## Make Methylation Table (--chunk-size is a multiplier of threads, 1.5x by default) - modkit pileup --threads ${modkit_threads} --interval-size ${modkit_isize} "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log" - echo "pileup successful" + modkit extract calls \ + --bgzf \ + --queue-size ${modkit_extract_qsize} \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --reference "${reference_file}" \ + --filter-threshold 0.8 \ + --mapped-only \ + "${bam}" "${id}_modkit_calls" + echo "[$id] extract calls completed" + + echo "[$id] starting modkit pileup (site-level bedMethyl)" + ## Make Methylation Table + modkit pileup \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --filter-threshold 0.8 \ + --modified-bases 6mA 5mC 5hmC \ + --reference "${reference_file}" \ + "${bam}" "${id}_modkit_pileup.bed" \ + --log-filepath "${id}_modkit_pileup.log" + echo "[$id] pileup successful" + + echo "[$id] starting modkit summary (global stats per mod/base)" ## Make Summary - modkit summary --threads ${modkit_threads} --interval-size ${modkit_isize} ${bam} > "${id}_modkit_summary.txt" - echo "summary successful" + modkit summary \ + --threads ${modkit_threads} \ + --interval-size ${modkit_isize} \ + --filter-threshold 0.8 \ + ${bam} > "${id}_modkit_summary.txt" + echo "[$id] summary successful" """ } diff --git a/src/nextflow.config b/src/nextflow.config index 119d5deb70ae64dd79cd828360e771f061475517..bb3ef127946e388681a33de7e3600664bc9180cc 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -5,16 +5,57 @@ cleanup = true params { + // ----------------- + // BASIC SETTINGS + // ----------------- // Project name (used to identify which project you're working on) project_name = "default" - // Input reference fasta file - reference_file = null - // Step of pipeline to execute + // Define run ID to identify the run (e.g. "run01", "run02_1", "run02_2", etc.); based on the sequencing run folder name + run_id = null + // SeaDrive root (~/SeaDrive/My Libraries/My Library/phd_nanopore) + seadrive_root = "${System.getenv('HOME')}/SeaDrive/My Libraries/My Library/phd_nanopore" + // Step of pipeline to execute (1, "2_from_step_1", "2_from_minknow", 3) step = null - // Output directory for pipeline results - out_dir = "results_${params.project_name}/" - // directory of basecalling data - basecall_path = "./data" + + // ----------------- + // INPUTS + // ----------------- + // Reference genome file path (stored in SeaDrive) + reference_file = "${System.getenv('HOME')}/SeaDrive/My Libraries/My Library/phd_nanopore/metadata/references/Homo_sapiens.GRCh38.dna.primary_assembly.fa" + // RAW data input for basecalling + basecall_path = params.run_id + ? "${params.seadrive_root}/raw/${params.run_id}" + : "./data" + // Step 2 (from step 1) reads from basecalling outputs + steps_2_input_directory = params.run_id + ? params.basecalling_out_dir + : null + // Step 3 reads from QC outputs + steps_3_input_directory = params.run_id + ? params.qc_out_dir + : null + + // ------------------ + // OUTPUTS + // ------------------ + // Output directory for pipeline results (legacy version; deprecated) + out_dir = "results_${params.project_name}/" + // Step specific outputs + basecalling_out_dir = params.run_id + ? "${params.seadrive_root}/basecalling/${params.run_id}" + : "results_${params.project_name}/basecalling" + + qc_out_dir = params.run_id + ? "${params.seadrive_root}/qc/${params.run_id}" + : "results_${params.project_name}/qc" + + modkit_out_dir = params.run_id + ? "${params.seadrive_root}/modkit/${params.run_id}" + : "results_${params.project_name}/modkit" + + // ------------------ + // OTHER PARAMETERS + // ------------------ // MAPQ filtering threshold for bam files, 0 for no filtering mapq = "10" // Quality score threshold @@ -22,18 +63,18 @@ params { // Desired basecall speed ("fast", "hac", "sup"; @latest <- latest version available) basecall_speed = "sup@latest" // Desired basecaller modifications (4mC_5mC, 5mCG_5hmCG, 5mC_5hmC, 6mA). Can't use more than one modification per nucleotide. - basecall_mods = "5mC_5hmC" + basecall_mods = "5mC_5hmC,6mA" // Kit name (kit used to barcode the samples (e.g. SQK-RBK114-24); Use null to skip --kit-name in basecalling) barcoding_kit = "SQK-RBK114-24" // Threshold for mapped reasds min_mapped_reads_thresh = 500 - // Desired basecall model version as a path (e.g. ./models/dna_r10.4.1_e8.2_400bps_sup@v5.2.0) + // Basecall model path (e.g. ./models/dna_r10.4.1_e8.2_400bps_sup@v5.2.0); null -> use speed+mods basecall_config = null // Type of read trimming during basecalling ("all", "primers", "adapters", "none"); You should change to "none" if you don't want to trim in the basecalling basecall_trim = "all" // Basecalling demultiplexing basecall_demux = false - // Barcodes were trimmed? (if True = demux will only separate the files; if False = demux will trim after basecalling and separate them) + // Barcodes were trimmed? (if true = demux will only separate the files; if false = demux will trim after basecalling and separate them) trimmed_barcodes = true // Add prefix to all output files prefix = null diff --git a/src/sub_workflows/BASECALLING.nf b/src/sub_workflows/BASECALLING.nf index c457c1f26280b5304cf9d50069ab8b057b24810b..57b756f721f0d3a875b90c870ae217a3718923e8 100755 --- a/src/sub_workflows/BASECALLING.nf +++ b/src/sub_workflows/BASECALLING.nf @@ -4,9 +4,7 @@ workflow BASECALLING { take: pod5_path fast5_path - basecall_speed - basecall_mods - basecall_config + basecall_arg basecall_trim qscore_thresh barcoding_kit @@ -17,23 +15,29 @@ workflow BASECALLING { samtools_threads main: + // convert FAST5 to POD5 FAST5_to_POD5(fast5_path, pod5_threads) pod5_path = FAST5_to_POD5.out.mix(pod5_path) - // Saves model, modifications and speed on a separate variable for the basecall - basecall_arg = null - if(basecall_config != null) { - basecall_arg = basecall_config - } else if(basecall_mods != null) { - basecall_arg = "${basecall_speed},${basecall_mods}" - } else { - basecall_arg = basecall_speed - } - basecall_arg = Channel.value(basecall_arg) - BASECALL(pod5_path, basecall_arg, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file, samtools_threads) - bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2) - txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten() - + // basecalling + BASECALL( + pod5_path, + basecall_arg, + basecall_trim, + qscore_thresh, + barcoding_kit, + trimmed_barcodes, + gpu_devices, + reference_file, + samtools_threads + ) + bams = BASECALL.out.bam + .toSortedList { a, b -> a[0] <=> b[0] } + .flatten() + .buffer(size: 2) + txts = BASECALL.out.txt + .toSortedList { a, b -> a.baseName <=> b.baseName } + .flatten() emit: bam_files = bams txt_files = txts -} +} \ No newline at end of file diff --git a/src/sub_workflows/MODKIT_AND_MULTIQC.nf b/src/sub_workflows/MODKIT_AND_MULTIQC.nf index f2c5812c1adf3029e89092de1008ece107135eae..85aade8621548af7a1c3d070af34533742f0d96d 100755 --- a/src/sub_workflows/MODKIT_AND_MULTIQC.nf +++ b/src/sub_workflows/MODKIT_AND_MULTIQC.nf @@ -1,8 +1,9 @@ -// Import Modules -include { MULTIQC } from '../modules/multiqc.nf' -include { MODKIT } from '../modules/modkit.nf' -include { MERGE_QC_REPORT } from '../modules//num_reads_report.nf' +// import Modules +include { MULTIQC } from '../modules/multiqc.nf' +include { MODKIT } from '../modules/modkit.nf' +include { MERGE_QC_REPORT } from '../modules/num_reads_report.nf' include { CALCULATE_COVERAGE ; MERGE_COVERAGE } from "../modules/calculate_coverage.nf" +include { COLLAPSE_STRANDS } from '../modules/collapse_strands.nf' workflow MODKIT_AND_MULTIQC { take: @@ -13,15 +14,23 @@ workflow MODKIT_AND_MULTIQC { quality_thresholds multiqc_config multiqc_input + reference_file samtools_threads modkit_threads modkit_isize modkit_extract_qsize main: + // run coverage per BAM CALCULATE_COVERAGE(filtered_bams, filtered_bais, samtools_threads) - MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect()) + merge_script_ch = channel.fromPath("merge_files.py") + MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect(), merge_script_ch) + // QC summary tables MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect()) + // multiQC using merged coverage MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config) - MODKIT(filtered_bams, filtered_bais, modkit_threads, modkit_isize, modkit_extract_qsize) -} + // modkit process + MODKIT(filtered_bams, filtered_bais, reference_file, modkit_threads, modkit_isize, modkit_extract_qsize) + // strand collapsing + COLLAPSE_STRANDS(MODKIT.out.pileup, reference_file) +} \ No newline at end of file