Skip to content
Commits on Source (12)
......@@ -120,6 +120,12 @@ The default values for all parameters are set in `src/nexflow.config`. Please no
<Type: String. Adds a prefix to the beggining of your filenames, good when wanting to keep track of batches of data. Example: "Batch_1". Default: "None">
```
```txt
--samtools_threads
<Type: Integer. Global number of threads to use when running samtools. Example: 24. Default: 12>
```
### Step 1: Basecalling
Many of the parameters for this step are based on dorado basecaller, see their [documentation](https://github.com/nanoporetech/dorado) to understand it better.
......@@ -190,6 +196,12 @@ Many of the parameters for this step are based on dorado basecaller, see their [
<Type: String. Which gpu devices to use for basecalling. Only relevant when parameter "--basecall_compute" is set to "gpu". For troubleshooting, you can use the 'nvidia-smi' command to see all the available gpu devices. Options: "0", "0,1,2", ... . Default: "all".
```
```txt
--pod5_threads
<Type: Integer. Number of threads to use when running pod5 convert. Example: 24. Default: 12>
```
### Step 2: Alignment Filtering and Quality Control
```txt
......@@ -224,6 +236,26 @@ Many of the parameters for this step are based on dorado basecaller, see their [
<Type: Path. MultiQC configuration file. We provide a template that works well under "./references/multiqc_config.yaml" in this repository, but you are welcome to customize it as you see fit. Default: "./references/multiqc_config.yaml">
```
> !Note. The following parameters can greatly influence the memory usage. If you're running into `Out of Memory` (OOM) issues, you might want to lower one or more of the default values. Conversely, you can trade memory for cpu overhead by increasing such values.
```txt
--modkit_threads
<Type: Integer. Global number of threads when running modkit. Default: 8>
```
```txt
--modkit_extract_qsize
<Type: Integer. Queue size when running modkit extract full. Default: 1000>
```
```txt
--modkit_isize
<Type: Integer. Interval size when running modkit. Default: 10000>
```
### Parameter Files
The pipeline also supports running [pre-configured parameter files](https://www.nextflow.io/docs/latest/cli.html#pipeline-parameters). The currently supported analyses are under the `parameters/` subdir and can be used via the option `-params-file "./parameters/<type>/<filename>.yaml"` in `nextflow run`. All such files make assumptions about the type of data to be used and where they are being stored.
......
......@@ -51,6 +51,10 @@ workflow {
System.exit(1)
}
// Set initial files and channels
// always used
samtools_threads = Channel.value(params.samtools_threads)
// step conditionals
if (params.step == 1) {
if (params.prefix == "None") {
fast5_path = Channel.fromPath("${params.basecall_path}/**.fast5").map{file -> tuple(file.parent.toString().split("/")[-3] + "_" + file.simpleName.split('_')[0] + "_" + file.simpleName.split('_')[-3..-2].join("_"), file) }.groupTuple()
......@@ -68,6 +72,7 @@ workflow {
trimmed_barcodes = Channel.value(params.trimmed_barcodes)
gpu_devices = Channel.value(params.gpu_devices)
reference_file = file(params.reference_file)
pod5_threads = Channel.value(params.pod5_threads)
} else if (params.step == "2_from_step_1") {
bam_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.bam").map {file -> tuple(file.baseName, file) }.toSortedList( { a, b -> a[0] <=> b[0] } ).flatten().buffer(size:2)
txt_files = Channel.fromPath("${params.steps_2_and_3_input_directory}/basecalling_output/*.txt").toSortedList( { a, b -> a.baseName <=> b.baseName } ).flatten()
......@@ -85,15 +90,18 @@ workflow {
quality_thresholds = Channel.fromPath("${params.steps_2_and_3_input_directory}/intermediate_qc_reports/quality_score_thresholds/*")
multiqc_config = Channel.fromPath(params.multiqc_config)
multiqc_input = Channel.fromPath("${params.steps_2_and_3_input_directory}/multiqc_input/**", type: "file")
modkit_threads = Channel.value(params.modkit_threads)
modkit_isize = Channel.value(params.modkit_isize)
modkit_extract_qsize = Channel.value(params.modkit_extract_qsize)
}
// Run steps
if (params.step == 1) {
BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file)
BASECALLING(pod5_path, fast5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file, pod5_threads, samtools_threads)
} else if (params.step == "2_from_step_1") {
FILTERING_AND_QC_FROM_STEP_1(bam_files, txt_files, mapq, qscore_thresh)
FILTERING_AND_QC_FROM_STEP_1(bam_files, txt_files, mapq, qscore_thresh, samtools_threads)
} else if (params.step == "2_from_minknow") {
FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh)
FILTERING_AND_QC_FROM_MINKNOW(input_dir, mapq, qscore_thresh, samtools_threads)
} else if (params.step == 3) {
MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input)
MODKIT_AND_MULTIQC(filtered_bams, filtered_bais, num_reads, read_length, quality_thresholds, multiqc_config, multiqc_input, samtools_threads, modkit_threads, modkit_isize, modkit_extract_qsize)
}
}
......@@ -4,6 +4,7 @@ process FAST5_to_POD5 {
input:
tuple val(id), path(fast5)
val pod5_threads
output:
tuple val("${id}"), path("*.pod5")
......@@ -13,7 +14,7 @@ process FAST5_to_POD5 {
pod5 convert fast5 *.fast5 \
--output . \
--one-to-one . \
--threads 12
--threads ${pod5_threads}
"""
}
......@@ -32,6 +33,7 @@ process BASECALL {
val trimmed_barcodes
val gpu_devices
path reference_file
val samtools_threads
output:
path ("*.bam"), emit: bam
......@@ -66,7 +68,7 @@ process BASECALL {
fi
echo "Basecalling completed, sorting bams..."
samtools sort -@ 12 "${id}.bam" -o "${id}_sorted.bam"
samtools sort -@ ${samtools_threads} "${id}.bam" -o "${id}_sorted.bam"
rm "${id}.bam"
mv "${id}_sorted.bam" "${id}.bam"
......@@ -82,7 +84,7 @@ process BASECALL {
echo "Demultiplexing completed, sorting barcode files..."
cd ./demux_data/
for file in *; do
samtools sort -@ 12 "\$file" -o "${id}_\$file"
samtools sort -@ ${samtools_threads} "\$file" -o "${id}_\$file"
rm "\$file"
done
......
......@@ -4,6 +4,7 @@ process CALCULATE_COVERAGE {
input:
tuple val(id), path(bam)
path bai
val samtools_threads
output:
path "*coverage*"
......@@ -11,7 +12,7 @@ process CALCULATE_COVERAGE {
script:
"""
echo -e "Chromosome\\t${id}" > ${id}.coverage.tsv
samtools depth -a ${bam} | awk '
samtools depth -@ ${samtools_threads} -a ${bam} | awk '
{
sum[\$1] += \$3;
count[\$1]++;
......
......@@ -4,6 +4,7 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED {
input:
path input
val samtools_threads
output:
path ("*.bam"), emit: bam
......@@ -32,7 +33,7 @@ process CONVERT_INPUT_FROM_MINKNOW_BARCODED {
# Merge BAM files using samtools
barcode_name=\$(basename "\$barcode_dir")
output_bam="./\${parent_dir_two_above}_\${barcode_name}.bam"
samtools merge "./\$output_bam" "\${bam_files[@]}"
samtools merge -@ ${samtools_threads} "./\$output_bam" "\${bam_files[@]}"
# Generate summary with dorado
dorado summary "\$output_bam" > "./\${parent_dir_two_above}_\${barcode_name}.txt"
fi
......@@ -48,6 +49,7 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED {
input:
path input
val samtools_threads
output:
path ("*.bam"), emit: bam
......@@ -73,7 +75,7 @@ process CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED {
if [ \${#bam_files[@]} -gt 0 ]; then
# Merge BAM files using samtools
output_bam="./\${parent_dir}.bam"
samtools merge "./\$output_bam" "\${bam_files[@]}"
samtools merge -@ ${samtools_threads} "./\$output_bam" "\${bam_files[@]}"
# Generate summary with dorado
dorado summary "\$output_bam" > "./\${parent_dir}.txt"
fi
......
......@@ -7,6 +7,7 @@ process FILTER_BAM {
tuple val(id), path(bam)
path txt
val mapq
val samtools_threads
output:
env (id), emit: id, optional: true
......@@ -22,14 +23,14 @@ process FILTER_BAM {
script:
"""
cp "${bam}" "${id}-Unfiltered.bam"
samtools index "${id}-Unfiltered.bam"
samtools flagstat "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat"
samtools idxstats "${id}-Unfiltered.bam" > "${id}-Unfiltered.idxstat"
samtools view -b -q ${mapq} -F 2304 -@ 12 ${bam} > "intermediate.bam"
samtools sort -@ 12 "intermediate.bam" -o "${id}-Filtered_primary_mapq_${mapq}.bam"
samtools index "${id}-Filtered_primary_mapq_${mapq}.bam"
samtools flagstat "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.flagstat"
samtools idxstats "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.idxstat"
samtools index -@ ${samtools_threads} "${id}-Unfiltered.bam"
samtools flagstat -@ ${samtools_threads} "${id}-Unfiltered.bam" > "${id}-Unfiltered.flagstat"
samtools idxstats -@ ${samtools_threads} "${id}-Unfiltered.bam" > "${id}-Unfiltered.idxstat"
samtools view -@ ${samtools_threads} -b -q ${mapq} -F 2304 ${bam} > "intermediate.bam"
samtools sort -@ ${samtools_threads} "intermediate.bam" -o "${id}-Filtered_primary_mapq_${mapq}.bam"
samtools index -@ ${samtools_threads} "${id}-Filtered_primary_mapq_${mapq}.bam"
samtools flagstat -@ ${samtools_threads} "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.flagstat"
samtools idxstats -@ ${samtools_threads} "${id}-Filtered_primary_mapq_${mapq}.bam" > "${id}-Filtered_primary_mapq_${mapq}.idxstat"
var=\$(awk 'NR==8 {print \$1}' "${id}-Filtered_primary_mapq_${mapq}.flagstat")
## If flagstat file says there are no mapped reads then delete bam and bai files.
## Since we passed optional on the output statement lines the script should still run fine even
......
......@@ -5,6 +5,9 @@ process MODKIT {
input:
tuple val(id), path(bam)
path bai
val modkit_threads
val modkit_isize
val modkit_extract_qsize
output:
path "*"
......@@ -13,13 +16,13 @@ process MODKIT {
"""
echo "starting modkit"
## Make Methylation TSV file
modkit extract full --bgzf --queue-size 1000 -t 4 -i 10000 --mapped-only "${bam}" "${id}_modkit_output.tsv"
modkit extract full --bgzf --queue-size ${modkit_extract_qsize} --threads ${modkit_threads} --interval-size ${modkit_isize} --mapped-only "${bam}" "${id}_modkit_output.tsv"
echo "extract successful"
## Make Methylation Table
modkit pileup -t 4 -i 10000 --chunk-size 4 "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log"
## Make Methylation Table (--chunk-size is a multiplier of threads, 1.5x by default)
modkit pileup --threads ${modkit_threads} --interval-size ${modkit_isize} "${bam}" "${id}_modkit_pileup.bed" --log-filepath "${id}_modkit_pileup.log"
echo "pileup successful"
## Make Summary
modkit summary -i 10000 ${bam} > "${id}_modkit_summary.txt"
modkit summary --threads ${modkit_threads} --interval-size ${modkit_isize} ${bam} > "${id}_modkit_summary.txt"
echo "summary successful"
"""
}
......@@ -45,6 +45,14 @@ params {
multiqc_config = "./references/multiqc_config.yaml"
// Are the files from MinKNOW barcoded or not
is_barcoded = true
// pod5
pod5_threads = 12
// samtools
samtools_threads = 12
// modkit
modkit_threads = 8
modkit_extract_qsize = 1000
modkit_isize = 10000
}
// queue_size depends on the step
......
......@@ -13,16 +13,17 @@ workflow BASECALLING {
trimmed_barcodes
gpu_devices
reference_file
pod5_threads
samtools_threads
main:
FAST5_to_POD5(fast5_path)
pod5_path = FAST5_to_POD5.out.mix(pod5_path)
BASECALL(pod5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file)
FAST5_to_POD5(fast5_path, pod5_threads)
pod5_path = FAST5_to_POD5.out.mix(pod5_path)
BASECALL(pod5_path, basecall_speed, basecall_mods, basecall_config, basecall_trim, qscore_thresh, barcoding_kit, trimmed_barcodes, gpu_devices, reference_file, samtools_threads)
bams = BASECALL.out.bam.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2)
txts = BASECALL.out.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten()
emit:
bam_files = bams
txt_files = txts
bam_files = bams
txt_files = txts
}
......@@ -9,19 +9,21 @@ workflow FILTERING_AND_QC_FROM_MINKNOW {
input
mapq
qscore_thresh
samtools_threads
main:
if (params.is_barcoded) {
CONVERT_INPUT_FROM_MINKNOW_BARCODED(input)
CONVERT_INPUT_FROM_MINKNOW_BARCODED(input, samtools_threads)
converted_input = CONVERT_INPUT_FROM_MINKNOW_BARCODED.out
} else {
CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input)
CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED(input, samtools_threads)
converted_input = CONVERT_INPUT_FROM_MINKNOW_NOT_BARCODED.out
}
FILTER_BAM(
converted_input.bam.flatten().map { file -> tuple(file.baseName, file) }.toSortedList { a, b -> a[0] <=> b[0] }.flatten().buffer(size: 2),
converted_input.txt.toSortedList { a, b -> a.baseName <=> b.baseName }.flatten(),
mapq,
samtools_threads
)
PYCOQC_NO_FILTER(
FILTER_BAM.out.id,
......
......@@ -10,9 +10,10 @@ workflow FILTERING_AND_QC_FROM_STEP_1 {
txt_files
mapq
qscore_thresh
samtools_threads
main:
FILTER_BAM(bam_files, txt_files, mapq)
FILTER_BAM(bam_files, txt_files, mapq, samtools_threads)
PYCOQC_NO_FILTER(
FILTER_BAM.out.id,
FILTER_BAM.out.total_bam,
......
......@@ -13,11 +13,15 @@ workflow MODKIT_AND_MULTIQC {
quality_thresholds
multiqc_config
multiqc_input
samtools_threads
modkit_threads
modkit_isize
modkit_extract_qsize
main:
CALCULATE_COVERAGE(filtered_bams, filtered_bais)
CALCULATE_COVERAGE(filtered_bams, filtered_bais, samtools_threads)
MERGE_COVERAGE(CALCULATE_COVERAGE.out.collect())
MERGE_QC_REPORT(num_reads.collect(), read_length.collect(), quality_thresholds.collect())
MULTIQC(multiqc_input.concat(MERGE_QC_REPORT.out.flatten()).collect(), MERGE_COVERAGE.out.mqc, multiqc_config)
MODKIT(filtered_bams, filtered_bais)
MODKIT(filtered_bams, filtered_bais, modkit_threads, modkit_isize, modkit_extract_qsize)
}