Examples

All examples use FASTA files, but the same pipelines should work with FASTQ files, compressed .gz files, VCF records from VariantCallFormat.jl, and XAM.jl (SAM/BAM) record types. More examples can be found in the package tests.
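
For instance, a gzip-compressed FASTQ file can be read with the same pattern as the FASTA examples below. A minimal sketch, assuming a placeholder path reads.fastq.gz:

using BioRecordsProcessing, FASTX, BioSequences

# collect the sequences of a gzipped FASTQ file ("reads.fastq.gz" is a placeholder path)
p = Pipeline(
    Reader(FASTX.FASTQ, File("reads.fastq.gz")),
    record -> sequence(LongDNA{4}, record),
    Collect(LongDNA{4}),
)
run(p)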

Reading a FASTA file into memory

using BioRecordsProcessing, FASTX, BioSequences

# the file contains two 20bp reads
p = Pipeline(
    Reader(FASTX.FASTA, File(filepath)),
    record -> begin
        sequence(LongDNA{4}, record)
    end,
    Collect(LongDNA{4}),
)
run(p)

# output
2-element Vector{LongSequence{DNAAlphabet{4}}}:
 CTTGGCATACTCAAACTCTT
 TGGCATACTCACTAACTCTT

Transforming a FASTA file

using BioRecordsProcessing, FASTX, BioSequences

# the file contains two 20bp reads; trim the first 9bp of each read
p = Pipeline(
    Reader(FASTX.FASTA, File(filepath)),
    record -> begin
        seq = sequence(LongDNA{4}, record)
        FASTA.Record(FASTA.identifier(record), seq[10:end])
    end,
    Writer(FASTX.FASTA, dir; suffix = ".trimmed"),
)
out = run(p)
run(`head $out`)

# output
>seq1
CTCAAACTCTT
>seq2
CACTAACTCTT
Process(`head /var/folders/8g/xj7pzy251n53px06l17vr0_00000gr/T/jl_mL4pM7/test_1.trimmed.fa`, ProcessExited(0))

Reading a pair of FASTA files

using BioRecordsProcessing, FASTX, BioSequences

# the first file in the pair ends with "_1.fasta", the second with "_2.fasta"
p = Pipeline(
    Reader(FASTX.FASTA, File(filepath; second_in_pair = x -> replace(x, "_1" => "_2"))),
    (r1, r2) -> begin
        sequence(LongDNA{4}, r1), sequence(LongDNA{4}, r2)
    end,
    Collect(LongDNA{4}; paired = true),
)
run(p)

# output
2-element Vector{Tuple{LongSequence{DNAAlphabet{4}}, LongSequence{DNAAlphabet{4}}}}:
 (CTTGGCATACTCAAACTCTT, GCAAACTCTTCTTGGCATAC)
 (TGGCATACTCACTAACTCTT, ATACTCAAACTCTTCTTGGC)

Processing all files in a directory

using BioRecordsProcessing, FASTX, BioSequences

p = Pipeline(
    Reader(FASTX.FASTA, Directory(dir, "*.fa")),
    record -> begin
        seq = sequence(LongDNA{4}, record)
        FASTA.Record(FASTA.identifier(record), seq[10:end])
    end,
    Writer(FASTX.FASTA, dir; suffix = ".trimmed"),
)
out = run(p; verbose = false)
basename.(out) # run returns the paths to the output files

# output
2-element Vector{String}:
 "test_1.trimmed.fa"
 "test_2.trimmed.fa"

Writing sequences from memory into a file

using BioRecordsProcessing, FASTX, BioSequences

data = [FASTA.Record("seq1", dna"ATGC")]

p = Pipeline(
    Buffer(data; filename = "test.fa"),
    Writer(FASTX.FASTA, dir),
)
out = run(p)
run(`head $out`)

# output
>seq1
ATGC
Process(`head /var/folders/8g/xj7pzy251n53px06l17vr0_00000gr/T/jl_NSdfEq/test.fa`, ProcessExited(0))

Cookbook

Loading paired-end reads from a BAM file in a genomic interval

In general, records can be grouped using a RecordGrouper. For paired-end BAM files, BAMPairedReadGrouper is provided: it groups the first two records it encounters with the same read name (primary alignments only) and releases them for processing as a pair:

using BioRecordsProcessing, XAM, GenomicFeatures

region = Interval("9", 22331023, 24542023)

p = Pipeline(
    Reader(XAM.BAM, File(bamfile; interval = region)),
    BioRecordsProcessing.BAMPairedReadGrouper(),
    (r1,r2) -> begin 
        (r1,r2)
    end,
    Collect(Tuple{XAM.BAM.Record, XAM.BAM.Record}),
)
out = run(p)
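
The collected pairs are plain tuples of XAM.BAM.Record, so they can be post-processed directly. A minimal follow-up sketch, assuming the out vector produced above, extracting the absolute template length of each pair:

# template length (TLEN) of the first mate of each collected pair
insert_sizes = [abs(BAM.templength(r1)) for (r1, r2) in out]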

Generating artificial FASTQ reads

A generator can be passed as input to a Buffer:

using BioRecordsProcessing, FASTX, BioSequences

generate_read(i) = FASTQ.Record("seq$i", randdnaseq(150), fill(UInt8(40), 150))

p = Pipeline(
    Buffer(generate_read(i) for i in 1:10; filename = "test.fastq"),
    Writer(FASTX.FASTQ, dir),
)
out = run(p)

# output
"/tmp/jl_0mSqQJ/test.fastq"

Paired-end reads can also be generated:

generate_read(i) = FASTQ.Record("seq$i", randdnaseq(150), fill(UInt8(40), 150))
generate_reads(i) = (generate_read(i), generate_read(i))

p = Pipeline(
    Buffer(generate_reads(i) for i in 1:10; filename = "test_R1.fastq.gz"),
    Writer(FASTX.FASTQ, dir; paired = true, second_in_pair = x -> replace(x, "_R1" => "_R2")),
)
out = run(p)

# output
2-element Vector{String}:
 "/tmp/jl_srnqiA/test_R1.fastq.gz"
 "/tmp/jl_srnqiA/test_R2.fastq.gz"

Reading a VCF into a DataFrame

Note: this example probably won't work because of compatibility issues, see https://github.com/rasmushenningsson/VariantCallFormat.jl/issues/5

using BioRecordsProcessing, VariantCallFormat, DataFrames

get_depth(r) = parse(Int, VCF.genotype(r, 1, "DP"))
get_vaf(r) = parse(Float64, VCF.genotype(r, 1, "VAF"))

# define the output type (we could also just use Any)
T = NamedTuple{(:chr, :pos, :ref, :alt, :depth, :vaf, :quality), Tuple{String, Int64, String, String, Int64, Float64, Float64}}

p = Pipeline(
    Reader(VCF, File(filepath)),
    r -> begin
        VCF.filter(r) != ["PASS"] && return nothing # skip variants that are not PASS

        (chr = VCF.chrom(r), pos = VCF.pos(r), ref = VCF.ref(r), alt = VCF.alt(r)[1],
        depth = get_depth(r), vaf = get_vaf(r), quality = VCF.qual(r))
    end,
    Collect(T),
)

df = run(p) |> DataFrame

Aligning a FASTQ file to the reference genome

See this blog post: https://jonathanbieler.github.io/blog/fastq2cnv/
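
As a rough sketch (not taken from the blog post), the alignment itself is typically delegated to an external aligner driven from Julia. This assumes bwa and samtools are installed and uses placeholder paths (reference.fa, reads_R1.fastq.gz, reads_R2.fastq.gz):

# align paired-end reads with bwa and sort them into a BAM file (external tools, placeholder paths)
# the reference must be indexed beforehand, e.g. with `bwa index reference.fa`
run(pipeline(
    `bwa mem reference.fa reads_R1.fastq.gz reads_R2.fastq.gz`,
    `samtools sort -o aligned.bam`,
))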

Converting a BAM file into FASTQ files

using BioRecordsProcessing, XAM, FASTX

p = Pipeline(
    Reader(XAM.BAM, File(bamfile)),
    BioRecordsProcessing.BAMPairedReadGrouper(),
    (r1,r2) -> begin 
        f1 = FASTQ.Record(BAM.tempname(r1), BAM.sequence(r1), BAM.quality(r1))
        f2 = FASTQ.Record(BAM.tempname(r2), BAM.sequence(r2), BAM.quality(r2))
        (f1,f2)
    end,
    Writer(
        FASTQ, dir; 
        paired = true, 
        second_in_pair = x -> x * "_R2", # append _R2 to the BAM file name
        extension = ".fastq.gz" # the output file type differs from the input, so the extension must be provided
    ),
)
run(p; max_records = 100)

# output
2-element Vector{String}:
 "/tmp/jl_srnqiA/bwa.fastq.gz"
 "/tmp/jl_srnqiA/bwa_R2.fastq.gz