% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vs_fastx_trim_filt.R
\name{vs_fastx_trim_filt}
\alias{vs_fastx_trim_filt}
\alias{vs_fastq_trim_filt}
\alias{vs_fasta_trim_filt}
\alias{fastx_trim_filt}
\alias{trim_filt}
\title{Trim and/or filter sequences in FASTA/FASTQ format}
\usage{
vs_fastx_trim_filt(
  fastx_input,
  reverse = NULL,
  output_format = "fastq",
  fastaout = NULL,
  fastqout = NULL,
  fastaout_rev = NULL,
  fastqout_rev = NULL,
  trunclen = NULL,
  truncqual = 1,
  truncee = NULL,
  truncee_rate = NULL,
  stripright = 0,
  stripleft = 0,
  maxee_rate = 0.01,
  minlen = 0,
  maxlen = NULL,
  maxns = 0,
  minsize = NULL,
  maxsize = NULL,
  minqual = 0,
  relabel = NULL,
  relabel_sha1 = FALSE,
  fasta_width = 0,
  sample = NULL,
  stats = TRUE,
  log_file = NULL,
  threads = 1,
  vsearch_options = NULL,
  tmpdir = NULL
)
}
\arguments{
\item{fastx_input}{(Required). A FASTA/FASTQ file path or FASTA/FASTQ object
containing (forward) reads. See \emph{Details}.}

\item{reverse}{(Optional). A FASTA/FASTQ file path or object containing
reverse reads. If \code{fastx_input} is a \code{"pe_df"} object and
\code{reverse} is not provided, the reverse reads will be extracted from its
\code{"reverse"} attribute.}

\item{output_format}{(Optional). Desired output format of file or tibble:
\code{"fasta"} or \code{"fastq"} (default). If \code{fastx_input} is a FASTA
file path or a FASTA object, \code{output_format} cannot be \code{"fastq"}.}

\item{fastaout}{(Optional). Name of the FASTA output file for the sequences
given in \code{fastx_input}. If \code{NULL} (default), no FASTA sequences are
written to file. See \emph{Details}.}

\item{fastqout}{(Optional). Name of the FASTQ output file for the sequences
given in \code{fastx_input}. If \code{NULL} (default), no FASTQ sequences are
written to file. See \emph{Details}.}

\item{fastaout_rev}{(Optional). Name of the FASTA output file for the reverse
sequences. If \code{NULL} (default), no FASTA sequences are written to file.
See \emph{Details}.}

\item{fastqout_rev}{(Optional). Name of the FASTQ output file for the reverse
sequences. If \code{NULL} (default), no FASTQ sequences are written to file.
See \emph{Details}.}

\item{trunclen}{(Optional). Truncate sequences to the specified length.
Shorter sequences are discarded. If \code{NULL} (default), the trimming is
not applied.}

\item{truncqual}{(Optional). Truncate sequences starting from the first base
with a quality score of the specified value or lower. Defaults to \code{1}.}

\item{truncee}{(Optional). Truncate sequences so that their total expected
error does not exceed the specified value. If \code{NULL} (default), the
trimming is not applied.}

\item{truncee_rate}{(Optional). Truncate sequences so that their average
expected error per base is not higher than the specified value. The
truncation will happen at first occurrence. The average expected error per
base is calculated as the total expected number of errors divided by the
length of the sequence after truncation. If \code{NULL} (default), the
trimming is not applied.}

\item{stripright}{(Optional). Number of bases stripped from the right end of
the reads. Defaults to \code{0}.}

\item{stripleft}{(Optional). Number of bases stripped from the left end of
the reads. Defaults to \code{0}.}

\item{maxee_rate}{(Optional). Threshold for average expected error. Numeric
value ranging from \code{0.0} to \code{1.0}. Defaults to \code{0.01}. See
\emph{Details}.}

\item{minlen}{(Optional). Minimum number of bases a sequence must have to be
retained. Defaults to \code{0}. See \emph{Details}.}

\item{maxlen}{(Optional). Maximum number of bases a sequence can have to be
retained. If \code{NULL} (default), the filter is not applied.}

\item{maxns}{(Optional). Maximum number of N's for a given sequence.
Sequences with more N's than the specified number are discarded. Defaults to
\code{0}.}

\item{minsize}{(Optional). Minimum abundance for a given sequence. Sequences
with lower abundance are discarded. If \code{NULL} (default), the filter is
not applied.}

\item{maxsize}{(Optional). Maximum abundance for a given sequence. Sequences
with higher abundance are discarded. If \code{NULL} (default), the filter is
not applied.}

\item{minqual}{(Optional). Minimum base quality for a read to be retained. A
read is discarded if it contains bases with a quality score below the given
value. Defaults to \code{0}, meaning no reads are discarded.}

\item{relabel}{(Optional). Relabel sequences using the given prefix and a
ticker to construct new headers. Defaults to \code{NULL}.}

\item{relabel_sha1}{(Optional). If \code{TRUE}, relabel sequences using the
SHA1 message digest algorithm. Defaults to \code{FALSE}.}

\item{fasta_width}{(Optional). Number of characters per line in the output
FASTA file. Defaults to \code{0}, which eliminates wrapping.}

\item{sample}{(Optional). Add the given sample identifier string to sequence
headers. For instance, if the given string is "ABC", the text ";sample=ABC"
will be added to the header. If \code{NULL} (default), no identifier is added.}

\item{stats}{(Optional). If \code{TRUE} (default), a tibble with statistics
about the filtering is added as an attribute of the returned tibble. If
\code{FALSE}, no statistics are added.}

\item{log_file}{(Optional). Name of the log file to capture messages from
\code{VSEARCH}. If \code{NULL} (default), no log file is created.}

\item{threads}{(Optional). Number of computational threads to be used by
\code{VSEARCH}. Defaults to \code{1}.}

\item{vsearch_options}{(Optional). Additional arguments to pass to
\code{VSEARCH}. Defaults to \code{NULL}. See \emph{Details}.}

\item{tmpdir}{(Optional). Path to the directory where temporary files should
be written when tables are used as input or output. Defaults to
\code{NULL}, which resolves to the session-specific temporary directory
(\code{tempdir()}).}
}
\value{
A tibble or \code{NULL}.

If output files are specified, the results are written directly to the
specified output files, and no tibble is returned.

If output files (\code{fastaout}/\code{fastqout} and
\code{fastaout_rev}/\code{fastqout_rev}) are \code{NULL}, a tibble containing
the trimmed and/or filtered reads from \code{fastx_input} in the format
specified by \code{output_format} is returned.

If \code{reverse} is provided, a tibble containing the trimmed and/or
filtered reverse sequences is attached to the returned tibble as an
attribute named \code{"reverse"}.

When the reverse reads are present, the returned tibble is assigned the
class \code{"pe_df"}, identifying it as paired-end data.

The \code{"statistics"} attribute of the returned tibble (present when
output files are \code{NULL} and \code{stats = TRUE}) is a tibble with the
following columns:
\itemize{
  \item \code{Kept_Sequences}: Number of retained sequences.
  \item \code{Discarded_Sequences}: Number of discarded sequences.
  \item \code{fastx_source}: Name of the file/object with forward (R1) reads.
  \item \code{reverse_source}: (If \code{reverse} is specified) Name of the
  file/object with reverse (R2) reads.
}
}
\description{
\code{vs_fastx_trim_filt} trims and/or filters FASTA/FASTQ
sequences using \code{VSEARCH}. This function processes both forward and
reverse reads (if provided) and allows for various filtering criteria based
on sequence quality, length, abundance, and more.
}
\details{
Reads from the input files/objects (\code{fastx_input} and \code{reverse})
are trimmed and/or filtered based on the specified criteria using
\code{VSEARCH}.

\code{fastx_input} and \code{reverse} can either be file paths to FASTA/FASTQ
files or FASTA/FASTQ objects. FASTA objects are tibbles that contain the
columns \code{Header} and \code{Sequence}, see
\code{\link[microseq]{readFasta}}. FASTQ objects are tibbles that contain the
columns \code{Header}, \code{Sequence}, and \code{Quality}, see
\code{\link[microseq]{readFastq}}.

If \code{fastx_input} is an object of class \code{"pe_df"}, the reverse reads
are automatically extracted from its \code{"reverse"} attribute unless
explicitly provided via the \code{reverse} argument.

If \code{reverse} is provided, it is processed alongside \code{fastx_input}
using the same trimming/filtering criteria.

Note that if you want to trim/filter the forward and reverse reads
differently, you must pass them separately to this function, get two result
files/objects, and then use \code{\link{fastx_synchronize}} to synchronize
the read pairs again.

If \code{fastaout} and \code{fastaout_rev} or \code{fastqout} and
\code{fastqout_rev} are specified, trimmed and/or filtered sequences are
written to these files in the specified format.

If output files are \code{NULL}, results are returned as a tibble. When
returning a tibble, the reverse sequences (if provided) are attached as an
attribute named \code{"reverse"}.

When reverse reads are returned as an attribute, the primary tibble is also
assigned the S3 class \code{"pe_df"} to indicate that it represents
paired-end data. This class tag can be used by downstream tools to recognize
paired-end tibbles.

Note that certain options are not compatible with both file formats. For
instance, options that trim or filter sequences based on quality scores are
unavailable when the input is of type \code{"fasta"}. Visit the
\code{VSEARCH}
\href{https://github.com/torognes/vsearch?tab=readme-ov-file#getting-help}{documentation}
for more details.

Sequences with an average expected error greater than the specified
\code{maxee_rate} are discarded. For a given sequence, the average expected
error is the sum of error probabilities for all the positions in the sequence,
divided by the length of the sequence.

Any input sequence with fewer bases than the value set in \code{minlen} will
be discarded. By default, \code{minlen} is set to 0, which means that no
sequences are removed. However, using the default value may allow empty
sequences to remain in the results.

\code{vsearch_options} allows users to pass additional command-line arguments
to \code{VSEARCH} that are not directly supported by this function. Refer to
the \code{VSEARCH} manual for more details.
}
\examples{
\dontrun{
# Define arguments: paths to the paired-end example FASTQ files located in
# the "extdata" directory of the Rsearch package
fastx_input <- file.path(file.path(path.package("Rsearch"), "extdata"),
                         "small_R1.fq")
# The reverse reads must come from the R2 file, not the forward (R1) file
reverse <- file.path(file.path(path.package("Rsearch"), "extdata"),
                     "small_R2.fq")
output_format <- "fastq"
maxee_rate <- 0.01
minlen <- 0

# Trim/filter sequences and return a FASTQ tibble
filt_seqs <- vs_fastx_trim_filt(fastx_input = fastx_input,
                                reverse = reverse,
                                output_format = output_format,
                                maxee_rate = maxee_rate,
                                minlen = minlen)

# Extract tibbles: the forward reads are the returned tibble itself, the
# reverse reads are stored in its "reverse" attribute
R1_filt <- filt_seqs
R2_filt <- attr(filt_seqs, "reverse")

# Extract filtering statistics
statistics <- attr(filt_seqs, "statistics")

# Trim/filter sequences and write results to FASTQ files
vs_fastx_trim_filt(fastx_input = fastx_input,
                   reverse = reverse,
                   fastqout = "filt_R1.fq",
                   fastqout_rev = "filt_R2.fq",
                   output_format = output_format,
                   maxee_rate = maxee_rate,
                   minlen = minlen)
}

}
\references{
\url{https://github.com/torognes/vsearch}
}
