% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vs_fastx_uniques.R
\name{vs_fastx_uniques}
\alias{vs_fastx_uniques}
\alias{vs_fastq_uniques}
\alias{vs_fasta_uniques}
\alias{vs_fastx_dereplicate}
\alias{vs_fastq_dereplicate}
\alias{vs_fasta_dereplicate}
\alias{fastx_uniques}
\alias{dereplicate}
\title{Dereplicate sequences}
\usage{
vs_fastx_uniques(
  fastx_input,
  output_format = "fasta",
  fastx_output = NULL,
  minuniquesize = 1,
  strand = "plus",
  sizein = TRUE,
  sizeout = TRUE,
  relabel = NULL,
  relabel_sha1 = FALSE,
  fastq_qout_max = FALSE,
  fasta_width = 0,
  sample = NULL,
  vsearch_options = NULL,
  tmpdir = NULL
)
}
\arguments{
\item{fastx_input}{(Required). A FASTA/FASTQ file path or FASTA/FASTQ object.
See \emph{Details}.}

\item{output_format}{(Optional). Desired output format of file or tibble:
\code{"fasta"} (default) or \code{"fastq"}. If \code{fastx_input} is a FASTA
file path or a FASTA object, \code{output_format} cannot be \code{"fastq"}.}

\item{fastx_output}{(Optional). Name of the output file for dereplicated
reads from \code{fastx_input}. File can be in either FASTA or FASTQ format,
depending on \code{output_format}. If \code{NULL} (default), no sequences are
written to file. See \emph{Details}.}

\item{minuniquesize}{(Optional). Minimum abundance value post-dereplication
for a sequence not to be discarded. Defaults to \code{1}.}

\item{strand}{(Optional). Specifies which strand to consider when comparing
sequences. Can be either \code{"plus"} (default) or \code{"both"}.}

\item{sizein}{(Optional). If \code{TRUE} (default), abundance annotations
present in sequence headers are taken into account.}

\item{sizeout}{(Optional). If \code{TRUE} (default), abundance annotations
are added to FASTA headers.}

\item{relabel}{(Optional). Relabel sequences using the given prefix and a
ticker to construct new headers. Defaults to \code{NULL}.}

\item{relabel_sha1}{(Optional). If \code{TRUE}, relabel sequences using the
SHA1 message digest algorithm. Defaults to \code{FALSE}.}

\item{fastq_qout_max}{(Optional). If \code{TRUE}, the quality score will be
the highest (best) quality score observed in each position. Defaults to
\code{FALSE}.}

\item{fasta_width}{(Optional). Number of characters per line in the output
FASTA file. Defaults to \code{0}, which eliminates wrapping.}

\item{sample}{(Optional). Add the given sample identifier string to sequence
headers. For instance, if the given string is "ABC", the text ";sample=ABC"
will be added to the header. If \code{NULL} (default), no identifier is added.}

\item{vsearch_options}{(Optional). A character string of additional arguments
to pass to \code{VSEARCH}. Defaults to \code{NULL}. See \emph{Details}.}

\item{tmpdir}{(Optional). Path to the directory where temporary files should
be written when tables are used as input or output. Defaults to
\code{NULL}, which resolves to the session-specific temporary directory
(\code{tempdir()}).}
}
\value{
A tibble or \code{NULL}.

If \code{fastx_output} is specified, the dereplicated sequences are written
to the specified output file, and no tibble is returned.

If \code{fastx_output} is \code{NULL}, a tibble containing the dereplicated
reads in the format specified by \code{output_format} is returned.
}
\description{
\code{vs_fastx_uniques} performs dereplication of sequences in a
FASTA/FASTQ file or object by merging identical sequences using
\code{VSEARCH}.
}
\details{
Sequences in the input file/object (\code{fastx_input}) are dereplicated by
merging identical sequences. Identical sequences are defined as sequences
with the same length and the same string of nucleotides (case insensitive, T
and U are considered the same).

\code{fastx_input} can either be a FASTA/FASTQ file or a FASTA/FASTQ object.
FASTA objects are tibbles that contain the columns \code{Header} and
\code{Sequence}, see \code{\link[microseq]{readFasta}}. FASTQ objects are
tibbles that contain the columns \code{Header}, \code{Sequence}, and
\code{Quality}, see \code{\link[microseq]{readFastq}}.

By default, the quality scores in FASTQ output files will correspond to the
average error probability of the nucleotides in each position.
If \code{fastq_qout_max = TRUE}, the quality score will be the highest (best)
quality score observed in each position.

If \code{fastx_output} is specified, the dereplicated sequences are output to
this file in the format given by \code{output_format}.
If \code{fastx_output} is \code{NULL}, the dereplicated sequences are
returned as a FASTA or FASTQ object, depending on \code{output_format}.

\code{vsearch_options} allows users to pass additional command-line arguments
to \code{VSEARCH} that are not directly supported by this function. Refer to
the \code{VSEARCH} manual for more details.
}
\examples{
\dontrun{
# Define arguments
fastx_input <- file.path(file.path(path.package("Rsearch"), "extdata"),
                         "small_R1.fq")
fastx_output <- NULL
output_format <- "fastq"

# Dereplicate sequences and return a FASTQ tibble
derep_R1 <- vs_fastx_uniques(fastx_input = fastx_input,
                             fastx_output = fastx_output,
                             output_format = output_format)

# Dereplicate sequences and write dereplicated sequences to a file
vs_fastx_uniques(fastx_input = fastx_input,
                 fastx_output = "dereplicated_sequences.fq",
                 output_format = output_format)
}

}
\references{
\url{https://github.com/torognes/vsearch}
}
