% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/build.pubchem.bio.R
\name{build.pubchem.bio}
\alias{build.pubchem.bio}
\title{build.pubchem.bio}
\usage{
build.pubchem.bio(
  pc.directory = NULL,
  use.bio.sources = TRUE,
  bio.sources = c("Metabolomics Workbench", "Human Metabolome Database (HMDB)", "ChEBI",
    "LIPID MAPS", "MassBank of North America (MoNA)"),
  use.pathways = TRUE,
  pathway.sources = NULL,
  use.taxid = TRUE,
  taxonomy.sources = NULL,
  use.parent.cid = TRUE,
  remove.salts = TRUE,
  remove.inorganics = FALSE,
  mw.range = c(50, 2000),
  get.properties = TRUE,
  threads = 8,
  rcdk.desc = c("org.openscience.cdk.qsar.descriptors.molecular.XLogPDescriptor",
    "org.openscience.cdk.qsar.descriptors.molecular.AcidicGroupCountDescriptor",
    "org.openscience.cdk.qsar.descriptors.molecular.BasicGroupCountDescriptor",
    "org.openscience.cdk.qsar.descriptors.molecular.TPSADescriptor"),
  cid.lca.object = NULL,
  cid.sid.object = NULL,
  cid.pwid.object = NULL,
  cid.parent.object = NULL,
  cid.taxid.object = NULL,
  cid.formula.object = NULL,
  cid.smiles.object = NULL,
  cid.inchikey.object = NULL,
  cid.inchi.object = NULL,
  cid.monoisotopic.mass.object = NULL,
  cid.title.object = NULL,
  cid.cas.object = NULL,
  cid.pmid.ct.object = NULL,
  output.directory = NULL
)
}
\arguments{
\item{pc.directory}{directory from which to load pubchem .Rdata files.  alternatively, provide  R data.tables for ALL cid.\emph{property}.object options defined below.}

\item{use.bio.sources}{logical.  If TRUE (default) use the bio.sources vector of sources, incorporating all CIDs from those bio databases.}

\item{bio.sources}{vector of source names from which to extract pubchem CIDs.  all can be found here: \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.  defaults to c("Metabolomics Workbench", "Human Metabolome Database (HMDB)", "ChEBI", "LIPID MAPS",  "MassBank of North America (MoNA)")}

\item{use.pathways}{logical.  should all CIDs from any biological pathway data be incorporated into database?}

\item{pathway.sources}{character. vector of sources to be used when adding metabolites to pubchem bio database. default = NULL, using all pathway sources.}

\item{use.taxid}{logical.  should all CIDs associated with a taxonomic identifier (taxid) be used?}

\item{taxonomy.sources}{character. vector of sources to be used when adding taxonomically related metabolites to database.  Default = NULL, using all sources.}

\item{use.parent.cid}{logical. should CIDs be replaced with parent CIDs?  default = TRUE.}

\item{remove.salts}{logical.  should salts be removed from dataset?  default = TRUE.  salts recognized as '.' in smiles string.  performed after 'use.parent.cid'.}

\item{remove.inorganics}{logical. should inorganic molecules (those with no carbon) be removed? default = FALSE.}

\item{mw.range}{vector. numerical vector of length = 2.  default = c(50, 2000).}

\item{get.properties}{logical. if TRUE, will return the rcdk calculated properties named in rcdk.desc; with the default rcdk.desc these are XLogP, AcidicGroupCount, BasicGroupCount and TPSA.}

\item{threads}{integer. how many threads to use when calculating rcdk properties.  parallel processing via doParallel and foreach packages.}

\item{rcdk.desc}{vector. character vector of valid rcdk descriptors.  default = c("org.openscience.cdk.qsar.descriptors.molecular.XLogPDescriptor", "org.openscience.cdk.qsar.descriptors.molecular.AcidicGroupCountDescriptor", "org.openscience.cdk.qsar.descriptors.molecular.BasicGroupCountDescriptor", "org.openscience.cdk.qsar.descriptors.molecular.TPSADescriptor"). To see descriptor categories: 'dc <- rcdk::get.desc.categories(); dc'.  To see the descriptors within one category: 'dn <- rcdk::get.desc.names(dc[4]); dn'. Note that the four default descriptors are relatively fast to calculate - some descriptors take a very long time to calculate.  You can calculate as many as you wish, but processing time will increase as more descriptors are added.}

\item{cid.lca.object}{R data.table, generally produced by build.cid.lca; preferably, define pc.directory}

\item{cid.sid.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.pwid.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.parent.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.taxid.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.formula.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.smiles.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.inchikey.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.inchi.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.monoisotopic.mass.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.title.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.cas.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{cid.pmid.ct.object}{R data.table, generally produced by get.pubchem.ftp; preferably, define pc.directory}

\item{output.directory}{directory to which the pubchem.bio database is saved.  If NULL, will try to save in pc.directory (if provided), else not saved.}
}
\value{
a data frame containing pubchem CID, title, formula, monoisotopic molecular weight, inchikey, smiles, cas, optionally rcdk properties
}
\description{
utilizes downloaded and properly formatted local pubchem data created by 'get.pubchem.ftp' function
}
\details{
utilizes downloaded and properly formatted local pubchem data created by 'get.pubchem.ftp' function
}
\examples{
data('cid.sid', package = "pubchem.bio")
data('cid.pwid', package = "pubchem.bio")
data('cid.parent', package = "pubchem.bio")
data('cid.taxid', package = "pubchem.bio")
data('cid.formula', package = "pubchem.bio")
data('cid.smiles', package = "pubchem.bio")
data('cid.inchikey', package = "pubchem.bio")
data('cid.inchi', package = "pubchem.bio")
data('cid.monoisotopic.mass', package = "pubchem.bio")
data('cid.title', package = "pubchem.bio")
data('cid.cas', package = "pubchem.bio")
data('cid.pmid.ct', package = "pubchem.bio")
data('cid.lca', package = "pubchem.bio")
pc.bio.out <- build.pubchem.bio(use.pathways = FALSE, use.parent.cid = FALSE,
get.properties = FALSE, threads = 1,
cid.sid.object = cid.sid, cid.pwid.object = cid.pwid,
cid.parent.object = cid.parent, cid.taxid.object = cid.taxid,
cid.formula.object = cid.formula, cid.smiles.object = cid.smiles,
cid.inchikey.object = cid.inchikey, cid.inchi.object = cid.inchi,
cid.monoisotopic.mass.object = cid.monoisotopic.mass,
cid.title.object = cid.title, cid.cas.object = cid.cas,
cid.pmid.ct.object = cid.pmid.ct, cid.lca.object = cid.lca)
head(pc.bio.out)
}
\author{
Corey Broeckling
}
