% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stat-dens2d-filter.r
\name{stat_dens2d_filter}
\alias{stat_dens2d_filter}
\alias{stat_dens2d_filter_g}
\title{Filter observations by local 2D density}
\usage{
stat_dens2d_filter(
  mapping = NULL,
  data = NULL,
  geom = "point",
  position = "identity",
  ...,
  keep.fraction = 0.1,
  keep.number = Inf,
  keep.sparse = TRUE,
  keep.these = FALSE,
  exclude.these = FALSE,
  these.target = "label",
  pool.along = c("xy", "x", "y", "none"),
  xintercept = 0,
  yintercept = 0,
  invert.selection = FALSE,
  na.rm = TRUE,
  show.legend = FALSE,
  inherit.aes = TRUE,
  h = NULL,
  n = NULL,
  return.density = FALSE
)

stat_dens2d_filter_g(
  mapping = NULL,
  data = NULL,
  geom = "point",
  position = "identity",
  ...,
  keep.fraction = 0.1,
  keep.number = Inf,
  keep.sparse = TRUE,
  keep.these = FALSE,
  exclude.these = FALSE,
  these.target = "label",
  pool.along = c("xy", "x", "y", "none"),
  xintercept = 0,
  yintercept = 0,
  invert.selection = FALSE,
  na.rm = TRUE,
  show.legend = FALSE,
  inherit.aes = TRUE,
  h = NULL,
  n = NULL,
  return.density = FALSE
)
}
\arguments{
\item{mapping}{The aesthetic mapping, usually constructed with
\code{\link[ggplot2]{aes}} or \code{\link[ggplot2]{aes_}}. Only needs
to be set at the layer level if you are overriding the plot defaults.}

\item{data}{A layer specific dataset - only needed if you want to override
the plot defaults.}

\item{geom}{The geometric object to use to display the data.}

\item{position}{The position adjustment to use for overlapping points on this
layer}

\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. This
can include aesthetics whose values you want to set, not map. See
\code{\link[ggplot2]{layer}} for more details.}

\item{keep.fraction}{numeric [0..1]. The fraction of the observations (or
rows) in \code{data} to be retained.}

\item{keep.number}{integer Set the maximum number of observations to retain,
effective only if obeying \code{keep.fraction} would result in a larger
number.}

\item{keep.sparse}{logical If \code{TRUE}, the default, observations from the
more sparse regions are retained, if \code{FALSE} those from the densest
regions.}

\item{keep.these, exclude.these}{character vector, integer vector, logical
vector or function that takes one or more variables in data selected by
\code{these.target}. Negative integers behave as in R's extraction methods.
The rows from \code{data} indicated by \code{keep.these} and
\code{exclude.these} are kept or excluded irrespective of the local
density.}

\item{these.target}{character, numeric or logical selecting one or more
column(s) of \code{data}. If \code{TRUE} the whole \code{data} object is
passed.}

\item{pool.along}{character, one of \code{"none"}, \code{"x"}, \code{"y"}, or
\code{"xy"} indicating if selection should be done pooling the observations
along the \emph{x} axis, the \emph{y} axis, both axes or none, based on
quadrants given by \code{xintercept} and \code{yintercept}.}

\item{xintercept, yintercept}{numeric The center point of the quadrants.}

\item{invert.selection}{logical If \code{TRUE}, the complement of the
selected rows is returned.}

\item{na.rm}{a logical value indicating whether NA values should be stripped
before the computation proceeds.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{FALSE}, the default, never includes, \code{NA} includes if any
aesthetics are mapped, and \code{TRUE} always includes.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather
than combining with them. This is most useful for helper functions that
define both data and aesthetics and shouldn't inherit behaviour from the
default plot specification, e.g. \code{\link[ggplot2]{borders}}.}

\item{h}{vector of bandwidths for x and y directions. Defaults to normal
reference bandwidth (see \code{\link[MASS]{bandwidth.nrd}}). A scalar value
will be taken to apply to both directions.}

\item{n}{Number of grid points in each direction. Can be scalar or a
length-2 integer vector}

\item{return.density}{logical vector of length 1. If \code{TRUE} add columns
\code{"density"} and \code{"keep.obs"} to the returned data frame.}
}
\value{
A plot layer instance, using as output \code{data} a subset of the
  rows in input \code{data} retained based on a 2D-density-based filtering
  criterion.
}
\description{
\code{stat_dens2d_filter} filters out or filters in observations in
  regions of a plot panel with high density of observations, based on the
  values mapped to both \code{x} and \code{y} aesthetics.
  \code{stat_dens2d_filter_g} does the filtering by group instead of by
  panel. This second stat is useful for highlighting observations, while the
  first one tends to be most useful when the aim is to prevent clashes among
  text labels. If there is no mapping to \code{label} in \code{data}, the
  mapping is silently set to \code{rownames(data)}.
}
\details{
The local density of observations in 2D (\emph{x} and \emph{y}) is
  computed with function \code{\link[MASS]{kde2d}} and used to select
  observations, passing to the geom a subset of the rows in its \code{data}
  input. The default is to select observations in sparse regions of the plot,
  but the selection can be inverted so that only observations in the densest
  regions are returned. Specific observations can be protected from being
  deselected and "kept" by passing a suitable argument to \code{keep.these}.
  Logical and integer vectors work as indexes to rows in \code{data}, while
  the values in a character vector are compared to the character values mapped
  to the \code{label} aesthetic. A function passed as argument to
  \code{keep.these} will
  receive as argument the values in the variable mapped to \code{label} and
  should return a character, logical or numeric vector as described above. If
  no variable has been mapped to \code{label}, row names are used in its
  place.

  How many rows are retained in addition to those in \code{keep.these} is
  controlled with arguments passed to \code{keep.number} and
  \code{keep.fraction}. \code{keep.number} sets the maximum number of
  observations selected; whenever \code{keep.fraction} results in fewer
  observations selected, \code{keep.fraction} is obeyed.

  Computation of density and of the default bandwidth requires at least
  two observations with different values. If data do not fulfill this
  condition, they are kept only if \code{keep.fraction = 1}. This is correct
  behavior for a single observation, but can be surprising in the case of
  multiple observations.

  Parameters \code{keep.these} and \code{exclude.these} make it possible to
  force inclusion or exclusion of observations after the density is computed.
  In case of conflict, \code{exclude.these} overrides \code{keep.these}.
}
\note{
Which points are kept and which not depends on how dense a grid is used
  and how flexible the density surface estimate is. This depends on the
  values passed as arguments to parameters \code{n} and \code{h}.
  It is also important to be aware that both
  \code{geom_text()} and \code{geom_text_repel()} can avoid overplotting by
  discarding labels at the plot rendering stage, i.e., what is plotted may
  differ from what is returned by this statistic.
}
\examples{

# Build one random lower-case string made of len letters, sampled with
# replacement from the built-in letters vector.
random_string <- function(len = 6) {
  chars <- sample(letters, size = len, replace = TRUE)
  paste(chars, collapse = "")
}

# Simulate 100 observations with normally distributed x and y, split into two
# groups of 50, each tagged with a random label. The fixed seed keeps the
# example data reproducible.
set.seed(1001)
d <- tibble::tibble(
  x = rnorm(100),
  y = rnorm(100),
  group = rep(c("A", "B"), each = 50),
  lab = replicate(100, random_string())
)

# base scatter plot of all observations, reused by the examples that follow
p <- ggplot(data = d, aes(x, y)) +
  geom_point()

# filter (and here highlight) 1/10 observations in sparsest regions
p + stat_dens2d_filter(colour = "red")

# filter observations not in the sparsest regions
p + stat_dens2d_filter(colour = "blue", invert.selection = TRUE)

# filter observations in dense regions of the plot
p + stat_dens2d_filter(colour = "blue", keep.sparse = FALSE)

# filter 1/2 the observations
p + stat_dens2d_filter(colour = "red", keep.fraction = 0.5)

# filter 1/2 the observations but cap their number to maximum 12 observations
p + stat_dens2d_filter(colour = "red", keep.fraction = 0.5, keep.number = 12)

# base scatter plot with colour mapped to group, reused by the examples below
p_grp <- ggplot(data = d, aes(x, y, colour = group)) +
  geom_point()

# density filtering done jointly across groups
p_grp + stat_dens2d_filter(shape = 1, size = 3, keep.fraction = 1/4)

# density filtering done independently for each group
p_grp + stat_dens2d_filter_g(shape = 1, size = 3, keep.fraction = 1/4)

# density filtering done jointly across groups by overriding grouping
p_grp +
  stat_dens2d_filter_g(colour = "black", shape = 1, size = 3,
                       keep.fraction = 1/4)

# label observations in the sparsest regions
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
  geom_point() +
  stat_dens2d_filter(geom = "text")

# in addition, always keep observations whose label starts with the letter u,
# irrespective of the local density
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
  geom_point() +
  stat_dens2d_filter(geom = "text",
                     keep.these = function(x) {grepl("^u", x)})

# always exclude observations whose label starts with the letter u,
# irrespective of the local density
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
  geom_point() +
  stat_dens2d_filter(geom = "text",
                     exclude.these = function(x) {grepl("^u", x)})

# always keep the first 30 rows of data, selected with an integer vector
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
  geom_point() +
  stat_dens2d_filter(geom = "text",
                     keep.these = 1:30)

# looking under the hood with gginnards::geom_debug_group()
# gginnards is optional: these examples run only when the package is installed
gginnards.installed <- requireNamespace("gginnards", quietly = TRUE)
if (gginnards.installed) {
  library(gginnards)

  # inspect what the statistic passes to the geom, using the debug geom
  # provided by gginnards instead of a drawing geom
  ggplot(data = d, aes(x, y, label = lab, colour = group)) +
    stat_dens2d_filter(geom = "debug_group")
}
# same inspection with return.density = TRUE, which adds the columns density
# and keep.obs to the returned data
if (gginnards.installed) {
  ggplot(data = d, aes(x, y, label = lab, colour = group)) +
    geom_point() +
    stat_dens2d_filter(geom = "debug_group", return.density = TRUE)
}

}
\seealso{
\code{\link{stat_dens2d_labels}} and \code{\link[MASS]{kde2d}} used
  internally. Parameters \code{n}, \code{h} in these statistics correspond to
  the parameters with the same name in this imported function. Limits are set
  to the limits of the plot scales.

Other statistics returning a subset of data: 
\code{\link{stat_dens1d_filter}()},
\code{\link{stat_dens1d_labels}()},
\code{\link{stat_dens2d_labels}()}
}
\concept{statistics returning a subset of data}
