Commit 2a35c382 authored by fzyang's avatar fzyang
Browse files

add param approxQuantileRelativeError in IForest

parent 645eb888
......@@ -40,8 +40,10 @@ The default value will be about log2(numSamples).
- *contamination:* The proportion of outliers in the data set, the value should be in (0, 1).
It is only used in the prediction phase to convert anomaly score to predicted labels.
In order to enhance performance, our method to get the anomaly score threshold is calculated by approxQuantile.
Note that this is an approximate quantile computation; if you want an exact answer,
you can extract "$anomalyScoreCol" to select your anomalies.
You can set the param approxQuantileRelativeError to a value greater than 0
in order to calculate an approximate quantile threshold of anomaly scores for large datasets.
- *approxQuantileRelativeError:* Relative Error for Approximate Quantile Calculation (0 <= value <= 1),
default is 0 for calculating the exact value, which would be expensive for large datasets.
- *bootstrap:* If true, individual trees are fit on random subsets of the training data sampled with replacement.
If false, sampling without replacement is performed.
- *seed:* The seed used by the random number generator.
......
......@@ -219,15 +219,19 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
"If true, the training data sampled with replacement (boolean)",
typeConverter=TypeConverters.toBoolean)
# Param descriptor for the relative error passed to the approximate quantile
# computation of anomaly scores; values are converted with TypeConverters.toFloat.
approxQuantileRelativeError = Param(
    Params._dummy(),
    "approxQuantileRelativeError",
    "Relative Error for anomaly score approximate quantile calculaion (0 <= e <= 1)",
    typeConverter=TypeConverters.toFloat)
@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", anomalyScore="anomalyScore",
numTrees=100, maxSamples=1.0, maxFeatures=1.0, maxDepth=10, contamination=0.1,
bootstrap=False):
bootstrap=False, approxQuantileRelativeError=0.):
super(IForest, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.iforest.IForest", self.uid)
self._setDefault(numTrees=100, maxSamples=1.0, maxFeatures=1.0, maxDepth=10, contamination=0.1,
bootstrap=False)
bootstrap=False, approxQuantileRelativeError=0.)
kwargs = self._input_kwargs
self.setParams(**kwargs)
......@@ -238,9 +242,9 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
@since("2.1.0")
def setParams(self, featuresCol="features", predictionCol="prediction", anomalyScore="anomalyScore",
numTrees=100, maxSamples=1.0, maxFeatures=1.0, maxDepth=10, contamination=0.1,
bootstrap=False):
bootstrap=False, approxQuantileRelativeError=0.):
"""
Sets params for KMeans.
Sets params for IForest.
"""
kwargs = self._input_kwargs
return self._set(**kwargs)
......@@ -328,3 +332,17 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
Gets the value of `bootstrap`
"""
return self.getOrDefault(self.bootstrap)
@since("2.1.0")
def setApproxQuantileRelativeError(self, value):
    """
    Stores `value` under :py:attr:`approxQuantileRelativeError` and returns
    the result of `self._set`.
    """
    updates = {"approxQuantileRelativeError": value}
    return self._set(**updates)
@since("2.1.0")
def getApproxQuantileRelativeError(self):
    """
    Returns the value of `approxQuantileRelativeError` as resolved by
    `getOrDefault`.
    """
    param = self.approxQuantileRelativeError
    return self.getOrDefault(param)
......@@ -87,7 +87,7 @@ class IForestModel (
val scoreDataset = dataset.withColumn($(anomalyScoreCol), scoreUDF(col($(featuresCol))))
// get threshold value
val threshold = scoreDataset.stat.approxQuantile($(anomalyScoreCol),
Array(1 - $(contamination)), 0)
Array(1 - $(contamination)), $(approxQuantileRelativeError))
// set anomaly instance label 1
val predictUDF = udf { (anomalyScore: Double) =>
if (anomalyScore > threshold(0)) 1.0 else 0.0
......@@ -328,7 +328,8 @@ class IForest (
maxDepth -> 10,
contamination -> 0.1,
bootstrap -> false,
seed -> this.getClass.getName.hashCode.toLong
seed -> this.getClass.getName.hashCode.toLong,
approxQuantileRelativeError -> 0d
)
def this() = this(Identifiable.randomUID("IForest"))
......@@ -367,6 +368,9 @@ class IForest (
/**
 * Sets the `anomalyScoreCol` param to the given column name.
 *
 * @group setParam
 */
def setAnomalyScoreCol(value: String): this.type = {
  set(anomalyScoreCol, value)
}
/**
 * Sets the `approxQuantileRelativeError` param, the relative error handed to
 * the approximate quantile computation of the anomaly score threshold.
 *
 * @group setParam
 */
def setApproxQuantileRelativeError(value: Double): this.type = {
  set(approxQuantileRelativeError, value)
}
/** Creates a copy of this estimator via `defaultCopy`, applying the `extra` params. */
override def copy(extra: ParamMap): IForest = {
  defaultCopy[IForest](extra)
}
// Random generator seeded from the `seed` param. Being a lazy val, it is
// constructed on first access, using the seed value in effect at that moment.
lazy val rng = new Random(getOrDefault(seed))
......@@ -480,7 +484,7 @@ class IForest (
instr.logPipelineStage(this)
instr.logDataset(dataset)
instr.logParams(this, numTrees, maxSamples, maxFeatures, maxDepth, contamination,
bootstrap, seed, featuresCol, predictionCol, labelCol)
approxQuantileRelativeError, bootstrap, seed, featuresCol, predictionCol, labelCol)
// Each iTree of the iForest will be built on parallel and collected in the driver.
// Approximate memory usage for iForest model is calculated, a warning will be raised if iForest is too large.
......@@ -704,9 +708,9 @@ trait IForestParams extends Params {
* The proportion of outliers in the data set (0< contamination < 1).
* It will be used in the prediction. In order to enhance performance,
* Our method to get anomaly score threshold adopts DataFrameStatFunctions.approxQuantile,
* which is designed for performance with some extent accuracy loss. Note
* that this is an approximate quantiles computation, if you want an exactly
* answer, you can extract "anomalyScoreCol" to select your anomalies.
* which is designed for performance with some extent accuracy loss.
* Set the param approxQuantileRelativeError (0 < e < 1) to calculate
* an approximate quantile threshold of anomaly scores for large datasets.
* @group param
*/
final val contamination: DoubleParam =
......@@ -716,6 +720,19 @@ trait IForestParams extends Params {
/**
 * Returns the current value of the `contamination` param.
 *
 * @group getParam
 */
def getContamination: Double = getOrDefault(contamination)
/**
 * Relative error for the approximate quantile computation of the anomaly
 * score threshold (0 <= value <= 1). The default is 0, which requests the
 * exact quantile.
 *
 * Declared as a [[DoubleParam]] (like `contamination`) rather than a generic
 * `Param[Double]`, so the value uses the Double-specific param encoding and
 * is validated against the documented range when it is set.
 *
 * @group param
 */
final val approxQuantileRelativeError: DoubleParam =
  new DoubleParam(this, "approxQuantileRelativeError",
    "relative error for approximate quantile",
    // Reject values outside [0, 1]; NaN fails both comparisons and is rejected too.
    (value: Double) => value >= 0.0 && value <= 1.0)

// Default of 0 keeps the exact-quantile behaviour unless the caller opts in.
setDefault(approxQuantileRelativeError, value = 0d)

/**
 * Returns the current value of the `approxQuantileRelativeError` param.
 *
 * @group getParam
 */
final def getApproxQuantileRelativeError: Double = $(approxQuantileRelativeError)
/**
* If true, individual trees are fit on random subsets of the training data
* sampled with replacement. If false, sampling without replacement is performed.
......@@ -730,7 +747,7 @@ trait IForestParams extends Params {
/** Returns the current value of the `bootstrap` param. */
def getBootstrap: Boolean = getOrDefault(bootstrap)
/**
* The seed used by the randam number generator.
* The seed used by the random number generator.
* @group param
*/
final val seed: LongParam = new LongParam(this, "seed", "random seed")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment