Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Maria Karanasou
spark-iforest
Commits
2a35c382
Commit
2a35c382
authored
Apr 03, 2019
by
fzyang
Browse files
add param approxQuantileRelativeError in IForest
parent
645eb888
Changes
3
Hide whitespace changes
Inline
Side-by-side
README.md
View file @
2a35c382
...
...
@@ -40,8 +40,10 @@ The default value will be about log2(numSamples).
-
*contamination:*
The proportion of outliers in the data set, the value should be in (0, 1).
It is only used in the prediction phase to convert anomaly score to predicted labels.
In order to enhance performance, our anomaly score threshold is calculated by approxQuantile.
Note that this is an approximate quantile computation; if you want an exact answer,
you can extract "$anomalyScoreCol" to select your anomalies.
You can set the param approxQuantileRelativeError to a value greater than 0
in order to calculate an approximate quantile threshold of anomaly scores for large datasets.
-
*approxQuantileRelativeError:*
Relative Error for Approximate Quantile Calculation (0 <= value <= 1),
default is 0 for calculating the exact value, which would be expensive for large datasets.
-
*bootstrap:*
If true, individual trees are fit on random subsets of the training data sampled with replacement.
If false, sampling without replacement is performed.
-
*seed:*
The seed used by the random number generator.
...
...
python/pyspark_iforest/ml/iforest.py
View file @
2a35c382
...
...
@@ -219,15 +219,19 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
"If true, the training data sampled with replacement (boolean)"
,
typeConverter
=
TypeConverters
.
toBoolean
)
approxQuantileRelativeError
=
Param
(
Params
.
_dummy
(),
"approxQuantileRelativeError"
,
"Relative Error for anomaly score approximate quantile calculaion (0 <= e <= 1)"
,
typeConverter
=
TypeConverters
.
toFloat
)
@
keyword_only
def
__init__
(
self
,
featuresCol
=
"features"
,
predictionCol
=
"prediction"
,
anomalyScore
=
"anomalyScore"
,
numTrees
=
100
,
maxSamples
=
1.0
,
maxFeatures
=
1.0
,
maxDepth
=
10
,
contamination
=
0.1
,
bootstrap
=
False
):
bootstrap
=
False
,
approxQuantileRelativeError
=
0.
):
super
(
IForest
,
self
).
__init__
()
self
.
_java_obj
=
self
.
_new_java_obj
(
"org.apache.spark.ml.iforest.IForest"
,
self
.
uid
)
self
.
_setDefault
(
numTrees
=
100
,
maxSamples
=
1.0
,
maxFeatures
=
1.0
,
maxDepth
=
10
,
contamination
=
0.1
,
bootstrap
=
False
)
bootstrap
=
False
,
approxQuantileRelativeError
=
0.
)
kwargs
=
self
.
_input_kwargs
self
.
setParams
(
**
kwargs
)
...
...
@@ -238,9 +242,9 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
@
since
(
"2.1.0"
)
def
setParams
(
self
,
featuresCol
=
"features"
,
predictionCol
=
"prediction"
,
anomalyScore
=
"anomalyScore"
,
numTrees
=
100
,
maxSamples
=
1.0
,
maxFeatures
=
1.0
,
maxDepth
=
10
,
contamination
=
0.1
,
bootstrap
=
False
):
bootstrap
=
False
,
approxQuantileRelativeError
=
0.
):
"""
Sets params for
KMeans
.
Sets params for
IForest
.
"""
kwargs
=
self
.
_input_kwargs
return
self
.
_set
(
**
kwargs
)
...
...
@@ -328,3 +332,17 @@ class IForest(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasSeed, JavaMLWr
Gets the value of `bootstrap`
"""
return
self
.
getOrDefault
(
self
.
bootstrap
)
@
since
(
"2.1.0"
)
def
setApproxQuantileRelativeError
(
self
,
value
):
"""
Sets the value of :py:attr:`approxQuantileRelativeError`.
"""
return
self
.
_set
(
approxQuantileRelativeError
=
value
)
@
since
(
"2.1.0"
)
def
getApproxQuantileRelativeError
(
self
):
"""
Gets the value of `approxQuantileRelativeError`
"""
return
self
.
getOrDefault
(
self
.
approxQuantileRelativeError
)
src/main/scala/org/apache/spark/ml/iforest/IForest.scala
View file @
2a35c382
...
...
@@ -87,7 +87,7 @@ class IForestModel (
val
scoreDataset
=
dataset
.
withColumn
(
$
(
anomalyScoreCol
),
scoreUDF
(
col
(
$
(
featuresCol
))))
// get threshold value
val
threshold
=
scoreDataset
.
stat
.
approxQuantile
(
$
(
anomalyScoreCol
),
Array
(
1
-
$
(
contamination
)),
0
)
Array
(
1
-
$
(
contamination
)),
$
(
approxQuantileRelativeError
)
)
// set anomaly instance label 1
val
predictUDF
=
udf
{
(
anomalyScore
:
Double
)
=>
if
(
anomalyScore
>
threshold
(
0
))
1.0
else
0.0
...
...
@@ -328,7 +328,8 @@ class IForest (
maxDepth
->
10
,
contamination
->
0.1
,
bootstrap
->
false
,
seed
->
this
.
getClass
.
getName
.
hashCode
.
toLong
seed
->
this
.
getClass
.
getName
.
hashCode
.
toLong
,
approxQuantileRelativeError
->
0d
)
def
this
()
=
this
(
Identifiable
.
randomUID
(
"IForest"
))
...
...
@@ -367,6 +368,9 @@ class IForest (
/** @group setParam */
def
setAnomalyScoreCol
(
value
:
String
)
:
this.
type
=
set
(
anomalyScoreCol
,
value
)
/** @group setParam */
def
setApproxQuantileRelativeError
(
value
:
Double
)
:
this.
type
=
set
(
approxQuantileRelativeError
,
value
)
override
def
copy
(
extra
:
ParamMap
)
:
IForest
=
defaultCopy
(
extra
)
lazy
val
rng
=
new
Random
(
$
(
seed
))
...
...
@@ -480,7 +484,7 @@ class IForest (
instr
.
logPipelineStage
(
this
)
instr
.
logDataset
(
dataset
)
instr
.
logParams
(
this
,
numTrees
,
maxSamples
,
maxFeatures
,
maxDepth
,
contamination
,
bootstrap
,
seed
,
featuresCol
,
predictionCol
,
labelCol
)
approxQuantileRelativeError
,
bootstrap
,
seed
,
featuresCol
,
predictionCol
,
labelCol
)
// Each iTree of the iForest will be built on parallel and collected in the driver.
// Approximate memory usage for iForest model is calculated, a warning will be raised if iForest is too large.
...
...
@@ -704,9 +708,9 @@ trait IForestParams extends Params {
* The proportion of outliers in the data set (0< contamination < 1).
* It will be used in the prediction. In order to enhance performance,
* Our method to get anomaly score threshold adopts DataFrameStsFunctions.approxQuantile,
* which is designed for performance with some extent accuracy loss.
Note
*
tha
t th
is is an approximate quantiles computation, if you want an exactly
* an
swer, you can extract "
anomaly
S
core
Col" to select your anomalies
.
* which is designed for performance with some extent accuracy loss.
*
Se
t th
e param approxQuantileRelativeError (0 < e < 1) to calculate
* an
approximate quantile threshold of
anomaly
s
core
s for large dataset
.
* @group param
*/
final
val
contamination
:
DoubleParam
=
...
...
@@ -716,6 +720,19 @@ trait IForestParams extends Params {
/** @group getParam */
def
getContamination
:
Double
=
$
(
contamination
)
/**
* Relative Error for Approximate Quantile (0 <= value <= 1), default is 0.
* @group param
*/
final
val
approxQuantileRelativeError
:
Param
[
Double
]
=
new
Param
[
Double
](
parent
=
this
,
name
=
"approxQuantileRelativeError"
,
doc
=
"relative error for approximate quantile"
)
/** @group setParam */
setDefault
(
approxQuantileRelativeError
,
value
=
0d
)
/** @group getParam */
final
def
getApproxQuantileRelativeError
:
Double
=
$
(
approxQuantileRelativeError
)
/**
* If true, individual trees are fit on random subsets of the training data
* sampled with replacement. If false, sampling without replacement is performed.
...
...
@@ -730,7 +747,7 @@ trait IForestParams extends Params {
def
getBootstrap
:
Boolean
=
$
(
bootstrap
)
/**
* The seed used by the rand
a
m number generator.
* The seed used by the rand
o
m number generator.
* @group param
*/
final
val
seed
:
LongParam
=
new
LongParam
(
this
,
"seed"
,
"random seed"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment