Summary: This tutorial demonstrates DBSCAN with FSharp.Stats and how to visualize the results with Plotly.NET.
In the previous article of this series hierarchical clustering using FSharp.Stats was introduced.
Clustering methods can be used to group elements of a huge data set based on their similarity. Elements sharing similar properties cluster together and can be reported as a coherent group. Density-Based Spatial Clustering of Applications with Noise (DBSCAN) was developed to identify clusters with similar density and allows the exclusion of noise points.
For demonstration of DBSCAN, the classic iris data set is used, which consists of 150 records, each of which contains four measurements and a species identifier. In this tutorial we are going to perform DBSCAN on two- and three-dimensional data.
#r "nuget: Deedle.Interactive, 3.0.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharp.Data, 4.2.7"
Loading extensions from `C:\Users\schne\.nuget\packages\deedle.interactive\3.0.0\interactive-extensions\dotnet\Deedle.Interactive.dll`
Loading extensions from `C:\Users\schne\.nuget\packages\plotly.net.interactive\4.0.0\interactive-extensions\dotnet\Plotly.NET.Interactive.dll`
open FSharp.Data
open FSharp.Stats
open Deedle
// Download the iris data set as raw CSV text (FSharp.Data) and parse that
// text into a data frame (Deedle).
let rawData =
    let irisCsvUrl = @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/iris.csv"
    Http.RequestString irisCsvUrl

// Parsed frame; the trailing bare `df` expression shows it as cell output.
let df = Frame.ReadCsvString(rawData)
df
sepal_length | sepal_width | petal_length | petal_width | species | (Decimal) | (Decimal) | (Decimal) | (Decimal) | (string) |
---|---|---|---|---|---|---|
0 | -> | 5.5 | 2.4 | 3.8 | 1.1 | versicolor |
1 | -> | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
2 | -> | 7.6 | 3 | 6.6 | 2.1 | virginica |
3 | -> | 5.6 | 2.8 | 4.9 | 2 | virginica |
4 | -> | 6.1 | 3 | 4.9 | 1.8 | virginica |
: | ... | ... | ... | ... | ... | |
145 | -> | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
146 | -> | 5.7 | 2.6 | 3.5 | 1 | versicolor |
147 | -> | 5.9 | 3 | 5.1 | 1.8 | virginica |
148 | -> | 6.8 | 3.2 | 5.9 | 2.3 | virginica |
149 | -> | 5 | 3.6 | 1.4 | 0.2 | setosa |
150 rows x 5 columns
0 missing values
Let's take a first look at the data with 2D and 3D scatter plots using Plotly.NET. Each of the 150 records consists of four measurements and a species identifier. Since each species identifier occurs several times (virginica, versicolor, and setosa), we create unique labels by appending the row index to the species identifier.
open Plotly.NET
open FSharp.Stats.ML.Unsupervised
// Column names used for the two-dimensional analysis (x axis, y axis).
let header2D =
    [ "petal_length"
      "petal_width" ]

// Column names used for the three-dimensional analysis (x, y and z axis).
let header3D =
    [ "sepal_length"
      "petal_length"
      "petal_width" ]
// Keep only the petal length and petal width columns (header2D) and turn
// the resulting frame into a jagged array, one inner array per row.
let data2D =
    df
    |> Frame.sliceCols header2D
    |> Frame.toJaggedArray
// Keep only the sepal length, petal length and petal width columns
// (header3D) and turn the resulting frame into a jagged array, one inner
// array per row.
let data3D =
    df
    |> Frame.sliceCols header3D
    |> Frame.toJaggedArray
// One label per record: the species name followed by "_" and the position
// of the value within the "species" column, which makes every label unique.
let labels =
    df
    |> Frame.getCol "species"
    |> Series.values
    |> Seq.mapi (fun position species -> sprintf "%s_%i" species position)
// Scatter plot of the unclustered 2D data; every marker carries its label
// from `labels`, and the axes are titled with the header2D column names.
let rawChart2D =
    // Each row of data2D becomes one (x, y) point.
    let points = [| for row in data2D -> (row.[0], row.[1]) |]
    let scatter =
        Chart.Scatter(points, mode = StyleParam.Mode.Markers, MultiText = labels)
    scatter
    |> Chart.withXAxisStyle header2D.[0]
    |> Chart.withYAxisStyle header2D.[1]
    |> Chart.withTitle "rawChart2D"
// 3D scatter plot of the unclustered 3D data; every marker carries its label
// from `labels`, and the axes are titled with the header3D column names.
let rawChart3D =
    // Each row of data3D becomes one (x, y, z) point.
    let points = [| for row in data3D -> (row.[0], row.[1], row.[2]) |]
    let scatter =
        Chart.Scatter3D(points, mode = StyleParam.Mode.Markers, MultiText = labels)
    scatter
    |> Chart.withXAxisStyle header3D.[0]
    |> Chart.withYAxisStyle header3D.[1]
    |> Chart.withZAxisStyle header3D.[2]
    |> Chart.withTitle "rawChart3D"
// Bare expressions naming the two raw charts; presumably each one ended a
// notebook cell so the chart was rendered as that cell's output (confirm).
rawChart2D
rawChart3D
The function that performs DBSCAN can be found at `FSharp.Stats.ML.Unsupervised.DbScan.compute`. It requires four input parameters:

1. a distance measure from `FSharp.Stats.ML.DistanceMetrics` (`seq<'T> -> seq<'T> -> float`)
2. the minimum number of points required to form a cluster, minPts (`int`)
3. the neighborhood radius, eps (`float`)
4. the data to cluster (`seq<#seq<'T>>`)

The clustering result consists of a sequence of noise point coordinates and a sequence of clusters containing all related point coordinates.
open FSharp.Stats.ML
open FSharp.Stats.ML.Unsupervised
// DBSCAN settings used by the clustering runs below.
// minPts is the point-count argument shared by the 2D and the 3D run.
let minPts = 20

// eps value for the 2D run (petal length / petal width).
let eps2D = 0.5

// eps value for the 3D run (sepal length / petal length / petal width).
let eps3D = 0.7
// Cluster the 2D data with the Euclidean distance, minPts and eps2D.
let result2D =
    data2D
    |> DbScan.compute DistanceMetrics.Array.euclidean minPts eps2D

// Textual representation of the result (cluster list and noise points).
result2D.ToString()
{ Clusterlist = seq [seq [[|1.5; 0.1|]; [|1.4; 0.2|]; [|1.7; 0.4|]; [|1.5; 0.2|]; ...]; seq [[|4.9; 2.0|]; [|4.9; 1.8|]; [|4.8; 1.8|]; [|5.0; 2.0|]; ...]] Noisepoints = seq [[|6.6; 2.1|]; [|3.0; 1.1|]; [|6.7; 2.0|]; [|6.4; 2.0|]; ...] }
// Cluster the 3D data with the Euclidean distance, minPts and eps3D.
let result3D =
    data3D
    |> DbScan.compute DistanceMetrics.Array.euclidean minPts eps3D

// Textual representation of the result (cluster list and noise points).
result3D.ToString()
{ Clusterlist = seq [seq [[|5.5; 3.8; 1.1|]; [|5.6; 4.1; 1.3|]; [|5.6; 3.9; 1.1|]; [|5.6; 3.6; 1.3|]; ...]; seq [[|4.9; 1.5; 0.1|]; [|4.9; 1.4; 0.2|]; [|5.4; 1.7; 0.4|]; [|5.4; 1.5; 0.2|]; ...]] Noisepoints = seq [[|7.6; 6.6; 2.1|]; [|7.2; 6.1; 2.5|]; [|7.7; 6.1; 2.3|]; [|5.1; 3.0; 1.1|]; ...] }
// One point trace per 2D cluster, named "Cluster <index>", merged into a
// single chart.
let chartCluster2D =
    result2D.Clusterlist
    |> Seq.mapi (fun clusterIndex members ->
        let traceName = sprintf "Cluster %i" clusterIndex
        members
        |> Seq.map (fun pt -> pt.[0], pt.[1])
        // Duplicate coordinates are dropped: the plot looks the same, but
        // fewer markers have to be drawn.
        |> Seq.distinct
        |> Chart.Point
        |> Chart.withTraceInfo traceName)
    |> Chart.combine
// Point trace named "Noise" holding all 2D points that DBSCAN reported as
// noise.
let chartNoise2D =
    let noiseCoordinates =
        result2D.Noisepoints
        |> Seq.map (fun pt -> pt.[0], pt.[1])
        // Duplicate coordinates are dropped: the plot looks the same, but
        // fewer markers have to be drawn.
        |> Seq.distinct
    noiseCoordinates
    |> Chart.Point
    |> Chart.withTraceInfo "Noise"
// Title for the 2D result chart: the settings used (eps2D, minPts), the
// total number of points, the number of clusters and the number of noise
// points. Counts are taken before de-duplication, i.e. from the raw result.
let chartTitle2D =
    let noisePts = Seq.length result2D.Noisepoints
    let clusters = Seq.length result2D.Clusterlist
    let clusteredPts = result2D.Clusterlist |> Seq.sumBy Seq.length
    sprintf
        "eps: %.1f minPts: %i pts: %i cluster: %i noisePts: %i"
        eps2D
        minPts
        (noisePts + clusteredPts)
        clusters
        noisePts
// Final 2D figure: noise trace plus cluster traces, titled with the summary
// string and with axes named after the header2D columns.
Chart.combine [ chartNoise2D; chartCluster2D ]
|> Chart.withTitle chartTitle2D
|> Chart.withXAxisStyle header2D.[0]
|> Chart.withYAxisStyle header2D.[1]
// One 3D marker trace per cluster, named "Cluster_<index>", merged into a
// single chart.
let chartCluster3D =
    result3D.Clusterlist
    |> Seq.mapi (fun clusterIndex members ->
        let traceName = sprintf "Cluster_%i" clusterIndex
        let coordinates =
            members
            |> Seq.map (fun pt -> pt.[0], pt.[1], pt.[2])
            // Duplicate coordinates are dropped: the plot looks the same,
            // but fewer markers have to be drawn.
            |> Seq.distinct
        Chart.Scatter3D(coordinates, StyleParam.Mode.Markers)
        |> Chart.withTraceInfo traceName)
    |> Chart.combine
// 3D marker trace named "Noise" holding all 3D points that DBSCAN reported
// as noise.
let chartNoise3D =
    let noiseCoordinates =
        result3D.Noisepoints
        |> Seq.map (fun pt -> pt.[0], pt.[1], pt.[2])
        // Duplicate coordinates are dropped: the plot looks the same, but
        // fewer markers have to be drawn.
        |> Seq.distinct
    Chart.Scatter3D(noiseCoordinates, StyleParam.Mode.Markers)
    |> Chart.withTraceInfo "Noise"
// Title for the 3D result chart: the settings used (eps3D, minPts), the
// total number of points, the number of clusters and the number of noise
// points. Counts are taken before de-duplication, i.e. from the raw result.
let chartname3D =
    let noisePts = Seq.length result3D.Noisepoints
    let clusters = Seq.length result3D.Clusterlist
    let clusteredPts = result3D.Clusterlist |> Seq.sumBy Seq.length
    sprintf
        "eps: %.1f minPts: %i pts: %i cluster: %i noisePts: %i"
        eps3D
        minPts
        (noisePts + clusteredPts)
        clusters
        noisePts
// Final 3D figure: noise trace plus cluster traces, titled with the summary
// string and with axes named after the header3D columns.
Chart.combine [ chartNoise3D; chartCluster3D ]
|> Chart.withTitle chartname3D
|> Chart.withXAxisStyle header3D.[0]
|> Chart.withYAxisStyle header3D.[1]
|> Chart.withZAxisStyle header3D.[2]
// For faster computation a squared Euclidean distance can be used (here
// DistanceMetrics.Array.euclideanNaNSquared) together with the square of eps:
// for non-negative distances d, d <= eps holds exactly when d^2 <= eps^2.
// NOTE(review): the "NaN" in the function name suggests special handling of
// NaN values - confirm against the FSharp.Stats documentation.
//
// minPts and eps3D are reused instead of repeating the literals 20 and 0.7,
// so this variant keeps the same settings as result3D when they are tuned.
// Defined as a unit function, so the clustering only runs when
// clusteredChart3D() is called.
let clusteredChart3D() =
    DbScan.compute DistanceMetrics.Array.euclideanNaNSquared minPts (eps3D ** 2.) data3D
the absolute variation is low.