Summary: This tutorial demonstrates hierarchical clustering with FSharp.Stats and how to visualize the results with Plotly.NET.
In the previous article of this series, k-means clustering using FSharp.Stats was introduced.
Clustering methods can be used to group the elements of a large data set based on their similarity. Elements sharing similar properties cluster together and can be reported as a coherent group. Many clustering algorithms require a predefined cluster number that has to be provided by the experimenter. Hierarchical clustering (hClust) does not require such a predefined cluster number. Instead, hierarchical clustering results in a tree structure that has a single cluster (node) at its root and recursively splits up into clusters of elements that are more similar to each other than to elements of other clusters. To generate multiple clustering results with different cluster numbers, the clustering has to be performed only once; subsequently, a cluster number can be chosen to split the clustering tree into the desired number of clusters. The clustering tree is often represented as a dendrogram. Two general strategies exist:
Agglomerative (bottom-up): Each data point starts in its own cluster, and the closest clusters are merged recursively.
Divisive (top-down): All data points start in a single cluster, which is recursively divided into sub-clusters that are far away from each other.
The presented implementation is an agglomerative type.
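To build intuition for the agglomerative strategy, here is a toy sketch of a single bottom-up step on one-dimensional points (an illustration only, not the FSharp.Stats implementation; the function `mergeClosest` is hypothetical): the two clusters whose means are closest are merged, while all other clusters stay untouched.

```fsharp
// One agglomerative step on 1D points: merge the two closest clusters,
// using the distance between cluster means as a stand-in linkage criterion.
let mergeClosest (clusters: float list list) =
    let indexed = List.indexed clusters
    // all index pairs (i, j) with i < j, ranked by distance between cluster means
    let (i, j) =
        [ for (i, _) in indexed do
            for (j, _) in indexed do
                if i < j then yield (i, j) ]
        |> List.minBy (fun (i, j) ->
            abs (List.average clusters.[i] - List.average clusters.[j]))
    // merge the closest pair and keep the remaining clusters unchanged
    (clusters.[i] @ clusters.[j])
    :: (indexed |> List.filter (fun (k, _) -> k <> i && k <> j) |> List.map snd)

// repeated application builds the cluster tree bottom-up:
// [[1.0]; [1.2]; [5.0]] -> [[1.0; 1.2]; [5.0]]
mergeClosest [ [1.0]; [1.2]; [5.0] ]
```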
There are several distance metrics that can be used as the distance function; the most commonly used one is probably the Euclidean distance.
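For intuition, here is a minimal sketch of the Euclidean distance between two points given as float arrays (FSharp.Stats ships ready-made implementations in FSharp.Stats.ML.DistanceMetrics, one of which is used below):

```fsharp
// minimal sketch: Euclidean distance between two points a and b
let euclidean (a: float []) (b: float []) =
    Array.fold2 (fun acc x y -> acc + (x - y) ** 2.) 0. a b
    |> sqrt
```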
When the distance between two clusters is calculated, there are several linkage types to choose from (a minimal sketch of three of them follows the list):
complete linkage: maximal pairwise distance between the clusters (prone to break large clusters)
single linkage: minimal pairwise distance between the clusters (sensitive to outliers)
centroid linkage: distance between the two cluster centroids
average linkage: average pairwise distance between the clusters (sensitive to cluster shape and size)
median linkage: median pairwise distance between the clusters
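To make these definitions concrete, here is a minimal sketch of the complete, single, and average linkage criteria. It assumes clusters are given as arrays of points and `dist` is any distance function (e.g. the `euclidean` sketch above); the helper names are illustrative and not part of the FSharp.Stats API.

```fsharp
// all pairwise distances between members of two clusters
let pairwiseDists dist (cA: float [][]) (cB: float [][]) =
    [ for a in cA do for b in cB -> dist a b ]

// complete linkage: maximal pairwise distance
let completeLinkage dist cA cB = pairwiseDists dist cA cB |> List.max
// single linkage: minimal pairwise distance
let singleLinkage dist cA cB = pairwiseDists dist cA cB |> List.min
// average linkage: average pairwise distance
let averageLinkage dist cA cB = pairwiseDists dist cA cB |> List.average
```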
To demonstrate hierarchical clustering, the classic iris data set is used. It consists of 150 records, each of which contains four measurements and a species identifier.
#r "nuget: Deedle.Interactive, 3.0.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharp.Data, 4.2.7"
open FSharp.Data
open Deedle
// Retrieve data using the FSharp.Data package and read it as dataframe using the Deedle package
let rawData = Http.RequestString @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/iris.csv"
let df = Frame.ReadCsvString(rawData)
df
|     | sepal_length (Decimal) | sepal_width (Decimal) | petal_length (Decimal) | petal_width (Decimal) | species (string) |
|-----|---|---|---|---|---|
| 0   | 5.5 | 2.4 | 3.8 | 1.1 | versicolor |
| 1   | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 2   | 7.6 | 3   | 6.6 | 2.1 | virginica |
| 3   | 5.6 | 2.8 | 4.9 | 2   | virginica |
| 4   | 6.1 | 3   | 4.9 | 1.8 | virginica |
| ... | ... | ... | ... | ... | ... |
| 145 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
| 146 | 5.7 | 2.6 | 3.5 | 1   | versicolor |
| 147 | 5.9 | 3   | 5.1 | 1.8 | virginica |
| 148 | 6.8 | 3.2 | 5.9 | 2.3 | virginica |
| 149 | 5   | 3.6 | 1.4 | 0.2 | setosa |
150 rows x 5 columns
0 missing values
Let's take a first look at the data with a heatmap using Plotly.NET. Each of the 150 records consists of four measurements and a species identifier. Since the species identifiers occur several times (virginica, versicolor, and setosa), we create unique labels by appending the row index to the species identifier.
open Plotly.NET
let colNames = ["sepal_length";"sepal_width";"petal_length";"petal_width"]
// isolate data as float [] []
let data =
    Frame.dropCol "species" df
    |> Frame.toJaggedArray

// isolate labels as string []
let labels =
    Frame.getCol "species" df
    |> Series.values
    |> Seq.mapi (fun i s -> sprintf "%s_%i" s i)
    |> Array.ofSeq
Chart.Heatmap(data,colNames=colNames,rowNames=labels)
// required to fit the species identifier on the left side of the heatmap
|> Chart.withMarginSize(Left=100.)
|> Chart.withTitle "raw iris data"
The function that performs hierarchical clustering can be found at FSharp.Stats.ML.Unsupervised.HierarchicalClustering.generate. It requires three input parameters:

- a distance measure working on 'T (e.g. from FSharp.Stats.ML.DistanceMetrics)
- a linkage type
- the data to cluster as a collection of 'T
open FSharp.Stats.ML
open FSharp.Stats.ML.Unsupervised
let distanceMeasure = DistanceMetrics.euclideanNaNSquared
let linker = HierarchicalClustering.Linker.centroidLwLinker
// calculates the clustering and reports a single root cluster (node)
// that may recursively contain further nodes
let clusterResultH =
    HierarchicalClustering.generate distanceMeasure linker data
// If a desired cluster number is specified, the following function cuts the tree at the
// depth that results in the respective number of clusters (here 3). Only leaves are reported.
let threeClusters = HierarchicalClustering.cutHClust 3 clusterResultH
Every cluster leaf contains its raw values and an index that indicates the position of the respective data point in the raw data. The index can be retrieved from leaves using HierarchicalClustering.getClusterId.
// Detailed information for the 3 clusters is given
let inspectThreeClusters =
    threeClusters
    |> List.map (fun cluster ->
        cluster
        |> List.map (fun leaf ->
            labels.[HierarchicalClustering.getClusterId leaf]
        )
    )
inspectThreeClusters
|> List.mapi (fun i x ->
    let truncCluster = x.[0..4] |> String.concat "; "
    sprintf "Cluster%i: [%s ...]" i truncCluster
)
|> String.concat "\n"
Cluster0: [versicolor_44; versicolor_76; versicolor_89; versicolor_12; versicolor_93 ...]
Cluster1: [setosa_16; setosa_36; setosa_74; setosa_42; setosa_122 ...]
Cluster2: [virginica_72; virginica_135; virginica_14; virginica_7; virginica_73 ...]
To break up the tree structure but maintain the clustering order, the cluster tree has to be flattened.
// To recursively flatten the cluster tree into leaves only, use flattenHClust.
// A leaf list is reported that does not contain any cluster membership,
// but is sorted by the clustering result.
let hLeaves =
    clusterResultH
    |> HierarchicalClustering.flattenHClust
// Takes the sorted cluster result and reports a tuple of label and data value.
let dataSortedByClustering =
    hLeaves
    |> Seq.choose (fun c ->
        let label = labels.[HierarchicalClustering.getClusterId c]
        let values = HierarchicalClustering.tryGetLeafValue c
        match values with
        | None -> None
        | Some x -> Some (label, x)
    )
The visualization again is performed using a Plotly.NET heatmap.
open FSharpAux
// Seq.unzip is provided by FSharpAux
let (hLabels, hData) =
    dataSortedByClustering
    |> Seq.unzip

Chart.Heatmap(hData,colNames=colNames,rowNames=hLabels)
// required to fit the species identifier on the left side of the heatmap
|> Chart.withMarginSize(Left=100.)
|> Chart.withTitle "Clustered iris data (hierarchical clustering)"
In the heatmap of the clustered data, records of the same species largely group together; within these groups the absolute variation is low.
The examples are taken from the FSharp.Stats documentation, which covers various techniques for determining an optimal cluster number.
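As a quick, hedged starting point using only the cutHClust function introduced above, the tree can be cut at several depths to compare the resulting cluster sizes:

```fsharp
// cluster sizes obtained when cutting the tree into 2 to 5 clusters
[2 .. 5]
|> List.map (fun k ->
    HierarchicalClustering.cutHClust k clusterResultH
    |> List.map List.length)
```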
The next article in this series covers DBSCAN using FSharp.Stats.