#r "nuget: Deedle.Interactive, 3.0.0"
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharp.Data, 4.2.7"

open FSharp.Data
open Deedle
open Plotly.NET

// Download the raw housefly wing length dataset as a single string via FSharp.Data.
let rawDataHousefly =
    let url = @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/HouseflyWingLength.txt"
    Http.RequestString url

// Housefly wing lengths, converted to mm.
let dataHousefly : seq<float> = 
    // The same key is used as the schema (to name the single column) and to select that column again.
    // NOTE(review): the positional `false` presumably is `hasHeaders` (cf. the named argument used for Frame.ReadCsv below) - confirm.
    let columnKey = "wing length (mm * 10^1)"
    Frame.ReadCsvString(rawDataHousefly, false, schema = columnKey)
    |> Frame.getCol columnKey
    |> Series.values
    // The raw values are given in mm * 10, so we divide by 10 to obtain mm.
    |> Seq.map (fun tenthOfMm -> tenthOfMm / 10.)

// Box plot of the housefly wing lengths, with the y axis styled using the unit label.
Chart.BoxPlot(Y = dataHousefly, Name = "housefly", BoxPoints = StyleParam.BoxPoints.All, Jitter = 0.2)
|> Chart.withYAxisStyle "wing length [mm]"

open FSharp.Stats
open FSharp.Stats.Testing

// The testing module in FSharp.Stats requires vectors as input, thus we transform our sequence into a vector:
let vectorDataHousefly = dataHousefly |> vector

// The expected value of our population, i.e. the value the sample is tested against.
let expectedValue = 4.5

// Perform the one-sample t-test with our vectorized data and our expected value as parameters.
let oneSampleResult =
    (vectorDataHousefly, expectedValue)
    ||> TTest.oneSample

// Bare expression so that the test result is shown as output.
oneSampleResult

1.2756249193674383

99

0.8974634107766597

0.1025365892233403

0.2050731784466806

open System.Text

// Download the raw (tab-separated) concussion dataset as a single string.
let rawDataAthletes =
    let url = @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/ConcussionsInMaleAndFemaleCollegeAthletes_adapted.tsv"
    Http.RequestString url

// Wrap the downloaded text in an in-memory stream of its UTF-8 bytes so it can be passed to Frame.ReadCsv.
let dataAthletesAsStream =
    let utf8Bytes = Encoding.UTF8.GetBytes(rawDataAthletes)
    new System.IO.MemoryStream(utf8Bytes)

// The data is tab-separated and read without a header row; the schema sets the column keys instead.
let dataAthletesAsFrame =
    Frame.ReadCsv(
        dataAthletesAsStream,
        hasHeaders = false,
        separators = "\t",
        schema = "Gender, Sports, Year, Concussion, Count"
    )

// Bare expression so that the frame is shown as output.
dataAthletesAsFrame

// We only need the concussion counts per gender. Thus, we keep only the rows whose "Concussion" value is true,
// split them by "Gender", and extract the "Count" column of each group as a vector.
let dataAthletesFemale, dataAthletesMale =
    // The concussion filter does not depend on the gender, so it is applied once and shared by both groups.
    let dataAthletesOnlyConcussion =
        dataAthletesAsFrame
        |> Frame.filterRows (fun _ row -> row.GetAs<bool>("Concussion"))
    // Returns the "Count" values of all concussion rows belonging to the given gender.
    let getAthleteGenderData (gender: string) =
        dataAthletesOnlyConcussion
        |> Frame.filterRows (fun _ row -> row.GetAs<string>("Gender") = gender)
        |> Frame.getCol "Count" 
        |> Series.values
        |> vector
    getAthleteGenderData "Female", getAthleteGenderData "Male"

// One box plot per gender, combined into a single chart.
[
    "female college athletes", dataAthletesFemale
    "male college athletes", dataAthletesMale
]
|> List.map (fun (groupName, concussionCounts) ->
    Chart.BoxPlot(Y = concussionCounts, Name = groupName, BoxPoints = StyleParam.BoxPoints.All, Jitter = 0.2)
    )
|> Chart.combine
|> Chart.withYAxisStyle "number of concussions over 3 years"

// We test both samples against each other, assuming equal variances.
let twoSampleResult =
    let assumeEqualVariances = true
    TTest.twoSample assumeEqualVariances dataAthletesFemale dataAthletesMale

// Bare expression so that the test result is shown as output.
twoSampleResult

0.56161040164984

28

0.7105752703384163

0.2894247296615837

0.5788494593231674

// Download the raw (tab-separated, wide format) caffeine/endurance dataset as a single string.
let rawDataCaffeine =
    let url = @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/CaffeineAndEndurance(wide)_adapted.tsv"
    Http.RequestString url

// Wrap the downloaded text in an in-memory stream of its UTF-8 bytes so it can be passed to Frame.ReadCsv.
let dataCaffeineAsStream =
    let utf8Bytes = Encoding.UTF8.GetBytes(rawDataCaffeine)
    new System.IO.MemoryStream(utf8Bytes)

// The data is tab-separated and read without a header row; the schema sets the column keys instead.
let dataCaffeineAsFrame =
    Frame.ReadCsv(
        dataCaffeineAsStream,
        hasHeaders = false,
        separators = "\t",
        schema = "Subject ID, no Dose, 5 mg, 9 mg, 13 mg"
    )

// We want to compare the subjects' performances under the influence of 13 mg caffeine and in the control situation,
// so we extract the "no Dose" and the "13 mg" columns as vectors.
let dataCaffeineNoDose, dataCaffeine13mg =
    // Returns the values of the column with the given key as a vector.
    let columnAsVector columnKey =
        vector (Series.values (Frame.getCol columnKey dataCaffeineAsFrame))
    columnAsVector "no Dose", columnAsVector "13 mg"

// Transforming our data into a chart: one line per participant, connecting the "no dose" value with the "13 mg" value.
Seq.zip dataCaffeineNoDose dataCaffeine13mg
|> Seq.mapi (fun index (control, treatment) -> 
    // The index of the pair within the zipped sequence is used as the participant number.
    let label = sprintf "Person %d" index
    Chart.Line(["no dose", control; "13 mg", treatment], Name = label)
    )
|> Chart.combine
|> Chart.withXAxisStyle ""
|> Chart.withYAxisStyle("endurance performance", MinMax = (0.,100.))

// Paired two-sample t-test of the control ("no Dose") values against the "13 mg" values.
let twoSamplePairedResult =
    (dataCaffeineNoDose, dataCaffeine13mg)
    ||> TTest.twoSamplePaired

// Bare expression so that the test result is shown as output.
twoSamplePairedResult

3.2525076715666534

8

0.9941713793753144

0.005828620624685588

0.011657241249371175

		Gender	Sports	Year	Concussion	Count
		(string)	(string)	(int)	(Boolean)	(int)
0	->	Female	Soccer	1997	False	24930
1	->	Female	Soccer	1997	True	51
2	->	Female	Soccer	1998	False	22887
3	->	Female	Soccer	1998	True	47
4	->	Female	Soccer	1999	False	27107
:		...	...	...	...	...
55	->	Male	Gymnastics	1997	True	0
56	->	Male	Gymnastics	1998	False	221
57	->	Male	Gymnastics	1998	True	0
58	->	Male	Gymnastics	1999	False	1179
59	->	Male	Gymnastics	1999	True	0

Statistic	3.2525076715666534
DegreesOfFreedom	8
PValueLeft	0.9941713793753144
PValueRight	0.005828620624685588
PValue	0.011657241249371175

Testing with FSharp.Stats I: t-test

Posted on 2021-7-29 by Oliver Maus in Data Science

Testing with FSharp.Stats I: t-test

Getting started: The t-test

One-sample t-test

Two-sample t-test (unpaired data)

Two-sample t-test (paired data)

Statistic	1.2756249193674383
DegreesOfFreedom	99
PValueLeft	0.8974634107766597
PValueRight	0.1025365892233403
PValue	0.2050731784466806

Statistic	0.56161040164984
DegreesOfFreedom	28
PValueLeft	0.7105752703384163
PValueRight	0.2894247296615837
PValue	0.5788494593231674