// Packages hosted by the FsLab community
#r "nuget: Deedle.Interactive, 3.0.0"
#r "nuget: FSharp.Stats"
// Third-party .NET packages
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharpAux"
#r "nuget: FSharp.Data"

open FSharp.Stats

// Quick check that FSharp.Stats was referenced correctly: evaluate the factorial of 3.
3 |> SpecialFunctions.Factorial.factorial

open FSharp.Data
open Deedle

// Download the raw, tab-separated housing data set as a single string (FSharp.Data).
let rawData =
    let housingDataUrl = @"https://raw.githubusercontent.com/dotnet/machinelearning/master/test/data/housing.txt"
    Http.RequestString housingDataUrl

// Parse the downloaded text into a Deedle data frame. The first line holds the
// column names and the fields are separated by tabs.
// Note: instead of downloading, a local file can be read as well.
let df =
    Frame.ReadCsvString(rawData, separators = "\t", hasHeaders = true)

df

// Note: outside of a notebook the frame is not rendered automatically; print it with
// df.Print true

// Rows of `df` whose "CharlesRiver" flag reads as false, reduced to the three
// columns used in the remainder of the analysis.
let housesNotAtRiver = 
    let columnsOfInterest = ["RoomsPerDwelling";"MedianHomeValue";"CharlesRiver"]
    let isAtRiver (row: ObjectSeries<string>) = row.GetAs<bool>("CharlesRiver")
    df
    |> Frame.sliceCols columnsOfInterest
    |> Frame.filterRowValues (isAtRiver >> not)

housesNotAtRiver

open Plotly.NET

// Median home values of the houses not at the river as a plain F# sequence.
// The float annotation tells Deedle which value type to read the column as; the
// row keys are dropped because they are irrelevant for plotting a distribution.
let pricesNotAtRiver : seq<float> = 
    let medianValues : Series<int,float> = Frame.getCol "MedianHomeValue" housesNotAtRiver
    Series.values medianValues
    

// Histogram of the median home values of houses not at the river.
// The binned values (prices) run along the x axis and the y axis shows how many
// houses fall into each bin, so the price label belongs on the x axis. The former
// x-axis text described the whole plot and is therefore used as the chart title.
Chart.Histogram(pricesNotAtRiver)
|> Chart.withXAxisStyle("median value of owner occupied homes in 1000s")
|> Chart.withYAxisStyle("count")
|> Chart.withTitle("price distribution")

// Note: If you are working outside of a notebook, you may want to show the chart in browser using
// |> Chart.show

// Rows of `df` whose "CharlesRiver" flag reads as true, reduced to the three
// columns used in the remainder of the analysis.
let housesAtRiver = 
    let columnsOfInterest = ["RoomsPerDwelling";"MedianHomeValue";"CharlesRiver"]
    let isAtRiver (row: ObjectSeries<string>) = row.GetAs<bool>("CharlesRiver")
    df
    |> Frame.sliceCols columnsOfInterest
    |> Frame.filterRowValues isAtRiver

// Median home values of the houses at the river as a plain F# sequence of floats
// (row keys are dropped).
let pricesAtRiver : seq<float> = 
    let medianValues : Series<int,float> = Frame.getCol "MedianHomeValue" housesAtRiver
    Series.values medianValues


// Overlay of both price histograms (houses not at the river vs. at the river).
// Prices are binned along the x axis and the y axis shows the number of houses
// per bin, so the price label belongs on the x axis. The former x-axis text
// described the whole plot and is therefore used as the chart title.
[
    Chart.Histogram(pricesNotAtRiver, Opacity = 0.66, OffsetGroup = "A")
    |> Chart.withTraceInfo "not at river"
    Chart.Histogram(pricesAtRiver, Opacity = 0.66, OffsetGroup = "A")
    |> Chart.withTraceInfo "at river"
]
|> Chart.combine
|> Chart.withXAxisStyle("median value of owner occupied homes in 1000s")
|> Chart.withYAxisStyle("count")
|> Chart.withTitle("Comparison of price distributions")

open FSharp.Stats
open FSharpAux
open FSharp.Stats.Correlation

// Median home value of every house in the full data set, keyed by row index.
let pricesAll : Series<int,float> = Frame.getCol "MedianHomeValue" df

// Rooms per dwelling of every house in the full data set, keyed by row index.
let roomsPerDwellingAll : Series<int,float> = Frame.getCol "RoomsPerDwelling" df

// Pearson correlation between home price and rooms per dwelling.
// Only rows present in both series are paired (inner zip on the row key).
let correlation = 
    let pairedValues =
        Series.zipInner pricesAll roomsPerDwellingAll
        |> Series.values
    let prices, rooms = Seq.unzip pairedValues
    Seq.pearson prices rooms

correlation

open Fitting.LinearRegression.OLS

/// Fits a univariable linear model (price as a function of rooms per dwelling) to
/// the "RoomsPerDwelling" and "MedianHomeValue" columns of `data` and returns a
/// combined chart with the raw points and the fitted line.
/// `description` prefixes both trace names so that several results can be told
/// apart when they are combined into one chart.
let predictPricesByRooms description data = 
    let prices : Series<_,float> = Frame.getCol "MedianHomeValue" data
    let rooms : Series<_,float> = Frame.getCol "RoomsPerDwelling" data

    // Pair rooms and prices by row key and order the pairs by room count.
    let xs, ys =
        Series.zipInner rooms prices
        |> Series.sortBy fst
        |> Series.values
        |> Seq.unzip

    let coeffs = Linear.Univariable.fit (vector xs) (vector ys)
    let predict = Linear.Univariable.predict coeffs
    let fitted = Seq.map predict xs

    let observedTrace =
        Chart.Point(xs,ys)
        |> Chart.withTraceInfo (sprintf "%s: data" description )
    let fittedTrace =
        Chart.Line(xs,fitted)
        |> Chart.withTraceInfo (sprintf "%s: coefficients: intercept:%f, slope:%f" description coeffs.[0] coeffs.[1])

    [ observedTrace; fittedTrace ]
    |> Chart.combine
    |> Chart.withXAxisStyle("rooms per dwelling")
    |> Chart.withYAxisStyle("median value")

// Run the regression for both groups of houses and show the results in one chart.
[
    "not at river", housesNotAtRiver
    "at river", housesAtRiver
]
|> List.map (fun (label, houses) -> predictPricesByRooms label houses)
|> Chart.combine
|> Chart.withSize(1200.,700.)

		MedianHomeValue	CrimesPerCapita	PercentResidental	PercentNonRetail	CharlesRiver	NitricOxides	RoomsPerDwelling	PercentPre40s	EmploymentDistance	HighwayDistance	TaxRate	TeacherRatio	BlackIndex	PercentLowIncome
		(Decimal)	(Decimal)	(Decimal)	(Decimal)	(int)	(Decimal)	(Decimal)	(Decimal)	(Decimal)	(int)	(Decimal)	(Decimal)	(Decimal)	(Decimal)
0	->	24.00	0.00632	18.00	2.310	0	0.5380	6.5750	65.20	4.0900	1	296.0	15.30	396.90	4.98
1	->	21.60	0.02731	0.00	7.070	0	0.4690	6.4210	78.90	4.9671	2	242.0	17.80	396.90	9.14
2	->	34.70	0.02729	0.00	7.070	0	0.4690	7.1850	61.10	4.9671	2	242.0	17.80	392.83	4.03
3	->	33.40	0.03237	0.00	2.180	0	0.4580	6.9980	45.80	6.0622	3	222.0	18.70	394.63	2.94
4	->	36.20	0.06905	0.00	2.180	0	0.4580	7.1470	54.20	6.0622	3	222.0	18.70	396.90	5.33
:		...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	->	22.40	0.06263	0.00	11.930	0	0.5730	6.5930	69.10	2.4786	1	273.0	21.00	391.99	9.67
502	->	20.60	0.04527	0.00	11.930	0	0.5730	6.1200	76.70	2.2875	1	273.0	21.00	396.90	9.08
503	->	23.90	0.06076	0.00	11.930	0	0.5730	6.9760	91.00	2.1675	1	273.0	21.00	396.90	5.64
504	->	22.00	0.10959	0.00	11.930	0	0.5730	6.7940	89.30	2.3889	1	273.0	21.00	393.45	6.48
505	->	11.90	0.04741	0.00	11.930	0	0.5730	6.0300	80.80	2.5050	1	273.0	21.00	396.90	7.88

		RoomsPerDwelling	MedianHomeValue	CharlesRiver
		(Decimal)	(Decimal)	(int)
0	->	6.5750	24.00	0
1	->	6.4210	21.60	0
2	->	7.1850	34.70	0
3	->	6.9980	33.40	0
4	->	7.1470	36.20	0
:		...	...	...
501	->	6.5930	22.40	0
502	->	6.1200	20.60	0
503	->	6.9760	23.90	0
504	->	6.7940	22.00	0
505	->	6.0300	11.90	0

Getting started

Posted on 2021-2-9 by David Zimmer in Data Science

Last updated on 2023-9-30 by Jay Goldstein

Getting started

Referencing packages

Data crunching

Data exploration