// can't yet format YamlFrontmatter (["title: In silico gene expression"; "category: Datasets"; "categoryindex: 1"; "index: 5"], Some { StartLine = 2 StartColumn = 0 EndLine = 6 EndColumn = 8 }) to pynb markdown

[![Binder](/datasets/img/badge-binder.svg)](https://mybinder.org/v2/gh/plotly/Plotly.NET/gh-pages?filepath=05_InSilicoGeneExpression.ipynb)&emsp;
[![Script](/datasets/img/badge-script.svg)](/datasets/05_InSilicoGeneExpression.fsx)&emsp;
[![Notebook](/datasets/img/badge-notebook.svg)](/datasets/05_InSilicoGeneExpression.ipynb)

# The _in silico gene expression_ dataset

**Table of contents**

- [Description](#Description)
- [How to use](#How-to-use)
- [Examples](#Examples)

## Description

This is an in silico dataset. It emulates the expression of 100 genes over 3 conditions, with 3 replicates each. It is constructed so that replicates of the same condition are more similar to each other.

7% of the values are dropped, as the dataset was originally made to showcase missing value imputation.

## How to use




In [1]:
#r "nuget: FSharp.Data"
#r "nuget: Deedle"

open FSharp.Data
open Deedle

// Fetch the dataset as raw CSV text over HTTP.
let rawData =
    let datasetUrl = @"https://raw.githubusercontent.com/fslaborg/datasets/main/data/InSilicoGeneExpression.csv"
    Http.RequestString(datasetUrl)

// Parse the CSV text into a Deedle frame, then re-index its rows by the "Key" column.
// Note: the exact parsing settings may differ depending on e.g. the separator
// used in the individual dataset.
let df : Frame<string,string> =
    let parsedFrame = Frame.ReadCsvString(rawData)
    Frame.indexRows "Key" parsedFrame

// Print the frame so its contents can be inspected.
df.Print()


Condition0_1     Condition0_2     Condition0_3     Condition1_1     Condition1_2     Condition1_3     Condition2_1     Condition2_2     Condition2_3     Gene0  -> <missing>        <missing>        859.507048737706 892.488061131967 1018.39682842723 <missing>        1103.47465251202 1157.72940330711 1065.74060396554 Gene1  -> 874.831680800388 750.248739657293 885.186911420285 928.994516057073 853.081858812674 793.574297701139 1065.97949919587 1131.14376992316 <missing>        Gene2  -> 838.556912459832 852.727407339623 899.295260312015 860.880771705626 932.199854945633 976.124808642915 1207.93463145272 <missing>        1277.61049813247 Gene3  -> 578.81785907921  678.347549342628 602.246497320338 <missing>        643.093516693419 <missing>        <missing>        873.194740469258 849.451122811244 Gene4  -> 842.094396445274 965.835426665507 867.369051645365 928.252271146921 881.501122913359 <missing>        1054.1287942036  1171.60939846118 1038.00577431047 Gene5  -> 1020.09691148753 1074.

## Examples

Compute a correlation matrix between the genes after imputing the missing values




In [2]:
#r "nuget: FSharp.Stats, 0.4.2"
#r "nuget: Plotly.NET, 2.0.0-preview.6"

open FSharp.Stats
open FSharp.Stats.ML
open Plotly.NET

// Imputation strategy: k-nearest-neighbour imputation with k = 2.
let kn : Impute.MatrixBaseImputation<float[],float> = Impute.kNearestImpute 2

// Fill in the missing values of the frame.
// The frame is first converted to a jagged array (the input shape used with "imputeBy"),
// entries flagged by Ops.isNan are treated as missing, and the result is turned into a matrix.
let imputedData =
    let frameValues = Frame.toJaggedArray df
    let completedValues = Impute.imputeBy kn Ops.isNan frameValues
    Matrix.ofJaggedSeq completedValues

// Row-wise Pearson correlation of the imputed matrix, giving a correlation matrix.
let correlationMatrix =
    imputedData
    |> Correlation.Matrix.rowWisePearson

// Plotly heatmap built from the rows of the correlation matrix.
let correlationHeatmap =
    let correlationRows = Matrix.toJaggedArray correlationMatrix
    correlationRows
    |> Chart.Heatmap


In [None]:
// Bare expression: evaluates to the heatmap chart bound in the previous cell.
// NOTE(review): presumably this makes the notebook render the chart as the cell output; confirm in the target notebook environment.
correlationHeatmap
