diff --git a/docs/01_03_biocollections.fsx b/docs/01_03_biocollections.fsx deleted file mode 100644 index 5be6bae8..00000000 --- a/docs/01_03_biocollections.fsx +++ /dev/null @@ -1,181 +0,0 @@ -(** ---- -title: BioCollections -category: BioFSharp Core -categoryindex: 1 -index: 3 ---- -*) - -(*** hide ***) - -(*** condition: prepare ***) -#r "nuget: FSharpAux, 1.1.0" -#r "nuget: FSharpAux.IO, 1.1.0" -#r "nuget: FSharp.Stats, 0.4.3" -#r "nuget: Plotly.NET, 2.0.0-preview.18" -#r "../src/BioFSharp/bin/Release/netstandard2.0/BioFSharp.dll" -#r "../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll" -#r "../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll" -#r "../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll" -#r "../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll" - -(*** condition: ipynb ***) -#if IPYNB -#r "nuget: FSharpAux, 1.1.0" -#r "nuget: FSharpAux.IO, 1.1.0" -#r "nuget: FSharp.Stats, 0.4.3" -#r "nuget: Plotly.NET, 2.0.0-preview.18" -#r "nuget: Plotly.NET.Interactive, 2.0.0-preview.18" -#r "nuget: BioFSharp, {{fsdocs-package-version}}" -#r "nuget: BioFSharp.IO, {{fsdocs-package-version}}" -#r "nuget: BioFSharp.BioContainers, {{fsdocs-package-version}}" -#r "nuget: BioFSharp.ML, {{fsdocs-package-version}}" -#r "nuget: BioFSharp.Stats, {{fsdocs-package-version}}" -#endif // IPYNB - -(** -# BioCollections - -[![Binder]({{root}}img/badge-binder.svg)](https://mybinder.org/v2/gh/CSBiology/BioFSharp/gh-pages?filepath={{fsdocs-source-basename}}.ipynb)  -[![Script]({{root}}img/badge-script.svg)]({{root}}{{fsdocs-source-basename}}.fsx)  -[![Notebook]({{root}}img/badge-notebook.svg)]({{root}}{{fsdocs-source-basename}}.ipynb) - -*Summary:* This example shows how to use collections of biological items in BioFSharp - -Analogous to the build-in collections BioFSharp provides BioSeq, BioList and BioArray for individual collection specific optimized operations. 
-The easiest way to create them are the `ofBioItemString` -functions -*) - -open BioFSharp - -let s1 = "PEPTIDE" |> BioSeq.ofAminoAcidString -let s2 = "PEPTIDE" |> BioList.ofAminoAcidString -let s3 = "TAGCAT" |> BioArray.ofNucleotideString - -//Peptide represented as a Bioseq -"PEPTIDE" |> BioSeq.ofAminoAcidString -(***include-it***) - -//Peptide represented as a BioList -"PEPTIDE"|> BioList.ofAminoAcidString -(***include-it***) - -//Nucleotide sequence represented as a BioArray -"TAGCAT" |> BioArray.ofNucleotideString -(***include-it***) - -(** -## Nucleotides - -![Nucleotides1](img/Nucleotides.svg) - -**Figure 1: Selection of covered nucleotide operations** (A) Biological principle. (B) Workflow with `BioSeq`. (C) Other covered functionalities. - -Let's imagine you have a given gene sequence and want to find out what the according protein might look like. -*) -let myGene = BioArray.ofNucleotideString "ATGGCTAGATCGATCGATCGGCTAACGTAA" - - -(*** include-value:myGene ***) - -(** -Yikes! Unfortunately we got the 5'-3' coding strand. 
For proper transcription we should get the complementary strand first: -*) - -let myProperGene = BioArray.complement myGene - -(*** include-value:myProperGene ***) - -(** -Now let's transcribe and translate it: -*) - -let myTranslatedGene = - myProperGene - |> BioArray.transcribeTemplateStrand - |> BioArray.translate 0 - -(*** include-value:myTranslatedGene ***) - -(** -Of course, if your input sequence originates from the coding strand, you can directly transcribe it to mRNA since the -only difference between the coding strand and the mRNA is the replacement of 'T' by 'U' (Figure 1B) -*) - -let myTranslatedGeneFromCodingStrand = - myGene - |> BioArray.transcribeCodingStrand - |> BioArray.translate 0 - -(*** include-value:myTranslatedGeneFromCodingStrand ***) - -(** -Other Nucleotide conversion operations are also covered: -*) - -let mySmallGene = BioSeq.ofNucleotideString "ATGTTCCGAT" -(***include-value:mySmallGene***) - -let smallGeneRev = BioSeq.reverse mySmallGene -(***include-value:smallGeneRev***) - -let smallGeneComp = BioSeq.complement mySmallGene -(***include-value:smallGeneComp***) - -let smallGeneRevComp = BioSeq.reverseComplement mySmallGene -(***include-value:smallGeneRevComp***) - -(** - -## AminoAcids - -### Basics -Some functions which might be needed regularly are defined to work with nucleotides and amino acids: -*) - -let myPeptide = "PEPTIDE" |> BioSeq.ofAminoAcidString -(*** include-value:myPeptide ***) - -let myPeptideFormula = BioSeq.toFormula myPeptide |> Formula.toString -(*** include-value:myPeptideFormula ***) - -let myPeptideMass = BioSeq.toAverageMass myPeptide -(*** include-value:myPeptideMass ***) - -(** - -### Digestion -BioFSharp also comes equipped with a set of tools aimed at cutting apart amino acid sequences. To demonstrate the usage, we'll throw some `trypsin` at the small RuBisCO subunit of _Arabidopos thaliana_: -In the first step, we define our input sequence and the protease we want to use. 
-*) - -let RBCS = - """MASSMLSSATMVASPAQATMVAPFNGLKSSAAFPATRKANNDITSITSNGGRVNCMQVWP - PIGKKKFETLSYLPDLTDSELAKEVDYLIRNKWIPCVEFELEHGFVYREHGNSPGYYDGR - YWTMWKLPLFGCTDSAQVLKEVEECKKEYPNAFIRIIGFDNTRQVQCISFIAYKPPSFT""" - |> BioArray.ofAminoAcidString - -(***include-value:RBCS***) - -let trypsin = Digestion.Table.getProteaseBy "Trypsin" - -(** -With these two things done, digesting the protein is a piece of cake. For doing this, just use the `digest` function. -*) -let digestedRBCS = Digestion.BioArray.digest trypsin 0 RBCS - -digestedRBCS -|> Seq.head -(***include-it***) - -(* -In reality, proteases don't always completely cut the protein down. Instead, some sites stay intact and should be considered for in silico analysis. -This can easily be done with the `concernMissCleavages` function. It takes the minimum and maximum amount of misscleavages you want to have and also the digested protein. As a result you get all possible combinations arising from this information. -*) - -let digestedRBCS' = Digestion.BioArray.concernMissCleavages 0 2 digestedRBCS - -digestedRBCS -|> Seq.item 1 -(***include-it***) diff --git a/docs/core/biocollections.ipynb b/docs/core/biocollections.ipynb new file mode 100644 index 00000000..c6ae8aaf --- /dev/null +++ b/docs/core/biocollections.ipynb @@ -0,0 +1,766 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: Biocollections\n", + "category: BioFSharp Core\n", + "categoryindex: 1\n", + "index: 3\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Installed Packages
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(*** hide ***)\n", + "\n", + "(*** condition: prepare ***)\n", + "#r \"nuget: Plotly.NET, 4.2.0\"\n", + "#r \"nuget: FSharpAux, 2.0.0\"\n", + "#r \"nuget: FSharpAux.IO, 2.0.0\"\n", + "#r \"nuget: FSharp.Stats, 0.4.11\"\n", + "#r \"../../src/BioFSharp/bin/Release/netstandard2.0/BioFSharp.dll\"\n", + "#r \"../../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll\"\n", + "#r \"../../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll\"\n", + "#r \"../../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll\"\n", + "#r \"../../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll\"\n", + "\n", + "// in the documentation, we have to register formatters manually because we cannot load the extension as nuget package to trigger automatic registration\n", + "#r \"../../src/BioFSharp.Interactive/bin/Release/net6.0/BioFSharp.Interactive.dll\"\n", + "BioFSharp.Interactive.Formatters.registerAll()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BioCollections\n", + "\n", + "*Summary:* This example shows how to use collections of biological items in BioFSharp\n", + "\n", + "Analogous to the built-in collections, BioFSharp provides BioSeq, BioList and BioArray for individual collection specific optimized operations. 
\n", + "The easiest way to create them is to use the `ofBioItemString` functions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [], + "source": [ + "open BioFSharp\n", + "\n", + "let s1 = \"PEPTIDE\" |> BioSeq.ofAminoAcidString \n", + "let s2 = \"PEPTIDE\" |> BioList.ofAminoAcidString \n", + "let s3 = \"TAGCAT\" |> BioArray.ofNucleotideString " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
(Microsoft.FSharp.Collections.IEnumerator+mkSeq@177[BioFSharp.AminoAcids+AminoAcid], [Pro; Glu; Pro; ... ], BioFSharp.Nucleotides+Nucleotide[])
Item1
\r\n",
+       "         1  PEPTIDE\r\n",
+       "
Item2
\r\n",
+       "         1  PEPTIDE\r\n",
+       "
Item3
\r\n",
+       "         1  TAGCAT\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "s1, s2, s3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nucleotides\n", + "\n", + "![Nucleotides1](../img/Nucleotides.svg)\n", + "\n", + "**Figure 1: Selection of covered nucleotide operations** (A) Biological principle. (B) Workflow with `BioSeq`. (C) Other covered functionalities.\n", + "\n", + "Let's imagine you have a given gene sequence and want to find out what the according protein might look like." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  ATGGCTAGAT CGATCGATCG GCTAACGTAA\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let myGene = BioArray.ofNucleotideString \"ATGGCTAGATCGATCGATCGGCTAACGTAA\"\n", + "\n", + "myGene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yikes! Unfortunately we got the 5'-3' coding strand. For proper transcription we should get the complementary strand first:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  TACCGATCTA GCTAGCTAGC CGATTGCATT\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let myProperGene = BioArray.complement myGene\n", + "\n", + "myProperGene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's transcribe and translate it:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  MARSIDRLT*\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let myTranslatedGene = \n", + " myProperGene\n", + " |> BioArray.transcribeTemplateStrand\n", + " |> BioArray.translate 0\n", + "\n", + "myTranslatedGene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course, if your input sequence originates from the coding strand, you can directly transcribe it to mRNA since the \n", + "only difference between the coding strand and the mRNA is the replacement of 'T' by 'U' (Figure 1B)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  MARSIDRLT*\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let myTranslatedGeneFromCodingStrand = \n", + " myGene\n", + " |> BioArray.transcribeCodingStrand\n", + " |> BioArray.translate 0\n", + "\n", + "myTranslatedGeneFromCodingStrand" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other Nucleotide conversion operations are also covered:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  ATGTTCCGAT\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let mySmallGene = BioSeq.ofNucleotideString \"ATGTTCCGAT\"\n", + "\n", + "mySmallGene" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  TAGCCTTGTA\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "BioSeq.reverse mySmallGene " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  TACAAGGCTA\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "BioSeq.complement mySmallGene" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  ATCGGAACAT\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "BioSeq.reverseComplement mySmallGene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AminoAcids\n", + "\n", + "### Basics\n", + "Some functions which might be needed regularly are defined to work with nucleotides and amino acids:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  PEPTIDE\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let myPeptide = \"PEPTIDE\" |> BioSeq.ofAminoAcidString \n", + "\n", + "myPeptide" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "C34.00 H51.00 N7.00 O14.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "myPeptide \n", + "|> BioSeq.toFormula \n", + "|> Formula.toString " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
781.8103169999999
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "BioSeq.toAverageMass myPeptide " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Digestion\n", + "BioFSharp also comes equipped with a set of tools aimed at cutting apart amino acid sequences. To demonstrate the usage, we'll throw some `trypsin` at the small RuBisCO subunit of _Arabidopsis thaliana_: \n", + "In the first step, we define our input sequence and the protease we want to use.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\r\n",
+       "         1  MASSMLSSAT MVASPAQATM VAPFNGLKSS AAFPATRKAN NDITSITSNG GRVNCMQVWP\r\n",
+       "        61  PIGKKKFETL SYLPDLTDSE LAKEVDYLIR NKWIPCVEFE LEHGFVYREH GNSPGYYDGR\r\n",
+       "       121  YWTMWKLPLF GCTDSAQVLK EVEECKKEYP NAFIRIIGFD NTRQVQCISF IAYKPPSFT\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let RBCS = \n", + " \"\"\"MASSMLSSATMVASPAQATMVAPFNGLKSSAAFPATRKANNDITSITSNGGRVNCMQVWP\n", + " PIGKKKFETLSYLPDLTDSELAKEVDYLIRNKWIPCVEFELEHGFVYREHGNSPGYYDGR\n", + " YWTMWKLPLFGCTDSAQVLKEVEECKKEYPNAFIRIIGFDNTRQVQCISFIAYKPPSFT\"\"\" \n", + " |> BioArray.ofAminoAcidString\n", + "\n", + "RBCS" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{ ProteinID = 0\\n MissCleavages = 0\\n CleavageStart = 0\\n CleavageEnd = 27\\n PepSequence =\\n [Met; Ala; Ser; Ser; Met; Leu; Ser; Ser; Ala; Thr; Met; Val; Ala; Ser; Pro;\\n Ala; Gln; Ala; Thr; Met; Val; Ala; Pro; Phe; Asn; Gly; Leu; Lys] }
ProteinID
0
MissCleavages
0
CleavageStart
0
CleavageEnd
27
PepSequence
\r\n",
+       "         1  MASSMLSSAT MVASPAQATM VAPFNGLK\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let trypsin = Digestion.Table.getProteaseBy \"Trypsin\"\n", + "\n", + "let digestedRBCS = Digestion.BioArray.digest trypsin 0 RBCS \n", + "\n", + "digestedRBCS\n", + "|> Seq.head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In reality, proteases don't always completely cut the protein down. Instead, some sites stay intact and should be considered for in silico analysis. \n", + "This can easily be done with the `concernMissCleavages` function. It takes the minimum and maximum amount of misscleavages you want to have and also the digested protein. As a result you get all possible combinations arising from this information.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "dotnet_interactive": { + "language": "fsharp" + }, + "polyglot_notebook": { + "kernelName": "fsharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{ ProteinID = 0\\n MissCleavages = 0\\n CleavageStart = 28\\n CleavageEnd = 36\\n PepSequence = [Ser; Ser; Ala; Ala; Phe; Pro; Ala; Thr; Arg] }
ProteinID
0
MissCleavages
0
CleavageStart
28
CleavageEnd
36
PepSequence
\r\n",
+       "         1  SSAAFPATR\r\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "let digestedRBCS' = Digestion.BioArray.concernMissCleavages 0 2 digestedRBCS\n", + "\n", + "digestedRBCS\n", + "|> Seq.item 1" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".NET (C#)", + "language": "C#", + "name": ".net-csharp" + }, + "language_info": { + "name": "polyglot-notebook" + }, + "polyglot_notebook": { + "kernelInfo": { + "defaultKernelName": "csharp", + "items": [ + { + "aliases": [], + "name": "csharp" + }, + { + "aliases": [], + "languageName": "fsharp", + "name": "fsharp" + } + ] + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/core/bioitems.ipynb b/docs/core/bioitems.ipynb index f3813bfd..9bb63d57 100644 --- a/docs/core/bioitems.ipynb +++ b/docs/core/bioitems.ipynb @@ -37,7 +37,11 @@ "#r \"../../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll\"\n", "#r \"../../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll\"\n", "#r \"../../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll\"\n", - "#r \"../../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll\"" + "#r \"../../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll\"\n", + "\n", + "// in the documentation, we have to register formatters manually because we cannot load the extension as nuget package to trigger automatic registration\n", + "#r \"../../src/BioFSharp.Interactive/bin/Release/net6.0/BioFSharp.Interactive.dll\"\n", + "BioFSharp.Interactive.Formatters.registerAll()" ] }, { @@ -63,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -88,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -153,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": { 
"dotnet_interactive": { "language": "fsharp" @@ -218,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -237,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -250,38 +254,7 @@ { "data": { "text/html": [ - "
Ala
" + "A" ] }, "metadata": {}, @@ -294,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -307,38 +280,7 @@ { "data": { "text/html": [ - "
Gap
" + "-" ] }, "metadata": {}, @@ -358,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -395,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -426,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -457,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -492,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -527,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -566,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -612,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -669,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -735,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -795,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 18, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -852,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 19, "metadata": { "dotnet_interactive": { "language": "fsharp" diff --git a/docs/core/formula.ipynb b/docs/core/formula.ipynb index 426c43ad..83977ac6 100644 --- 
a/docs/core/formula.ipynb +++ b/docs/core/formula.ipynb @@ -36,7 +36,11 @@ "#r \"../../src/BioFSharp.IO/bin/Release/netstandard2.0/BioFSharp.IO.dll\"\n", "#r \"../../src/BioFSharp.BioContainers/bin/Release/netstandard2.0/BioFSharp.BioContainers.dll\"\n", "#r \"../../src/BioFSharp.ML/bin/Release/netstandard2.0/BioFSharp.ML.dll\"\n", - "#r \"../../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll\"" + "#r \"../../src/BioFSharp.Stats/bin/Release/netstandard2.0/BioFSharp.Stats.dll\"\n", + "\n", + "// in the documentation, we have to register formatters manually because we cannot load the extension as nuget package to trigger automatic registration\n", + "#r \"../../src/BioFSharp.Interactive/bin/Release/net6.0/BioFSharp.Interactive.dll\"\n", + "BioFSharp.Interactive.Formatters.registerAll()" ] }, { @@ -54,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -90,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -124,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": { "dotnet_interactive": { "language": "fsharp" @@ -155,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": { "dotnet_interactive": { "language": "fsharp" diff --git a/docs/rnaseq_normalization.ipynb b/docs/rnaseq_normalization.ipynb index 8b8a242a..45825171 100644 --- a/docs/rnaseq_normalization.ipynb +++ b/docs/rnaseq_normalization.ipynb @@ -46,10 +46,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## RPKM & TPM\n", + "## RNA-Seq data normalization\n", + "\n", "RNA-Seq is a high-throughput transcriptomics technique, that quantifies RNA molecules in a biological sample. 
RNA-Seq provides a view of the whole transcriptome and allows to look at gene-expression, post-transcriptonal modifications and look at different populations of RNA. When dealing with RNA-sequencing data, normalization is needed to correct technical biases. RPKM and TPM are two metrics that normalize for gene length and sequencing depth. RNA-Sequencing data needs to be normalized for gene length, because longer genes show greater read counts when expressed at the same level and for sequencing depth, as deeper sequencing depth produces more read counts per gene.\n", "\n", - "#### RPKM:\n", + "#### RPKM (Reads per kilobase million)\n", "\n", "RPKM (Reads per kilobase million) normalization at first determines a scaling factor, by calculating the sum of all reads in a sample and dividing that number by 1,000,000. That scaling factor is used to calculate RPM (Reads per million), by dividing the read counts for each sample with it, normalizing for sequencing depth. To get RPKM and normalize for gene length, RPM values are divided by genelength in kilobases. RPKM is calculated by\n", "$$\n", @@ -117,7 +118,7 @@ "metadata": {}, "source": [ "\n", - "#### TPM:\n", + "#### TPM (Transcripts per kilobase million)\n", "\n", "What differentiates TPM (Transcripts per kilobase million) from RPKM is the order of operations. To calculate TPM values, data gets normalized for gene length first. This is achieved by calculating RPK values (reads per kilobase), by dividing the read counts by genelength in kilobases. The sum of all RPK values is divided by 1,000,000, to get a scaling factor. Finally, TPM values are calculated by dividing the RPK values by the scaling factor, also normalizing for sequencing depth.\n", "By normalizing for gene length first, the sum of all samples is always 1,000,000, making comparisons of proportions easier. 
TPM is calculated by\n", diff --git a/src/BioFSharp.Interactive/BioFSharp.Interactive.fsproj b/src/BioFSharp.Interactive/BioFSharp.Interactive.fsproj index 41f8cc72..23e85fce 100644 --- a/src/BioFSharp.Interactive/BioFSharp.Interactive.fsproj +++ b/src/BioFSharp.Interactive/BioFSharp.Interactive.fsproj @@ -33,6 +33,7 @@ + diff --git a/src/BioFSharp.Interactive/Extension.fs b/src/BioFSharp.Interactive/Extension.fs index 05788fab..42c3941e 100644 --- a/src/BioFSharp.Interactive/Extension.fs +++ b/src/BioFSharp.Interactive/Extension.fs @@ -11,18 +11,5 @@ type FormatterKernelExtension() = interface IKernelExtension with member _.OnLoadAsync _ = - Formatter.Register( - Action<_, _> - (fun item (writer: IO.TextWriter) -> - let pretty = FSIPrinters.prettyPrintBioItem item - writer.Write(pretty)), - "text/html" - ) - Formatter.Register>( - Action<_, _> - (fun bioCollection (writer: IO.TextWriter) -> - let pretty = $"
{FSIPrinters.prettyPrintBioCollection bioCollection}
" - writer.Write(pretty)), - "text/html" - ) + Formatters.registerAll() Task.CompletedTask diff --git a/src/BioFSharp.Interactive/Formatters.fs b/src/BioFSharp.Interactive/Formatters.fs new file mode 100644 index 00000000..6da7a94f --- /dev/null +++ b/src/BioFSharp.Interactive/Formatters.fs @@ -0,0 +1,26 @@ +namespace BioFSharp.Interactive + +module Formatters = + + open System + open Microsoft.DotNet.Interactive.Formatting + open BioFSharp + open BioFSharp.IO + + let formatAminoAcid (item: AminoAcids.AminoAcid) (writer: IO.TextWriter) = + let pretty = FSIPrinters.prettyPrintBioItem item + writer.Write(pretty) + + let formatBioCollection (bioCollection: seq) (writer: IO.TextWriter) = + let pretty = $"
{FSIPrinters.prettyPrintBioCollection bioCollection}
" + writer.Write(pretty) + + let registerAll() = + Formatter.Register( + formatAminoAcid, + "text/html" + ) + Formatter.Register>( + formatBioCollection, + "text/html" + ) \ No newline at end of file