u 3 gadi atpakaļ
vecāks
revīzija
d332cf8247

BIN
latex/result/thesis.pdf


BIN
latex/tex/bilder/online/cobilab.png


BIN
latex/tex/bilder/online/ensembl.png


BIN
latex/tex/bilder/online/ftp-1000genomes.png


BIN
latex/tex/bilder/online/ftp-ensembl.png


BIN
latex/tex/bilder/online/ftp-ncbi.png


BIN
latex/tex/bilder/online/gpl.png


BIN
latex/tex/bilder/online/mit-license.png


BIN
latex/tex/bilder/online/samtools_spec.png


BIN
latex/tex/bilder/online/ucsc.png


+ 1 - 1
latex/tex/docinfo.tex

@@ -17,7 +17,7 @@
 \newcommand{\hsmadatum}{01.12.22}      % Datum der Abgabe
 \newcommand{\hsmajahr}{2022}             % Jahr der Abgabe
 %\newcommand{\hsmafirma}{Paukenschlag GmbH, Mannheim} % Firma bei der die Arbeit durchgeführt wurde
-\newcommand{\hsmabetreuer}{Prof. Dr. Elena Fimmel, Hochschule Mannheim} % Betreuer an der Hochschule
+\newcommand{\hsmabetreuer}{Prof. Dr. Elena Fimmel} % Betreuer an der Hochschule
 \newcommand{\hsmazweitkorrektor}{Prof. Dr. Markus Gumbel}   % Betreuer im Unternehmen oder Zweitkorrektor
 \newcommand{\hsmafakultaet}{I}    % I für Informatik oder E, S, B, D, M, N, W, V
 \newcommand{\hsmastudiengang}{IB} % IB IMB UIB CSB IM MTB (weitere siehe titleblatt.tex)

+ 21 - 21
latex/tex/kapitel/a6_results.tex

@@ -53,27 +53,27 @@
      \textbf{ID.} & \textbf{Uncompressed Source File} & \textbf{\acs{GeCo}} & \textbf{Samtools \acs{BAM}}& \textbf{Samtools \acs{CRAM}} \\
     \midrule
 			%src, geco, bam and cram in byte
-			File 2.1& 253105752& 46364770& 62048289& 55769827\\
-			File 2.2& 246230144& 49938168& 65391181& 58026123\\
-			File 2.3& 201600541& 41117340& 53586949& 47707954\\
-			File 2.4& 193384854& 39248276& 51457814& 45564837\\
-			File 2.5& 184563953& 37133480& 48838053& 43655371\\
-			File 2.6& 173652802& 35355184& 46216304& 40980906\\
-			File 2.7& 162001796& 31813760& 42371043& 38417108\\
-			File 2.8& 147557670& 30104816& 39107538& 34926945\\
-			File 2.9& 140701352& 23932541& 32708272& 29459829\\
-			File 2.10& 136027438& 27411806& 35855955& 32238052\\
-			File 2.11& 137338124& 27408185& 35894133& 32529673\\
-			File 2.12& 135496623& 27231126& 35580843& 32166751\\
-			File 2.13& 116270459& 20696778& 26467775& 23568321\\
-			File 2.14& 108827838& 18676723& 24284901& 21887811\\
-			File 2.15& 103691101& 16804782& 22486646& 20493276\\
-			File 2.16& 91844042& 16005173& 21568790& 19895937\\
-			File 2.17& 84645123& 15877526& 21294270& 20177456\\
-			File 2.18& 81712897& 16344067& 20684650& 19310998\\
-			File 2.19& 59594634& 10488207& 14616042& 14251243\\
-			File 2.20& 65518294& 13074402& 16769658& 15510100\\
-			File 2.21& 47488540& 7900773& 10477999& 9708258\\
+			File 1.1& 253105752& 46364770& 62048289& 55769827\\
+			File 1.2& 246230144& 49938168& 65391181& 58026123\\
+			File 1.3& 201600541& 41117340& 53586949& 47707954\\
+			File 1.4& 193384854& 39248276& 51457814& 45564837\\
+			File 1.5& 184563953& 37133480& 48838053& 43655371\\
+			File 1.6& 173652802& 35355184& 46216304& 40980906\\
+			File 1.7& 162001796& 31813760& 42371043& 38417108\\
+			File 1.8& 147557670& 30104816& 39107538& 34926945\\
+			File 1.9& 140701352& 23932541& 32708272& 29459829\\
+			File 1.10& 136027438& 27411806& 35855955& 32238052\\
+			File 1.11& 137338124& 27408185& 35894133& 32529673\\
+			File 1.12& 135496623& 27231126& 35580843& 32166751\\
+			File 1.13& 116270459& 20696778& 26467775& 23568321\\
+			File 1.14& 108827838& 18676723& 24284901& 21887811\\
+			File 1.15& 103691101& 16804782& 22486646& 20493276\\
+			File 1.16& 91844042& 16005173& 21568790& 19895937\\
+			File 1.17& 84645123& 15877526& 21294270& 20177456\\
+			File 1.18& 81712897& 16344067& 20684650& 19310998\\
+			File 1.19& 59594634& 10488207& 14616042& 14251243\\
+			File 1.20& 65518294& 13074402& 16769658& 15510100\\
+			File 1.21& 47488540& 7900773& 10477999& 9708258\\
 			&&&&\\
 			File 2.1& 1246731616& 12414797& 78260121& 67130756\\
 			File 2.2& 1261766002& 12363734& 80895953& 69649632\\

+ 71 - 1
latex/tex/kapitel/a_online.tex

@@ -1 +1,71 @@
-\chapter{Visual Persistance of Used Online Sources}
+\chapter{Visual Persistence of Used Online Sources}
+The following images provide insight into the online sources referenced in this work:
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/cobilab.png}
+  \caption{View of Cobilabs - creators of \acs{GeCo} - github page \cite{geco-repo}}
+  \label{}
+\end{figure}
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/ensembl.png}
+  \caption{View of ensembl genome browser splash page \cite{ensembl}.}
+  \label{}
+\end{figure}
+
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/ftp-1000genomes.png}
+  \caption{View of the browser view for IGSRs \acs{FTP}-Server \cite{ftp-igsr}.}
+  \label{}
+\end{figure}
+
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/ftp-ensembl.png}
+  \caption{View of the browser view for Ensembls \acs{FTP}-Server \cite{ftp-ensembl}.}
+  \label{}
+\end{figure}
+
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/ftp-ncbi.png}
+  \caption{View of the browser view for NCBIs \acs{FTP}-Server \cite{ftp-ncbi}.}
+  \label{}
+\end{figure}
+
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/gpl.png}
+  \caption{View of the GNU Public License page \cite{gpl}.}
+  \label{}
+\end{figure}
+
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/mit-license.png}
+  \caption{View of MIT license description \cite{mitlic}.}
+  \label{}
+\end{figure}
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/samtools_spec.png}
+  \caption{View of Samtools sourcefiles and file format description \cite{bam}.}
+  \label{}
+\end{figure}
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{online/ucsc.png}
+  \caption{View of UCSC splash page \cite{ucsc}.}
+  \label{}
+\end{figure}
+

+ 2 - 2
latex/tex/kapitel/k1_introduction.tex

@@ -1,8 +1,8 @@
 \chapter{Introduction}
 % general information and intro
 %Understanding how things in our cosmos work, was and still is a pleasure, that the human being always wants to fulfill. 
-Understanding the biological code of living things-, is an alsways developing task which plays a significant part in multiple aspects of our lives. The results of research in this area provides knowledge that helps development in the medical sector, in agriculture and more \cite{ju_21, wang_22, mo_83}.
-Getting insights into this biological code is possible through storing and studying information, embedded in genonmes \cite{dna_structure}. Since life is complex, there is a lot of information, that requires a lot of memory space \cite{alok17, survey}.\\
+Understanding the biological code of living things-, is an alsways developing task which plays a significant role in multiple aspects of our lives. The results of research in this area provides knowledge that helps development in the medical sector, in agriculture and more \cite{ju_21, wang_22, mo_83}.
+Getting insights into the biological code is possible through storing and studying information, embedded in genonmes \cite{dna_structure}. Since life is complex, there is a lot of information, that requires a lot of memory space \cite{alok17, survey}.\\
 % ...Communication with other researches means sending huge chunks of data through cables or through waves over the air, which costs time and makes raw data vulnerable to erorrs.\\
 % compression values and goals
 %With compression algorithms and their implementation in tools, the problem of storing information got smaller.

+ 6 - 7
latex/tex/kapitel/k3_datatypes.tex

@@ -24,13 +24,12 @@
 
 \section{File Formats used to Store DNA}
 \label{chap:file formats}
-As described in previous chapters \ac{DNA} can be represented by a string with the buildingblocks A,T,G and C. Using a common file format for saving text would be impractical because the amount of characters or symbols in the used alphabet defines how many bits are used to store each single symbol.\\
-The \ac{ASCII} \cite{iso-ascii} table is a character set registered in 1975, and to this day it is still in use to encode texts digitally. For the purpose of communication larger character sets replaced \acs{ASCII}. It is still used in situations where storage is short.\\
-% grund dass ASCII abgelöst wurde -> zu wenig darstellungsmöglichkeiten. Pro heute -> weniger overhead pro character
-The buildingblocks of \acs{DNA} require a minimum of four letters, so at least two bits are needed. Storing a single \textit{A} with \acs{ASCII} encoding, requires 8 bit (\,excluding magic bytes and the bytes used to mark the \ac{EOF})\,. Since there are at least $2^8$ or 128 displayable symbols with \acs{ASCII} encoding, this leaves a great overhead of unused combinations.\\
+As described in previous chapters \ac{DNA} can be represented by a string with the buildingblocks \texttt{A,T,G and C}. Using a common file format for saving text would be impractical because the amount of characters or symbols in the used alphabet defines how many bits are used to store each single symbol.\\
+The \ac{ASCII} \cite{iso-ascii} table is a character set registered in 1975, and to this day it is still in use to encode texts digitally. To solve communication issues, larger character sets replaced \acs{ASCII} some fields. It is still used in situations where storage is short \cite{rfchttp}.\\
+The buildingblocks of \acs{DNA} require a minimum of four letters, so at least two bits are needed. Storing a single \textit{A} with \acs{ASCII} encoding, requires 8~bit. Since there are at least $2^8$ or 128 displayable symbols with \acs{ASCII} encoding, this leaves a great overhead of unused combinations.\\
 % cout out examples. Might be needed later or elsewhere
 % \texttt{00 -> A, 01 -> T, 10 -> G, 11 -> C}. 
-In most tools, more than four symbols are used. This is due to the complexity in sequencing \acs{DNA}. It is not 100\% preceice, so additional symbols are used to mark nucleotides that could not or could only partly get determined. Furthermore a so called quality score is used to indicate the certainty of correct sequencing for each nucleotide \cite{survey, Cock_2009}.\\
+In most tools, more than four symbols are used. This is due to the complexity in sequencing \acs{DNA}. This process is not 100\% preceice, so additional symbols are used to mark nucleotides that could not or could only partly get determined. Furthermore a so called quality score is used to indicate the certainty of correct sequencing for each nucleotide \cite{survey, Cock_2009}.\\
 More common everyday-usage text encodings like unicode require 16 bits per letter. So, settling with \acs{ASCII} has improvement capabilities and is, on the other side, more efficient than using bulkier alternatives like unicode.\\
 
 % differences between information that is store
@@ -41,7 +40,7 @@ Formats for storing uncompressed genomic data, can be sorted into several catego
 	\item Sequenced reads
 \end{itemize}
 The categories are listed in descending order, based on their complexity, considering their usecase and data structure. Starting with sequence variation, called haplotype which describes formats storing graph based structures that focus on analyzing variations in different genomes \cite{haplo, sam12}. 
-Sequenced reads focus on storing continuous protein chains from a sequenced genome \cite{survey}.
+Sequenced reads focus on storing continuous nucleotide chains from a sequenced genome \cite{survey}.
 Aligned data is somewhat simliar to sequenced reads with the difference that instead of a whole chain of genomes, overlapping subsequences are stored. This could be described as a rawer form of sequenced reads. This way aligned data stores additional information on how certain a specific part of a genome is read correctly \cite{survey, sam12}.
 The focus of this work is the compression of sequenced data but not the likelihood of how accurate the data might be. Therefore, only formats that are able to store sequenced reads will be worked with. Note that some algigned data formats are also able to store aligned reads, since latter is just a less informative representation of the first \cite{survey, sam12}.\\
 
@@ -58,7 +57,7 @@ Several people and groups have developed different file formats to store genomes
   \item{The format is open source. Otherwise, optimizations cannot be tested without buying the software and/or requesting permission to disassemble and reverse engineer the software or parts of it.}
 \end{itemize}
 
-Information on available formats was gathered through various Internet platforms \cite{ensembl, ucsc, ga4gh} and scientific papers \cite{survey, sam12, Cock_2009}. 
+Information on available formats was gathered through various Internet platforms \cite{ensembl, ucsc} and scientific papers \cite{survey, sam12, Cock_2009}. 
 Some common file formats are:\\
 
 \begin{itemize}

+ 0 - 1
latex/tex/kapitel/k4_algorithms.tex

@@ -21,7 +21,6 @@
 % entropie fim doku grundlagen2 
 % dna nucleotide zu einem kapitel -> structure of dna. auch kapitel wegstreichen (zu generisch)
 % file structure/format <-> datatypes. länger beschreiben: e.g. File formats to store dna
-% 3.2.1 raus
 
 \section{Compression Aproaches}
 The process of compressing data serves the goal to generate an output that is smaller than its input \cite{dict}.\\

+ 2 - 2
latex/tex/kapitel/k5_feasability.tex

@@ -30,8 +30,8 @@ Since improvements must be measured, defining a baseline which would need to be
 The goal of this is, to determine a baseline for efficiency and effectivity of state of the art tools, used to compress \ac{DNA}. This baseline is set by two important factors:
 
 \begin{itemize}
-  \item Efficiency: \textbf{Duration} the Process had run for
-  \item Effectivity: The difference in \textbf{Size} between input and compressed data
+  \item Efficiency: \textbf{duration} the Process had run for
+  \item Effectivity: The difference in \textbf{size} between input and compressed data
 \end{itemize}
 
 As a third point, the compliance that files were compressed losslessly should be verified. This is done by comparing the source file to a copy that got compressed and than decompressed again. If one of the two processes should operate lossy, a difference between the source file and the copy a difference in size should be recognizable. 

+ 14 - 16
latex/tex/kapitel/k6_results.tex

@@ -230,7 +230,7 @@ With the probability of \texttt{C}, the probabilities for sets (n-plets) of \tex
   \label{k6:pet-prob}
 \end{figure}
 
-The exemplaric probabilities Petoukhov displayed are reprinted in \ref{k6:pet-prob}. Noteable are the similarities in the distirbution of \%A and \%G as well as in \%C and \%T. They align until the third digit after the decimal point. According to him, this regularity is found in the genome of humans, some anmials, plants, bacteria and more \cite{pet21}.\\
+The exemplaric probabilities Petoukhov displayed are reprinted in \ref{k6:pet-prob}. Noteable are the similarities in the distirbution of \%A and \%T as well as in \%G and \%C. They align until the third digit after the decimal point. According to him, this regularity is found in the genome of humans, some anmials, plants, bacteria and more \cite{pet21}.\\
 % begin optimization 
 Considering this and the measured results, an improvement in the arithmetic coding process and therefore in \acs{GeCo}s efficiency, would be a good start to equalize the great gap in the compression duration. Combined with a more current tool it is possible that even greater improvements could be achived.\\
 % simple theoretical approach
@@ -252,7 +252,6 @@ The question for how many probabilities must be determined only gets answered by
 But how could a improvement look like, not considering possible difficulties multithreading would bring?
 To answer this, first a mechanism to determine a possible improvement must be determined. To compare parts of a programm and their complexity, the Big-O notation is used. Considering a single threaded loop with the purpose to count every nucleotide in a sequence, the process of counting can be split into several operations, defined by this pseudocode.\\
 
-%todo use GeCo arith function with bigO
 while (sequence not end)\\
 do\\
 \-\hspace{0.5cm} next\_nucleotide = read\_next\_nucleotide(sequence)\\
@@ -264,26 +263,26 @@ do\\
 \-\hspace{0.5cm} done\\
 done\\
 
-This loop will itterate over a whole sequence, counting each nucleotide. In line three, a inner loop can be found which itterates over the alphabet, to determine which symbol should be increased. Considering the findings, described above, the inner loop can be left out, because there is no need to compare the read nucleotide against more than one symbol. The Big-O notation for this code, with any sequence with the length of n, would be decreseased from O($n^2$) to O($n\cdot 1)$ or simply O(n) \cite{big-o}. Which is clearly an improvement in complexety and therfore decreade runtime.\\
+This loop will itterate over a whole sequence, counting each nucleotide. In line three, a inner loop can be found which itterates over the alphabet, to determine which symbol should be increased. Considering the findings, described above, the inner loop can be left out, because there is no need to compare the read nucleotide against more than one symbol. The Big-O notation for this code, with any sequence with the length of n and an alphabet length of four, would be decreseased from O($n\cdot 4$) to O($n\cdot 1)$ or simply O(n) \cite{big-o}. Which is clearly an improvement in complexety and therfore might result in a decreade runtime.\\
 The runtime for calculations of the other symbols probabilities must be considered as well and compared against the nested loop to be certain, that the overall runtime was improved.\\
 
 Should Petoukhovs rules, and the regularity shown in \ref{k6:pet-prob} happen to be universal, three approaches could be used to determine the probability of a genome:
 \begin{itemize}
 	\item No counting of any nucleotide and using a fixed set of probabilities.
 	\item Counting only one nucleotide and determining the others by calculation. 
-	\item Counting either A and C or G and T and determining the other two by mirroring the results.
+	\item Counting either A and T or G and C and determining the other two by mirroring the results.
 \end{itemize}
 
 The calculation mentioned in the second bulletpoint, would look like the following, if for example the probability for \texttt{C} got determined by parsing the whole sequence:
 
-$\%T = \%C$\\
-$\%A+\%G = \%100-(\%T+\%C)$\\
-$\%A = \frac{\%100-(\%T+\%C)}{2}$\\
-$\%G = \%A$\\
+$\%G \approx \%C$\\
+$\%A+\%T \approx \%100-(\%G+\%C)$\\
+$\%A \approx \frac{\%100-(\%G+\%C)}{2}$\\
+$\%T \approx \%A$\\
 
 The mapping, mentioned in the last point would consist of the first and last line of the example above.\\
 % actual programming approach
-Working with probabilities, in fractions as well as in percent, would probabily mean rounding values. To increase the accuracity, the actual value resulting form the counted symbol could be used. For this to work, the amount of overall symbols had to be determined, for \texttt{\%A+\%G} to be calculateable. Since counting all symbols during the process of the one needed nucleotide, could have an impact on the runtime, the full length could also be calculated. With s beeing the size of the parsed sequence in bytes and c beeing the bytes per character $\frac{s}{c}$ would result in the amount of symbols in the sequence.\\
+Working with probabilities, in fractions as well as in percent, would probabily mean rounding values. To increase the accuracity, the actual value resulting form the counted symbol could be used. For this to work, the amount of overall symbols had to be determined, for $\%A+\%T$ to be calculateable. Since counting all symbols during the process of the one needed nucleotide, could have an impact on the runtime, the full length could also be calculated. With s beeing the size of the parsed sequence in bytes and c beeing the bytes per character $\frac{s}{c}$ would result in the amount of symbols in the sequence.\\
 %tf? -> They obviously differ in several categories: runtime, since each is parsing more of the sequence, and the grade of heuristic which is taken into account.
 
 % more realistic view on parsing
@@ -297,18 +296,17 @@ But even though the whole chromosome might show a certain pattern, its subsequen
 
 Without determining probabilities, one can see that the amount of \texttt{C}s outnumbers \texttt{T}s and \texttt{A}s. With the whole 133258320 symbols 130~\acs{MB}, the probability distribution will align more. The following values have been roundet down: \texttt{A $\approx$ 0.291723, C $\approx$ 0.207406, G $\approx$ 0.208009, T $\approx$ 0.2928609}. The pattern described by S. Petoukhov is recognizable. But by cutting out a subsection, of relevant size, with unequal distributions will have an impact on the probabilities of the whole sequence. 
 If a greater sequence would lead to a more equal distribution, this knowledge could be used to help determining distributions on subsequences of one with equaly distributed probabilities.\\
-There are some rules that apply to any whole chromosom sequence as well as to subsequences referenced by \texttt{S}. With the knowledge about lenght \texttt{len(S)} and the frequency and position of one symbol e.g. \texttt{C} represented as \texttt{|C|}, rules about the enveloping sequence can be derived. The arithmetic operations on symbols $\cdot$ for consecutive repetitions and $+$ for the concatination are used. For x and y as the ammount of nucleotides before the first and after the last \texttt{C} applies:
+There are some rules that apply to any whole chromosom sequence as well as to subsequences referenced by \texttt{S}. With the knowledge about lenght \texttt{|S|} and the frequency and position of one symbol e.g. \texttt{C} represented as \texttt{|C|}, rules about the enveloping sequence can be derived. The arithmetic operations on symbols $\cdot$ for consecutive repetitions and $+$ for the concatination are used. For x and y as the ammount of nucleotides before the first and after the last \texttt{C} applies:
 
 \begin{itemize}
-	\item $\frac{len(S)}{x/y-1}\cdot (|C| -1)$ determines the ammount of $(x \cdot N) + C$ and $C + (y \cdot N)$ sequences $\in S$. 
-	\item The longest chain starting with \texttt{C} is $C + N \cdot (len(S) - x - 1)$.
-	\item The longest chain ending with \texttt{C} is $(len(S) - y -1) \cdot N + C$.
+	\item $\frac{|S|}{x/y-1}\cdot (|C| -1)$ determines the ammount of $(x \cdot N) + C$ and $C + (y \cdot N)$ sequences $\in S$. 
+	\item The longest chain starting with \texttt{C} is $C + N \cdot (|S| - x - 1)$.
+	\item The longest chain ending with \texttt{C} is $(|S| - y -1) \cdot N + C$.
 	\item There are $(|C| - 1)$ occurrences of $(x + 1) \cdot N + C$ and an equal ammount of $C + N \cdot (y + 1)$.
 \end{itemize}
 Those statements might seem trivial to some, but possibly help other to clarify the boundaries on Petoukhov's rules. Also, they represent the end of the thought process of this works last section.\\
 
 \mycomment{
-% todo erweitern um vergleiche zu survey work
 Besides multithreading, there are other methods that could impact improvement approaches. Like the ussage of entropy coding in reference free compression, in combination with other compression solutions. This methods use structural properties, like repetitions or palindromes to apply a dictionary coding algorithm like \acs{LZ77} on the sequence. The parts that do not show any sign of forward or backward repetition get compressed using airhtmetic coding \cite{survey}. When this method is used, working with the probabilities of the whole genome is not purposeful. In the example subsequence out of \texttt{File 1.10}, no \texttt{G} is present. Compressing the subsequence with a additional interval of >0.2 for a symbol that would never get encoded, would be a waste of resources.\\
 }
 \mycomment{
@@ -326,7 +324,7 @@ best case
 
 bad case
 - exact determination of all probabilities are not feasible
- -> using A$\approx$G$\approx$0.28 and T=C=0.22 to estimate the probability and gather additional information to aproximate the real distibution
+ -> using A$\approx$T$\approx$0.28 and G=C=0.22 to estimate the probability and gather additional information to aproximate the real distibution
 - petoukov was wrong about universality of his rules
  -> this still might work for a variety of genomes: all human chromosomes, mice, plants...
 }
@@ -339,7 +337,7 @@ Before resulting in a final conclusion, a quick summary of important points:
 The goal for this new optimization approach is clearly defined. Also a possible test environment and measurement techniques that would indicate a success have been tested, in this work as well as in cited works \cite{survey}. Considering how other improvements were implemented in the past shows that the way an approach like described above is feasible \cite{moffat_arith}. This, combined with the last point leads to assumption that there is a realistic chance to optimize entropy coding, specifically the arithmetic coding algorithm.\\
 This assumption will consolidate by viewing best- and worst-case szenarios that could result from further research. Two important future events are taken into consideration. One would be the theoretical prove of an working optimization approach and the other if Petoukhov's findings develop favorable:
 The best case would be described as optimization through exact determination of the whole probability distribution is possible and Petoukhov's findings prove that his rules are universal for genomes between living organisms. This would result in a faster compression in entropy coding. Depending on the dimension, either a tool that is implementing entropy coding only or a hybrid tool, with improved efficiency in its entropy coding algorithms would define a new \texttt{state of the art}.\\
-In a worst case szenario, the exact determination of probability distributions would not be possible. This would mean more research should be done in approximating probability distibutions. Additionally, how the use of $A\approx G \approx 0.2914$ and $C\approx T\approx 0.2086$ could provide efficiency improvements in reference-free compression of whole chromosomes and general improvements in the compression of a reference genome in reference-based compression solutions \cite{survey}.\\
+In a worst case szenario, the exact determination of probability distributions would not be possible. This would mean more research should be done in approximating probability distibutions. Additionally, how the use of $A\approx T \approx 0.2914$ and $G\approx C\approx 0.2086$ could provide efficiency improvements in reference-free compression of whole chromosomes and general improvements in the compression of a reference genome in reference-based compression solutions \cite{survey}.\\
 Also in this szenario Petoukov would be wrong about the universality of the defined rules, considering the exemplary caculation of probability determination of \texttt{File 1.10} a concern that his rules do not apply to any genomes or that he had a miscalculation is out of the way. This would limit the range of the impact an improvement would create. The combination of which genomes follow Petoukov's rules and a list of tools that specialize on the compression of those would set the new goal for an optimization approach.\\
 
 %From this perspective, how favorable research turns out does not determine if there will be an impact but just how far it will reach.

+ 85 - 70
latex/tex/literatur.bib

@@ -14,27 +14,44 @@
   year         = {1980},
 }
 
+@Book{delfs_knebl,
+  title     = "Introduction to cryptography",
+  author    = "Delfs, Hans and Knebl, Helmut",
+  publisher = "Springer",
+  series    = "Information Security and Cryptography",
+  date      =  {2007-03},
+	note			= {{ISBN:} 978-3-540-49243-6},
+  address   = "Berlin, Germany",
+  language  = "en"
+}
+
 @TechReport{rfcgzip,
   author       = {L. Peter Deutsch and Jean-Loup Gailly and Mark Adler and L. Peter Deutsch and Glenn Randers-Pehrson},
   date         = {1996-05},
-  title        = {GZIP file format specification version 4.3},
+  title        = {{GZIP} file format specification version 4.3},
+  doi          = {10.17487/rfc1952},
   number       = {1952},
   type         = {RFC},
+  url          = {https://www.rfc-editor.org/rfc/rfc1952},
   howpublished = {Internet Requests for Comments},
   issn         = {2070-1721},
   month        = {May},
-  publisher    = {RFC},
+  publisher    = {{RFC} Editor},
   year         = {1996},
 }
 
 @TechReport{rfcansi,
-  author       = {K. Simonsen and},
+  author       = {K. Simonsen},
+  date         = {1992-06},
   title        = {Character Mnemonics and Character Sets},
+  doi          = {10.17487/rfc1345},
   number       = {1345},
   type         = {RFC},
+  url          = {https://www.rfc-editor.org/rfc/rfc1345},
   howpublished = {Internet Requests for Comments},
   issn         = {2070-1721},
   month        = {June},
+  publisher    = {{RFC} Editor},
   year         = {1992},
 }
 
@@ -48,23 +65,6 @@
   year        = {2019},
 }
 
-@TechReport{iso-ascii,
-  author      = {ISO/IEC JTC 1/SC 2 Coded character sets},
-  date        = {1998-04},
-  institution = {International Organization for Standardization},
-  title       = {Information technology — 8-bit single-byte coded graphic character sets — Part 1: Latin alphabet No. 1},
-  type        = {Standard},
-  address     = {Geneva, CH},
-  key         = {ISO8859-1:1998},
-  volume      = {1998},
-  year        = {1998},
-}
-
-@TechReport{isoutf,
-  author = {ISO},
-  title  = {ISO/IEC 10646:2020 UTF},
-}
-
 @Article{ju_21,
   author       = {Philomin Juliana and Ravi Prakash Singh and Jesse Poland and Sandesh Shrestha and Julio Huerta-Espino and Velu Govindan and Suchismita Mondal and Leonardo Abdiel Crespo-Herrera and Uttam Kumar and Arun Kumar Joshi and Thomas Payne and Pradeep Kumar Bhati and Vipin Tomar and Franjel Consolacion and Jaime Amador Campos Serna},
   date         = {2021-03},
@@ -134,7 +134,7 @@
 }
 
 @Article{dna_structure,
-  author       = {J. D. WATSON and F. H. C. CRICK},
+  author       = {J. Watson and F. Crick},
   date         = {1953-04},
   journaltitle = {Nature},
   title        = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
@@ -199,18 +199,8 @@
   publisher    = {Springer Science and Business Media {LLC}},
 }
 
-@Book{delfs_knebl,
-  author    = {Delfs, Hans and Knebl, Helmut},
-  date      = {2007},
-  title     = {Introduction to Cryptography},
-  isbn      = {9783540492436},
-  pages     = {368},
-  publisher = {Springer},
-  subtitle  = {Principles and Applications (Information Security and Cryptography)},
-}
-
 @Article{cc14,
-  author       = {Kashfia Sailunaz and Mohammed Rokibul Alam Kotwal and Mohammad Nurul Huda},
+  author       = {Kashfia Sailunaz and Mohammed Kotwal and Mohammad Huda},
   date         = {2014-03},
   journaltitle = {International Journal of Computer Applications},
   title        = {Data Compression Considering Text Files},
@@ -234,22 +224,26 @@
 }
 
 @Article{huf52,
-  author      = {Huffman, David A.},
-  title       = {A Method for the Construction of Minimum-Redundancy Codes},
-  number      = {9},
-  pages       = {1098-1101},
-  volume      = {40},
-  added-at    = {2009-01-14T00:43:43.000+0100},
-  biburl      = {https://www.bibsonomy.org/bibtex/2585b817b85d7278b868329672ddded96/dret},
-  description = {dret'd bibliography},
-  interhash   = {d00a180c1c2e7851560c2d51e0fd8f92},
-  intrahash   = {585b817b85d7278b868329672ddded96},
-  journal     = {Proceedings of the Institute of Radio Engineers},
-  keywords    = {imported},
-  month       = {September},
-  timestamp   = {2009-01-14T00:43:44.000+0100},
-  uri         = {http://compression.graphicon.ru/download/articles/huff/huffman_1952_minimum-redundancy-codes.pdf},
-  year        = {1952},
+  author       = {David Huffman},
+  date         = {1952-09},
+  journaltitle = {Proceedings of the {IRE}},
+  title        = {A Method for the Construction of Minimum-Redundancy Codes},
+  doi          = {10.1109/jrproc.1952.273898},
+  number       = {9},
+  pages        = {1098--1101},
+  volume       = {40},
+  added-at     = {2009-01-14T00:43:43.000+0100},
+  biburl       = {https://www.bibsonomy.org/bibtex/2585b817b85d7278b868329672ddded96/dret},
+  description  = {dret'd bibliography},
+  interhash    = {d00a180c1c2e7851560c2d51e0fd8f92},
+  intrahash    = {585b817b85d7278b868329672ddded96},
+  journal      = {Proceedings of the Institute of Radio Engineers},
+  keywords     = {imported},
+  month        = {September},
+  publisher    = {Institute of Electrical and Electronics Engineers ({IEEE})},
+  timestamp    = {2009-01-14T00:43:44.000+0100},
+  uri          = {http://compression.graphicon.ru/download/articles/huff/huffman_1952_minimum-redundancy-codes.pdf},
+  year         = {1952},
 }
 
 @Article{moffat20,
@@ -406,11 +400,10 @@
   publisher    = {Springer Science and Business Media {LLC}},
 }
 
-
 @Book{cthreading,
   author    = {Quinn, Michael J.},
   title     = {Parallel Programming in C with MPI and OpenMP},
-  isbn      = {0071232656},
+  note			= {{ISBN:} 0071232656},
   publisher = {McGraw-Hill Education Group},
   year      = {2003},
 }
@@ -429,7 +422,7 @@
   author    = {McIntosh, Colin},
   date      = {2013},
   title     = {Cambridge International Dictionary of English},
-  isbn      = {9781107035157},
+  note 			= {{ISBN:} 9781107035157},
   pages     = {1856},
   publisher = {Cambridge University Press},
 }
@@ -448,89 +441,111 @@
 
 
 @Online{bam,
-  title   = {Sequence Alignment/Map Format Specification},
+  title   = {{Sequence Alignment/Map Format Specification}},
   url     = {https://github.com/samtools/hts-specs},
   urldate = {2022-09-12},
 }
 
 @Online{ucsc,
-  title   = {UCSC Genome Browser},
+  title   = {{UCSC University of California Santa Cruz - Genome Browser}},
   url     = {https://genome.ucsc.edu/},
   urldate = {2022-10-28},
 }
 
 @Online{ensembl,
-  title  = {ENSEMBL Project},
+  title  = {{The Ensembl Project}},
   url    = {http://www.ensembl.org/},
   urldate   = {2022-10-24},
 }
 
-@Online{ga4gh,
-  title = {Global Alliance for Genomics and Health},
-  url   = {https://github.com/samtools/hts-specs.},
-	urldate = {2022-10-04},
-}
-
 @Online{bed,
-  title  = {BED Browser Extensible Data},
+  title  = {{BED -- Browser Extensible Data}},
   url    = {https://samtools.github.io/hts-specs/BEDv1.pdf},
   urldate   = {2022-10-20},
 }
 
 @Online{illufastq,
-  title  = {Illumina FASTq file structure explained},
+  title  = {{Illumina FASTq file structure explained}},
   url    = {https://support.illumina.com/bulletins/2016/04/fastq-files-explained.html},
   urldate   = {2022-11-17},
 }
 
 @Online{twobit,
-  editor = {UCSC University of California Sata Cruz},
+  editor = {{UCSC -- University of California Santa Cruz}},
   title  = {TwoBit File Format},
   url    = {https://genome-source.gi.ucsc.edu/gitlist/kent.git/raw/master/src/inc/twoBit.h},
   urldate   = {2022-09-22},
 }
 
 @Online{ftp-igsr,
-  title = {IGSR: The International Genome Sample Resource},
+  title = {{IGSR -- The International Genome Sample Resource}},
   url   = {https://ftp.1000genomes.ebi.ac.uk},
   urldate  = {2022-11-10},
 }
 
 @Online{ftp-ncbi,
-  title = {NCBI National Center for Biotechnology Information},
+  title = {{NCBI -- National Center for Biotechnology Information}},
   url   = {https://ftp.ncbi.nlm.nih.gov/genomes/},
   urldate  = {2022-11-01},
 }
 
 @Online{ftp-ensembl,
-  title = {ENSEMBL Rapid Release},
+  title = {{Ensembl FTP-Server}},
   url   = {https://ftp.ensembl.org},
   urldate  = {2022-10-15},
 }
 
 @Online{geco-repo,
-  title  = {Repositories for the three versions of GeCo},
+  title  = {{Repositories for the three versions of GeCo}},
   url    = {https://github.com/cobilab},
   urldate   = {2022-11-19},
 }
 
 @Online{code-analysis,
   editor = {Kirsten, S and Nick Bloor and Sarah Baso and James Bowie and Evgeniy Ryzhkov and Iberiam and Ann Campbell and Jonathan Marcil and Christina Schelin and Jie Wang and Fabian and Achim and Dirk Wetter},
-  title  = {Static Code Analysis},
+  title  = {{Static Code Analysis definition}},
   url    = {https://owasp.org/www-community/controls/Static_Code_Analysis},
   urldate   = {2022-11-20},
 }
 
 @Online{gpl,
-  title = {GNU Public License},
+  title = {{GPL -- GNU General Public License description}},
   url   = {http://www.gnu.org/licenses/gpl-3.0.html},
 	urldate = {2022-11-20},
 }
 
 @Online{mitlic,
-  title = {MIT License},
+  title = {{MIT license description}},
   url   = {https://spdx.org/licenses/MIT.html},
 	urldate = {2022-11-23}
 }
 
+@TechReport{rfchttp,
+  author    = {R. Fielding and J. Gettys and J. Mogul and H. Frystyk and L. Masinter and P. Leach and T. Berners-Lee},
+  date      = {1999-06},
+  title     = {Hypertext Transfer Protocol -- {HTTP}/1.1},
+  doi       = {10.17487/rfc2616},
+  url       = {https://www.rfc-editor.org/rfc/rfc2616},
+  publisher = {{RFC} Editor},
+}
+
+@TechReport{iso-ascii,
+  author      = {{ISO/IEC} 8859-1:1998},
+  date        = {1998-04},
+  institution = {International Organization for Standardization, Geneva, Switzerland.},
+  title       = {Information technology — 8-bit single-byte coded graphic character sets — Part 1: Latin alphabet No. 1},
+  type        = {Standard},
+  url         = {https://www.iso.org/standard/28245.html},
+  year        = {1998},
+}
+
+@TechReport{isoutf,
+  author      = {{ISO/IEC} 10646:2020},
+  date        = {2020-12},
+  institution = {International Organization for Standardization, Geneva, Switzerland.},
+  title       = {Information technology — Universal coded character set {UCS}},
+  url         = {https://www.iso.org/standard/76835.html},
+  year        = {2020},
+}
+
 @Comment{jabref-meta: databaseType:biblatex;}

BIN
latex/tex/max-sign.png


+ 6 - 6
latex/tex/thesis.tex

@@ -172,10 +172,10 @@
 
 % Listingverzeichnis erzeugen. Wenn Sie keine Listings haben,
 % entfernen Sie einfach diesen Teil.
-\cleardoublepage
-\phantomsection
-\addcontentsline{toc}{chapter}{\hsmalistings}
-\lstlistoflistings
+%\cleardoublepage
+%\phantomsection
+%\addcontentsline{toc}{chapter}{\hsmalistings}
+%\lstlistoflistings
 
 % Literaturverzeichnis erzeugen
 \begingroup
@@ -183,7 +183,7 @@
 \begin{flushleft}
 \let\clearpage\relax % Fix für leere Seiten (issue #25)
 \printbibliography[nottype=online]
-\printbibliography[type=online, sorting=nud title=Online Sources]
+\printbibliography[type=online, title={Online Sources}]
 \end{flushleft}
 \endgroup
 
@@ -200,6 +200,6 @@
 \input{kapitel/a5_feasability}
 \input{kapitel/a6_results}
 \input{kapitel/a_online}
-%\input{kapitel/a0}
+\input{kapitel/a0}
 
 \end{document}

BIN
latex/tex/unterschrift.png