1
0

2 Ревизии 7271e1a8bc ... 4e59b22625

Автор SHA1 Съобщение Дата
  u 4e59b22625 killed authors. fixed thesis преди 3 години
  u a77999b7da finished arith example figs, added stuff to results, working on literatur.bib: online has authors преди 3 години

BIN
latex/result/thesis.pdf


+ 555 - 0
latex/tex/backup_lit.bib

@@ -0,0 +1,555 @@
+% todo sort alphabetically 
+% separate online sources
+
+@Article{alok17,
+  author       = {A. Al-Okaily and B. Almarri and S. Yami and C. Huang},
+  date         = {2017-04-01},
+  journaltitle = {Journal of Computational Biology},
+  title        = {Toward a Better Compression for {DNA} Sequences Using Huffman Encoding},
+  doi          = {10.1089/cmb.2016.0151},
+  number       = {4},
+  pages        = {280--288},
+  volume       = {24},
+  publisher    = {Mary Ann Liebert Inc.},
+}
+
+@Article{Cock_2009,
+  author       = {P. Cock and C. Fields and N. Goto and M. Heuer and P. Rice},
+  date         = {2009-12},
+  journaltitle = {Nucleic Acids Research},
+  title        = {The Sanger {FASTQ} file format for sequences with quality scores, and the Solexa/Illumina {FASTQ} variants},
+  doi          = {10.1093/nar/gkp1137},
+  number       = {6},
+  pages        = {1767--1771},
+  volume       = {38},
+  publisher    = {Oxford University Press ({OUP})},
+}
+
+@Article{cells,
+  author       = {E. Bianconi and A. Piovesan and F. Facchin and A. Beraudi and R. Casadei and F. Frabetti and L. Vitale and M. Pelleri and S. Tassani and F. Piva and S. Perez-Amodio and P. Strippoli and S. Canaider},
+  date         = {2013-07},
+  journaltitle = {Annals of Human Biology},
+  title        = {An estimation of the number of cells in the human body},
+  doi          = {10.3109/03014460.2013.807878},
+  number       = {6},
+  pages        = {463--471},
+  volume       = {40},
+  publisher    = {Informa {UK} Limited},
+}
+
+@Article{dna_structure,
+  author       = {J. Watson and F. Crick},
+  date         = {1953-04},
+  journaltitle = {Nature},
+  title        = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
+  doi          = {10.1038/171737a0},
+  number       = {4356},
+  pages        = {737--738},
+  volume       = {171},
+  publisher    = {Springer Science and Business Media {LLC}},
+}
+
+@Article{iupac,
+  author       = {A. Johnson},
+  date         = {2010-03},
+  journaltitle = {Bioinformatics},
+  title        = {An extended {IUPAC} nomenclature code for polymorphic nucleic acids},
+  doi          = {10.1093/bioinformatics/btq098},
+  number       = {10},
+  pages        = {1386--1389},
+  volume       = {26},
+  publisher    = {Oxford University Press ({OUP})},
+}
+
+@TechReport{rfc1951,
+  author    = {P. Deutsch},
+  date      = {1996-05},
+  title     = {{DEFLATE} Compressed Data Format Specification version 1.3},
+  doi       = {10.17487/rfc1951},
+  url       = {https://www.rfc-editor.org/rfc/rfc1951},
+  publisher = {{RFC} Editor},
+}
+
+@Article{Shannon_1948,
+  author       = {C. E. Shannon},
+  date         = {1948-07},
+  journaltitle = {Bell System Technical Journal},
+  title        = {A Mathematical Theory of Communication},
+  doi          = {10.1002/j.1538-7305.1948.tb01338.x},
+  number       = {3},
+  pages        = {379--423},
+  volume       = {27},
+  publisher    = {Institute of Electrical and Electronics Engineers ({IEEE})},
+}
+
+@InProceedings{compr-visual,
+  author    = {S. Khuri and H. Hsu},
+  booktitle = {Proceedings of the 2000 {ACM} symposium on Applied computing - {SAC} {\textquotesingle}00},
+  date      = {2000},
+  title     = {Tools for visualizing text compression algorithms},
+  doi       = {10.1145/335603.335716},
+  publisher = {{ACM} Press},
+}
+
+@Article{lcqs,
+  author       = {J. Fu and B. Ke and S. Dong},
+  date         = {2020-03},
+  journaltitle = {{BMC} Bioinformatics},
+  title        = {{LCQS}: an efficient lossless compression tool of quality scores with random access functionality},
+  doi          = {10.1186/s12859-020-3428-7},
+  number       = {1},
+  volume       = {21},
+  publisher    = {Springer Science and Business Media {LLC}},
+}
+
+% Textbook reference. NOTE(review): 368 is presumably the total page count of
+% the book (not a cited range), so it is stored in pagetotal -- confirm.
+@Book{delfs_knebl,
+  author    = {H. Delfs and H. Knebl},
+  date      = {2007},
+  title     = {Introduction to Cryptography},
+  isbn      = {9783540492436},
+  pagetotal = {368},
+  publisher = {Springer},
+  subtitle  = {Principles and Applications (Information Security and Cryptography)},
+}
+
+@Article{cc14,
+  author       = {K. Sailunaz and M. Kotwal and M. Huda},
+  date         = {2014-03},
+  journaltitle = {International Journal of Computer Applications},
+  title        = {Data Compression Considering Text Files},
+  doi          = {10.5120/15765-4456},
+  number       = {11},
+  pages        = {27--32},
+  volume       = {90},
+  publisher    = {Foundation of Computer Science},
+}
+
+@Article{cnet13,
+  author       = {M. RajShivare and Y. Maravi and S. Sharma},
+  date         = {2013-10},
+  journaltitle = {International Journal of Computer Applications},
+  title        = {Analysis of Header Compression Techniques for Networks: A Review},
+  doi          = {10.5120/13856-1701},
+  number       = {5},
+  pages        = {13--20},
+  volume       = {80},
+  publisher    = {Foundation of Computer Science},
+}
+
+% RFC 1952 (gzip file format). Fields follow the layout of the rfc1951 entry.
+% The duplicated "P. Deutsch" was removed from the author list, and month/year
+% were dropped because date already carries 1996-05.
+% NOTE(review): the RFC header may list P. Deutsch as sole author -- confirm
+% whether the remaining names belong in author.
+@TechReport{rfcgzip,
+  author       = {P. Deutsch and J. Gailly and M. Adler and G. Randers-Pehrson},
+  date         = {1996-05},
+  institution  = {{RFC} Editor},
+  title        = {{GZIP} file format specification version 4.3},
+  doi          = {10.17487/rfc1952},
+  number       = {1952},
+  type         = {RFC},
+  url          = {https://www.rfc-editor.org/rfc/rfc1952},
+  howpublished = {Internet Requests for Comments},
+  issn         = {2070-1721},
+  publisher    = {{RFC} Editor},
+}
+
+% Huffman's original paper. Uses date/journaltitle like the other biblatex
+% entries in this file; the page range uses a double hyphen.
+% The added-at ... uri fields are leftovers from the import and are ignored
+% by standard styles.
+@Article{huf52,
+  author       = {D. A. Huffman},
+  date         = {1952-09},
+  journaltitle = {Proceedings of the Institute of Radio Engineers},
+  title        = {A Method for the Construction of Minimum-Redundancy Codes},
+  number       = {9},
+  pages        = {1098--1101},
+  volume       = {40},
+  added-at     = {2009-01-14T00:43:43.000+0100},
+  description  = {dret'd bibliography},
+  interhash    = {d00a180c1c2e7851560c2d51e0fd8f92},
+  intrahash    = {585b817b85d7278b868329672ddded96},
+  keywords     = {imported},
+  timestamp    = {2009-01-14T00:43:44.000+0100},
+  uri          = {http://compression.graphicon.ru/download/articles/huff/huffman_1952_minimum-redundancy-codes.pdf},
+}
+
+@Article{moffat20,
+  author       = {A. Moffat},
+  date         = {2020-07},
+  journaltitle = {{ACM} Computing Surveys},
+  title        = {Huffman Coding},
+  doi          = {10.1145/3342555},
+  number       = {4},
+  pages        = {1--35},
+  volume       = {52},
+  publisher    = {Association for Computing Machinery ({ACM})},
+}
+
+@Article{moffat_arith,
+  author       = {A. Moffat and R. Neal and I. Witten},
+  date         = {1998-07},
+  journaltitle = {{ACM} Transactions on Information Systems},
+  title        = {Arithmetic coding revisited},
+  doi          = {10.1145/290159.290162},
+  number       = {3},
+  pages        = {256--294},
+  volume       = {16},
+  publisher    = {Association for Computing Machinery ({ACM})},
+}
+
+@Article{ris76,
+  author       = {J. Rissanen},
+  date         = {1976-05},
+  journaltitle = {{IBM} Journal of Research and Development},
+  title        = {Generalized Kraft Inequality and Arithmetic Coding},
+  doi          = {10.1147/rd.203.0198},
+  number       = {3},
+  pages        = {198--203},
+  volume       = {20},
+  publisher    = {{IBM}},
+}
+
+% IEEE 754-2019 floating-point standard. No author field is given, so styles
+% fall back to the title for labels and sorting.
+@Article{ieee-float,
+  title   = {{IEEE} Standard for Floating-Point Arithmetic},
+  doi     = {10.1109/IEEESTD.2019.8766229},
+  pages   = {1--84},
+  journal = {{IEEE} Std 754-2019 (Revision of {IEEE} 754-2008)},
+  year    = {2019},
+}
+
+% Reference for Big-O notation.
+% NOTE(review): an article entry needs a journaltitle, which is missing here --
+% look it up via the DOI and add it.
+@Article{big-o,
+  author = {M. Firdous and A. Rouf},
+  title  = {The {Big-O} of Mathematics and Computer Science},
+  doi    = {10.26855/jamc.2022.03.001},
+  pages  = {1--3},
+  volume = {6},
+  month  = {01},
+  year   = {2022},
+}
+
+@Article{sam12,
+  author       = {P. Danecek and J. Bonfield and J. Liddle and J. Marshall and V. Ohan and M. Pollard and A. Whitwham and T. Keane and S. McCarthy and R. Davies and H. Li},
+  date         = {2021-01},
+  journaltitle = {{GigaScience}},
+  title        = {Twelve years of {SAMtools} and {BCFtools}},
+  doi          = {10.1093/gigascience/giab008},
+  number       = {2},
+  volume       = {10},
+  publisher    = {Oxford University Press ({OUP})},
+}
+
+@Article{cram-origin,
+  author       = {M. Fritz and R. Leinonen and G. Cochrane and E. Birney},
+  date         = {2011-01},
+  journaltitle = {Genome Research},
+  title        = {Efficient storage of high throughput {DNA} sequencing data using reference-based compression},
+  doi          = {10.1101/gr.114819.110},
+  number       = {5},
+  pages        = {734--740},
+  volume       = {21},
+  publisher    = {Cold Spring Harbor Laboratory},
+}
+
+@TechReport{rfcansi,
+    author =    {K. Simonsen},
+    series =    {Request for Comments},
+    number =    {1345}, 
+    howpublished =  {RFC 1345},
+    publisher = {RFC Editor},
+    doi =       {10.17487/RFC1345},
+    url =       {https://www.rfc-editor.org/info/rfc1345},
+    title =     {{Character Mnemonics and Character Sets}},
+    pagetotal = {103},
+    year =      {1992},
+    month =     {jun},
+}
+
+% Witten/Neal/Cleary on arithmetic coding. The page range uses an ASCII double
+% hyphen instead of a literal en dash. The legacy journal/month/year fields
+% were dropped because journaltitle and date already carry that information.
+@Article{witten87,
+  author       = {I. Witten and R. Neal and J. Cleary},
+  date         = {1987-06},
+  journaltitle = {Communications of the {ACM}},
+  title        = {Arithmetic coding for data compression},
+  doi          = {10.1145/214762.214771},
+  issn         = {0001-0782},
+  number       = {6},
+  pages        = {520--540},
+  url          = {https://doi.org/10.1145/214762.214771},
+  volume       = {30},
+  abstract     = {The state of the art in data compression is arithmetic coding, not the better-known Huffman method. Arithmetic coding gives greater compression, is faster for adaptive models, and clearly separates the model from the channel encoding.},
+  address      = {New York, NY, USA},
+  issue_date   = {June 1987},
+  numpages     = {21},
+  publisher    = {Association for Computing Machinery ({ACM})},
+}
+
+@InProceedings{geco,
+  author    = {D. Pratas and A. Pinho and P. Ferreira},
+  booktitle = {2016 Data Compression Conference ({DCC})},
+  date      = {2016-03},
+  title     = {Efficient Compression of Genomic Sequences},
+  doi       = {10.1109/DCC.2016.60},
+  publisher = {{IEEE}},
+}
+
+@Article{survey,
+  author       = {M. Hosseini and D. Pratas and A. Pinho},
+  date         = {2016-10},
+  journaltitle = {Information},
+  title        = {A Survey on Data Compression Methods for Biological Sequences},
+  doi          = {10.3390/info7040056},
+  number       = {4},
+  pages        = {56},
+  volume       = {7},
+  publisher    = {{MDPI} {AG}},
+}
+
+@Article{vertical,
+  author       = {K. Kredens and J. Martins and O. Dordal and M. Ferrandin and R. Herai and E. Scalabrin and B. {\'{A}}vila},
+  date         = {2020-05},
+  journaltitle = {{PLOS} {ONE}},
+  title        = {Vertical lossless genomic data compression tools for assembled genomes: A systematic literature review},
+  doi          = {10.1371/journal.pone.0232942},
+  editor       = {Rashid Mehmood},
+  number       = {5},
+  pages        = {e0232942},
+  volume       = {15},
+  publisher    = {Public Library of Science ({PLoS})},
+}
+
+
+% ISO/IEC 23092-1:2020 (MPEG-G). The title typo "MPGE-G" is fixed, and the
+% year field (2019) was removed because it contradicted date (2020-10) and the
+% ":2020" designation in number.
+@TechReport{isompeg,
+  author      = {{ISO Central Secretary}},
+  date        = {2020-10},
+  institution = {International Organization for Standardization},
+  title       = {{MPEG-G}},
+  language    = {en},
+  number      = {ISO/IEC 23092-1:2020},
+  type        = {Standard},
+  url         = {https://www.iso.org/standard/23092.html},
+}
+
+% Overview article on MPEG-G. Acronyms are brace-protected against sentence
+% casing, and the page range uses a double hyphen.
+@Article{9455132,
+  author  = {J. Voges and M. Hernaez and M. Mattavelli and J. Ostermann},
+  title   = {An Introduction to {MPEG-G}: The First Open {ISO/IEC} Standard for the Compression and Exchange of Genomic Sequencing Data},
+  doi     = {10.1109/JPROC.2021.3082027},
+  number  = {9},
+  pages   = {1607--1622},
+  volume  = {109},
+  journal = {Proceedings of the {IEEE}},
+  year    = {2021},
+}
+
+@Article{haplo,
+  author       = {W. Low and R. Tearle and R. Liu and S. Koren and A. Rhie and D. Bickhart and B. Rosen and Z. Kronenberg and S. Kingan and E. Tseng and F. Thibaud-Nissen and F. Martin and K. Billis and J. Ghurye and A. Hastie and J. Lee and A. Pang and M. Heaton and A. Phillippy and S. Hiendleder and T. Smith and J. Williams},
+  date         = {2020-04},
+  journaltitle = {Nature Communications},
+  title        = {Haplotype-resolved genomes provide insights into structural variation and gene content in Angus and Brahman cattle},
+  doi          = {10.1038/s41467-020-15848-y},
+  number       = {1},
+  volume       = {11},
+  publisher    = {Springer Science and Business Media {LLC}},
+}
+
+@Article{pet21,
+  author    = {S. Petoukhov},
+  date      = {2021-10},
+  title     = {Tensor Rules in the Stochastic Organization of Genomes and Genetic Stochastic Resonance in Algebraic Biology},
+  doi       = {10.20944/preprints202110.0093.v1},
+  publisher = {{MDPI} {AG}},
+}
+
+% ISO 8859-1:1998 (Latin alphabet No. 1). The committee name is a corporate
+% author, so it is double-braced to keep it from being parsed as a personal
+% name. The year field was dropped because date already carries 1998-04.
+% NOTE(review): volume = 1998 looks like a copy of the year -- confirm.
+@TechReport{iso-ascii,
+  author      = {{ISO/IEC JTC 1/SC 2 Coded character sets}},
+  date        = {1998-04},
+  institution = {International Organization for Standardization},
+  title       = {Information technology — 8-bit single-byte coded graphic character sets — Part 1: Latin alphabet No. 1},
+  type        = {Standard},
+  address     = {Geneva, CH},
+  key         = {ISO8859-1:1998},
+  volume      = {1998},
+}
+
+% Dictionary reference. NOTE(review): 1856 is presumably the total page count
+% of the book (not a cited range), so it is stored in pagetotal -- confirm.
+@Book{dict,
+  author    = {C. McIntosh},
+  date      = {2013},
+  title     = {Cambridge International Dictionary of English},
+  isbn      = {9781107035157},
+  pagetotal = {1856},
+  publisher = {Cambridge University Press},
+}
+
+@TechReport{rfc-udp,
+  author       = {J. Postel},
+  date         = {1980-08-28},
+  institution  = {RFC Editor},
+  title        = {User Datagram Protocol},
+  doi          = {10.17487/RFC0768},
+  number       = {768},
+  pagetotal    = {3},
+  url          = {https://www.rfc-editor.org/info/rfc768},
+  howpublished = {RFC 768},
+  month        = {aug},
+  publisher    = {RFC Editor},
+  series       = {Request for Comments},
+  year         = {1980},
+}
+
+% ISO/IEC 10646:2020 (UCS). The entry previously had two title fields: the
+% descriptive one is kept as title, and the designation moves to
+% number/shorttitle. The committee name is double-braced as a corporate author.
+@TechReport{isoutf,
+  author      = {{ISO/IEC JTC 1/SC 2 Coded character sets}},
+  date        = {2020-12},
+  institution = {International Organization for Standardization},
+  title       = {Information technology — Universal coded character set (UCS)},
+  shorttitle  = {ISO/IEC 10646:2020 UTF},
+  number      = {ISO/IEC 10646:2020},
+  type        = {Standard},
+  address     = {Geneva, CH},
+  key         = {ISO10646:2020},
+}
+
+% Original LZ77 paper. The page range uses a double hyphen, and the acronym in
+% the journal name is brace-protected.
+@Article{lz77,
+  author  = {Ziv, J. and Lempel, A.},
+  title   = {A universal algorithm for sequential data compression},
+  doi     = {10.1109/TIT.1977.1055714},
+  number  = {3},
+  pages   = {337--343},
+  volume  = {23},
+  journal = {{IEEE} Transactions on Information Theory},
+  year    = {1977},
+}
+
+@Article{wang_22,
+  author       = {S. Wang and C. Gao and Y. Zheng and L. Yi and J. Lu and X. Huang and J. Cai and P. Zhang and Y. Cui and A. Ke},
+  date         = {2022-02},
+  journaltitle = {Molecular Cancer},
+  title        = {Current applications and future perspective of {CRISPR}/Cas9 gene editing in cancer},
+  doi          = {10.1186/s12943-022-01518-8},
+  number       = {1},
+  volume       = {21},
+  publisher    = {Springer Science and Business Media {LLC}},
+}
+
+% Wheat GWAS study. The first author was entered as "Philomin, J.", i.e. with
+% family and given name swapped (the key ju_21 also points to the family name
+% Juliana). It now reads "P. Juliana", matching the "Initial Family" form used
+% for the other authors.
+@Article{ju_21,
+  author       = {P. Juliana and R. Singh and J. Poland and S. Shrestha and J. Huerta-Espino and V. Govindan and S. Mondal and L. Crespo-Herrera and U. Kumar and A. Joshi and T. Payne and P. Bhati and V. Tomar and F. Consolacion and J. Serna},
+  date         = {2021-03},
+  journaltitle = {Scientific Reports},
+  title        = {Elucidating the genetics of grain yield and stress-resilience in bread wheat using a large-scale genome-wide association mapping study with 55,568 lines},
+  doi          = {10.1038/s41598-021-84308-4},
+  number       = {1},
+  volume       = {11},
+  publisher    = {Springer Science and Business Media {LLC}},
+}
+
+@Article{mo_83,
+  author       = {A. Motulsky},
+  date         = {1983-01},
+  journaltitle = {Science},
+  title        = {Impact of Genetic Manipulation on Society and Medicine},
+  doi          = {10.1126/science.6336852},
+  number       = {4581},
+  pages        = {135--140},
+  volume       = {219},
+  publisher    = {American Association for the Advancement of Science ({AAAS})},
+}
+
+
+@Online{ftp-igsr,
+  date  = {2022-11-10},
+  title = {IGSR: The International Genome Sample Resource},
+  url   = {https://ftp.1000genomes.ebi.ac.uk},
+}
+
+@Online{ftp-ncbi,
+  date  = {2022-11-01},
+  title = {NCBI National Center for Biotechnology Information},
+  url   = {https://ftp.ncbi.nlm.nih.gov/genomes/},
+}
+
+@Online{ftp-ensembl,
+  date  = {2022-10-15},
+  title = {ENSEMBL Rapid Release},
+  url   = {https://ftp.ensembl.org},
+}
+
+@Book{cthreading,
+  author    = {Quinn, Michael J.},
+  title     = {Parallel Programming in C with MPI and OpenMP},
+  isbn      = {0071232656},
+  publisher = {McGraw-Hill Education Group},
+  year      = {2003},
+}
+
+@Online{geco-repo,
+  author = {Cobilab},
+  date   = {2022-11-19},
+  title  = {Repositories for the three versions of GeCo},
+  url    = {https://github.com/cobilab},
+}
+
+@Online{code-analysis,
+  author = {Ryan Dewhurst},
+  date   = {2022-11-20},
+  editor = {Kirsten S and Nick Bloor and Sarah Baso and James Bowie and Evgeniy Ryzhkov and Iberiam and Ann Campbell and Jonathan Marcil and Christina Schelin and Jie Wang and Fabian and Achim and Dirk Wetter},
+  title  = {Static Code Analysis},
+  url    = {https://owasp.org/www-community/controls/Static_Code_Analysis},
+}
+
+% GNU GPL version 3 license text. The official name is "GNU General Public
+% License". NOTE(review): no date/urldate is recorded -- add an access date.
+@Online{gpl,
+  title = {{GNU} General Public License},
+  url   = {http://www.gnu.org/licenses/gpl-3.0.html},
+}
+
+@Online{mitlic,
+  title = {MIT License},
+  url   = {https://spdx.org/licenses/MIT.html},
+}
+
+% SAM/BAM specification. The working group is a corporate author and is
+% double-braced so it is not split into given and family name parts.
+@Online{bam,
+  author  = {{The SAM/BAM Format Specification Working Group}},
+  date    = {2022-08-22},
+  title   = {Sequence Alignment/Map Format Specification},
+  url     = {https://github.com/samtools/hts-specs},
+  urldate = {2022-09-12},
+  version = {44b4167},
+}
+
+% UCSC Genome Browser. The institutional author contains a comma, which would
+% otherwise be parsed as "Last, First"; the extra braces keep it as one unit.
+@Online{ucsc,
+  author  = {{UCSC - University of California, Santa Cruz}},
+  date    = {2022-10-28},
+  title   = {{UCSC} Genome Browser},
+  url     = {https://genome.ucsc.edu/},
+  urldate = {2022-10-28},
+}
+
+@Online{ensembl,
+  author = {P. Flicek},
+  date   = {2022-10-24},
+  title  = {ENSEMBL Project},
+  url    = {http://www.ensembl.org/},
+}
+
+% Global Alliance for Genomics and Health. The trailing period that had
+% slipped into the URL is removed.
+% NOTE(review): the URL points to the samtools/hts-specs repository (same as
+% the bam entry) -- confirm this is the intended GA4GH source.
+@Online{ga4gh,
+  date  = {2022-10-10},
+  title = {Global Alliance for Genomics and Health},
+  url   = {https://github.com/samtools/hts-specs},
+}
+
+% BED format specification. The institutional author contains a comma, which
+% would otherwise be parsed as "Last, First"; the extra braces keep it as one
+% unit.
+@Online{bed,
+  author = {{Sanger Institute, Genome Research Limited}},
+  date   = {2022-10-20},
+  title  = {{BED} Browser Extensible Data},
+  url    = {https://samtools.github.io/hts-specs/BEDv1.pdf},
+}
+
+@Online{illufastq,
+  author = {Illumina},
+  date   = {2022-11-17},
+  title  = {Illumina FASTq file structure explained},
+  url    = {https://support.illumina.com/bulletins/2016/04/fastq-files-explained.html},
+}
+
+% twoBit header file from the UCSC kent source tree. The "Sata Cruz" typo is
+% fixed, and the institutional editor is double-braced as a corporate name.
+@Online{twobit,
+  date   = {2022-09-22},
+  editor = {{UCSC University of California Santa Cruz}},
+  title  = {TwoBit File Format},
+  url    = {https://genome-source.gi.ucsc.edu/gitlist/kent.git/raw/master/src/inc/twoBit.h},
+}
+@Comment{jabref-meta: databaseType:biblatex;}
+

BIN
latex/tex/bilder/k4/arith-scaled.png


BIN
latex/tex/bilder/k4/arith-unscaled.png


BIN
latex/tex/bilder/k6/pet-prob.png


+ 1 - 0
latex/tex/kapitel/abkuerzungen.tex

@@ -17,6 +17,7 @@
  \acro{FTP}{File Transfer Protocol}
   \acro{GA4GH}{Global Alliance for Genomics and Health}
 	\acro{GB}{Gigabyte}
+	\acro{MB}{Megabyte}
   \acro{GeCo}{Genome Compressor}
 	\acro{GPL}{GNU General Public License}
   \acro{IUPAC}{International Union of Pure and Applied Chemistry}

+ 48 - 17
latex/tex/kapitel/k6_results.tex

@@ -217,9 +217,22 @@ In both tables \ref{k6:recal-time} and \ref{k6:recal-size} the already identifie
 So far, this work went over formats for storing genomes, methods to compress files (in mentioned formats) and through tests where implementations of named algorithms compress several files and analyzed the results. The test results show that \acs{GeCo} provides a better compression ratio than Samtools and takes more time to run through. So in this testrun, implementations of arithmetic coding resulted in a better compression ratio than Samtools \acs{BAM} with the mix of Huffman coding and \acs{LZ77}, or Samtools custom compression format \acs{CRAM}. Comparing results in \autocite{survey}, supports this statement. This study used \acs{FASTA}/Multi-FASTA files from 71MB to 166MB and found that \acs{GeCo} had a variating compression ratio from 12.34 to 91.68 times smaller than the input reference and also resulted in long runtimes up to over 600 minutes \cite{survey}. Since this study focused on another goal than this work and therefore used different test variables and environments, the results can not be compared. But what can be taken from this, is that arithmetic coding, at least in \acs{GeCo} is in need of a runtime improvement.\\
 The actual mathematical proof of such an improvement, the planning of an implementation and the development of a proof of concept will be a rewarding but time- and resource-consuming project. Dealing with those tasks would go beyond the scope of this work. But in order to widen the foundation for these tasks, the rest of this work will consist of considerations and problem analysis, which should be thought about and dealt with to develop an improvement.
 
-S.V. Petoukhov described his findings, which are under ongoing research, about the distribution of nucleotides \cite{pet21}. With the probability of one nucleotide, in a sequence of sufficient length, information about the direct neighbours might be revealed. For example, with the probability of \texttt{C}, the probabilities for sets (n-plets) of any nucleotide \texttt{N}, including \texttt{C} might be determinable without counting them \cite{pet21}.\\
-%\%C ≈ Σ\%CN ≈ Σ\%NС ≈ Σ\%CNN ≈ Σ\%NCN ≈ Σ\%NNC ≈ Σ\%CNNN ≈ Σ\%NCNN ≈ Σ\%NNCN ≈ Σ\%NNNC\\
+S.V. Petoukhov described his prepublished findings, which are under ongoing research, about the distribution of nucleotides \cite{pet21}. With the probability of one nucleotide, in a sequence of sufficient length, estimations about the direct neighbours of this nucleotide might be revealed. This can be illustrated by the following formula \cite{pet21}:\\
 
+\texttt{\% C $\approx$ $\sum$\%CN $\approx$ $\sum$\%NC $\approx$ $\sum$\%CNN $\approx$ $\sum$\%NCN $\approx$ $\sum$\%NNC $\approx$ $\sum$\%CNNN $\approx$ $\sum$\%NCNN $\approx$ $\sum$\%NNCN $\approx$ $\sum$\%NNNC ...}
+
+The number of elements in each sum grows with increasing $n$ in the n-plet. To be precise, $4^n$ describes the growth of the number of combinations.
+For example, with the probability of \texttt{C}, the probabilities for sets (n-plets) of \texttt{N}, as a placeholder for any nucleotide of \texttt{A, C, G or T}, that include at least one \texttt{C} might be determinable without counting them \cite{pet21}. \texttt{$\sum$\%CN} means that the probability \texttt{\%C} yields an estimation of all occurrences \texttt{\%CC + \%CA + \%CG + \%CT}.\\
+Furthermore, he described that there might be a similarity between nucleotides. 
+
+\begin{figure}[H]
+  \centering
+  \includegraphics[width=15cm]{k6/pet-prob.png}
+  \caption{Probabilities for \texttt{A, C, G and T} in \texttt{Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly} \cite{pet21, ftp-ncbi}.}
+  \label{k6:pet-prob}
+\end{figure}
+
+The exemplary probabilities he displayed are reprinted in \ref{k6:pet-prob}. Notable are the similarities in the distribution of \%A and \%G as well as in \%C and \%T. They align up to the third digit after the decimal point. According to Petoukhov, this regularity is found in the genomes of humans, some animals, plants, bacteria and more \cite{pet21}.\\
 % begin optimization 
 Considering this and the measured results, an improvement in the arithmetic coding process, and therefore in the efficiency of \acs{GeCo}, would be a good start to close the great gap in the compression duration. Combined with a tool that is developed with today's standards, there is a possibility that even greater improvements could be achieved.\\
 % simple theoretical approach
@@ -240,7 +253,7 @@ The question for how many probabilities are needed, needs to be answered, to sta
 %One should keep in mind that this is only one of many approaches. Any prove of other approaches which reduces the probability determination, can be taken in instead. 
 
 % second bullet point (mutlithreading aspect=
-The Second point must be asked, because the improvement in counting only one nucleotide in comparison to counting three, would be to little to be called relevant. Especially if multithreading is a option. 
+The Second point must be asked, because the improvement in counting only one nucleotide in comparison to counting three, would be to little to be called relevant. Especially if multithreading is a option.  
 % fim: nicht ganz klar ->
 Since the static code analysis in \ref{k4:geco} revealed no multithreading, the analysis for improvements when splitting the workload onto several threads should be considered before working on an improvement based on Petoukhov's findings. This is relevant, because some improvements, like the one described above, will lose efficiency if only subsections of a genome are processed. A tool like OpenMP for multithreading C programs would possibly supply the required functionality to develop a proof of concept \cite{cthreading, pet21}.
 % theoretical improvement with pseudocode
@@ -262,8 +275,27 @@ done\\
 
 This loop will iterate over a whole sequence, counting each nucleotide. In line three, an inner loop can be found which iterates over the alphabet to determine which symbol's counter should be increased. Considering the findings described above, the inner loop can be left out, because there is no need to compare the read nucleotide against more than one symbol. The Big-O notation for this code, with any sequence with the length of n, would be decreased from O($n^2$) to O($n\cdot 1$) or simply O(n) \cite{big-o}, which is clearly an improvement in complexity and therefore also in runtime.\\
 The runtime for calculations of the other symbols probabilities must be considered as well and compared against the nested loop to be certain, that the overall runtime was improved.\\
+
+Should Petoukhov's rules and the regularity shown in \ref{k6:pet-prob} happen to be universal, three approaches could be used to determine the probabilities of a genome:
+\begin{itemize}
+	\item No counting of any nucleotide and using a fixed set of probabilities.
+	\item Counting only one nucleotide and determining the others by calculation. 
+	\item Counting either A and C or G and T and determining the other two by mirroring the results.
+\end{itemize}
+
+The calculation mentioned in the second bullet point would look like the following if, for example, the probability for \texttt{C} was determined by parsing the whole sequence:
+
+$\%T = \%C$\\
+$\%A+\%G = 100\%-(\%T+\%C)$\\
+$\%A = \frac{100\%-(\%T+\%C)}{2}$\\
+$\%G = \%A$\\
+
+The mapping, mentioned in the last point would be identical to the first and last declaration.\\
+% actual programming approach
+Working with probabilities, in fractions as well as in percent, would probably mean rounding values. To increase the accuracy, the actual count of the counted symbol could be used. For this to work, the overall number of symbols has to be determined for \texttt{\%A+\%G} to be calculable. Since counting all symbols while counting the one nucleotide would have an impact on the runtime, the value could be calculated instead. With $s$ being the size of the parsed sequence in bytes and $c$ being the bytes per character, $\frac{s}{c}$ yields the number of symbols in the sequence.\\
+The three approaches obviously differ in several categories: runtime, since each one parses more of the sequence than the previous one, and the degree of heuristics that is taken into account.
+
 % more realistic view on parsing todo need cites
-%In practice, obviously smarter ways are used, to determine probabilities. Like splitting the sequence in multiple parts and parse each subsequence asynchronous. 
 Getting back to the question of how multithreading would impact improvements: An implementation like the one described above could also work with multithreading, since the ratio of the difference between O($n^2$) and O(n) does not change when n is reduced. Multiple threads, processing parts of a sequence with the length of n, would also benefit, because any fraction of $n^2$ will always be greater than the corresponding fraction of n. These results can either be summed up for global probabilities or be used individually on each associated subsequence. Either way, the presented improvement approach should be applicable to both parsing methods.\\
 This leaves a list of problems, which needs to be regarded in the approach of developing a improvement.
 If there space for improvement in the parsing/counting process, what problems needs to be addressed:
@@ -275,23 +307,22 @@ If there space for improvement in the parsing/counting process, what problems ne
 
 % todo petoukhov just said T = AT+GT+CT+TT = %NT and %T = %TN
 % if %C = %T and %A = %G 
-% C = ?
+% C ist ungefaehr T => bytes(genome) - bytes(T) = 2*bytes(A) = 2*bytes(G) = bytes(A) + bytes(T)
 
 % bulletpoint 3
-A important question that needs answered would be: If Petoukhovs findings show that, through simliarities in the distribution of each nucleotide, one can lead to the aproximation of the other three. Entropy codings work with probabilities, how does that affect the coding mechanism?
+Another important question that needs to be answered is: if Petoukhov's findings show that, through similarities in the distribution of the nucleotides, the probability of one can lead to an approximation of the other three, and entropy codings work with probabilities, how does that affect the coding mechanism?
 With a equal probability for each nucleotide, entropy coding can not be treated as a whole. This is due to the fact, that Huffman coding makes use of differing probabilities. A equal distribution means every character will be encoded in the same length which would make the encoding process unnecessary. Arithmetic coding on the other hand is able to handle equal probabilities.
-The fact that there are obviously chains of repeating nucleotides in genomes. For example \texttt{File 2.2}, which contains this subsequence is found at line 90:
+There are obviously chains of repeating nucleotides in genomes, for example in \texttt{File 1.10}, which contains this subsequence:
 
-\texttt{AAAAAAAAAAAAAAAAAAAAAATAAATATTTTATTT} 
+\texttt{AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCC} 
 
-Without determining probabilities, one can see that the amount of \texttt{A}s outnumbers \texttt{T}s and neither \texttt{C} nor \texttt{G} are present. With the whole 1.2 gigabytes, the distribution will align more, but by cutting out a subsection, of relevant size, with unequal distributions will have an impact on the probabilities of the whole sequence. If a greater sequence would lead to a more equal distribution, this knowledge could be used to help determining distributions on subsequences of one with equaly distributed probabilities.
-% length cutting
+Without determining probabilities, one can see that the number of \texttt{C}s outnumbers \texttt{T}s and \texttt{A}s. With the whole 130\acs{MB}, the probability distribution will align more. The following values have been rounded down: 
 
-% todo erweitern um vergleiche zu survey work
-% how is data interpreted
-% why did the tools result in this, what can we learn
-% improvements
-% - goal: less time to compress
-% 	- approach: optimize probability determination
-% 	-> how?
+\texttt{A $\approx$ 0.291723, C $\approx$ 0.207406, G $\approx$ 0.208009, T $\approx$ 0.2928609}
 
+% NOTE(review): the values above pair A with T and C with G, whereas the text earlier in this section pairs \%A with \%G and \%C with \%T -- confirm which pairing is meant.
+Nevertheless, these 133258320 characters show the pattern described by S. Petoukhov.
+But cutting out a subsection of relevant size with unequal distributions will have an impact on the probabilities of the whole sequence. 
+If a greater sequence would lead to a more equal distribution, this knowledge could be used to help determine distributions on subsequences of a sequence with equally distributed probabilities.
+% length cutting
+% todo erweitern um vergleiche zu survey work
+Besides multithreading, there are other methods that could impact improvement approaches. To get a bit more specific: the usage of entropy coding in reference-free compression. These methods use structural properties, like repetitions or palindromes, to apply a dictionary coding algorithm like \acs{LZ77} on the sequence. The parts that do not show any sign of forward or backward repetition get compressed using arithmetic coding \cite{survey}. When this method is used, working with the probabilities of the whole genome is not purposeful. In the example subsequence out of \texttt{File 1.10}, no \texttt{G} is present. Compressing the subsequence with an additional interval of >0.2 for a symbol that would never get encoded would be a waste of resources.\\

+ 109 - 129
latex/tex/literatur.bib

@@ -1,8 +1,5 @@
-% todo sort alphabetically 
-% seperate online sources
-
 @Article{alok17,
-  author       = {A. Al-Okaily and B. Almarri and S. Yami and C. Huang},
+  author       = {Anas Al-Okaily and Badar Almarri and Sultan Al Yami and Chun-Hsi Huang},
   date         = {2017-04-01},
   journaltitle = {Journal of Computational Biology},
   title        = {Toward a Better Compression for {DNA} Sequences Using Huffman Encoding},
@@ -10,11 +7,11 @@
   number       = {4},
   pages        = {280--288},
   volume       = {24},
-  publisher    = {Mary Ann Liebert Inc.},
+  publisher    = {Mary Ann Liebert Inc},
 }
 
 @Article{Cock_2009,
-  author       = {P. Cock and C. Fields and N. Goto and M. Heuer and P. Rice},
+  author       = {Peter J. A. Cock and Christopher J. Fields and Naohisa Goto and Michael L. Heuer and Peter M. Rice},
   date         = {2009-12},
   journaltitle = {Nucleic Acids Research},
   title        = {The Sanger {FASTQ} file format for sequences with quality scores, and the Solexa/Illumina {FASTQ} variants},
@@ -26,7 +23,7 @@
 }
 
 @Article{cells,
-  author       = {E. Bianconi and A. Piovesan and F. Facchin and A. Beraudi and R. Casadei and F. Frabetti and L. Vitale and M. Pelleri and S. Tassani and F. Piva and S. Perez-Amodio and P. Strippoli and S. Canaider},
+  author       = {Eva Bianconi and Allison Piovesan and Federica Facchin and Alina Beraudi and Raffaella Casadei and Flavia Frabetti and Lorenza Vitale and Maria Chiara Pelleri and Simone Tassani and Francesco Piva and Soledad Perez-Amodio and Pierluigi Strippoli and Silvia Canaider},
   date         = {2013-07},
   journaltitle = {Annals of Human Biology},
   title        = {An estimation of the number of cells in the human body},
@@ -38,7 +35,7 @@
 }
 
 @Article{dna_structure,
-  author       = {J. Watson and F. Crick},
+  author       = {J. D. WATSON and F. H. C. CRICK},
   date         = {1953-04},
   journaltitle = {Nature},
   title        = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
@@ -50,7 +47,7 @@
 }
 
 @Article{iupac,
-  author       = {A. Johnson},
+  author       = {Andrew D. Johnson},
   date         = {2010-03},
   journaltitle = {Bioinformatics},
   title        = {An extended {IUPAC} nomenclature code for polymorphic nucleic acids},
@@ -62,7 +59,7 @@
 }
 
 @TechReport{rfc1951,
-  author    = {P. Deutsch},
+  author    = {L Peter Deutsch},
   date      = {1996-05},
   title     = {{DEFLATE} Compressed Data Format Specification version 1.3},
   doi       = {10.17487/rfc1951},
@@ -82,8 +79,9 @@
   publisher    = {Institute of Electrical and Electronics Engineers ({IEEE})},
 }
 
+
 @InProceedings{compr-visual,
-  author    = {S. Khuri and H. Hsu},
+  author    = {Sami Khuri and Hsiu-Chin Hsu},
   booktitle = {Proceedings of the 2000 {ACM} symposium on Applied computing - {SAC} {\textquotesingle}00},
   date      = {2000},
   title     = {Tools for visualizing text compression algorithms},
@@ -92,7 +90,7 @@
 }
 
 @Article{lcqs,
-  author       = {J. Fu and B. Ke and S. Dong},
+  author       = {Jiabing Fu and Bixin Ke and Shoubin Dong},
   date         = {2020-03},
   journaltitle = {{BMC} Bioinformatics},
   title        = {{LCQS}: an efficient lossless compression tool of quality scores with random access functionality},
@@ -103,7 +101,7 @@
 }
 
 @Book{delfs_knebl,
-  author    = {H. Delfs and H. Knebl},
+  author    = {Delfs, Hans and Knebl, Helmut},
   date      = {2007},
   title     = {Introduction to Cryptography},
   isbn      = {9783540492436},
@@ -113,7 +111,7 @@
 }
 
 @Article{cc14,
-  author       = {K. Sailunaz and M. Kotwal and M. Huda},
+  author       = {Kashfia Sailunaz and Mohammed Rokibul Alam Kotwal and Mohammad Nurul Huda},
   date         = {2014-03},
   journaltitle = {International Journal of Computer Applications},
   title        = {Data Compression Considering Text Files},
@@ -125,7 +123,7 @@
 }
 
 @Article{cnet13,
-  author       = {M. RajShivare and Y. Maravi and S. Sharma},
+  author       = {Manish RajShivare and Yogendra P. S. Maravi and Sanjeev Sharma},
   date         = {2013-10},
   journaltitle = {International Journal of Computer Applications},
   title        = {Analysis of Header Compression Techniques for Networks: A Review},
@@ -137,7 +135,7 @@
 }
 
 @TechReport{rfcgzip,
-  author       = {P. Deutsch and J. Gailly and M. Adler and P. Deutsch and G. Randers-Pehrson},
+  author       = {L. Peter Deutsch and Jean-Loup Gailly and Mark Adler and L. Peter Deutsch and Glenn Randers-Pehrson},
   date         = {1996-05},
   title        = {GZIP file format specification version 4.3},
   number       = {1952},
@@ -150,12 +148,13 @@
 }
 
 @Article{huf52,
-  author      = {D. A. Huffman},
+  author      = {Huffman, David A.},
   title       = {A Method for the Construction of Minimum-Redundancy Codes},
   number      = {9},
   pages       = {1098-1101},
   volume      = {40},
   added-at    = {2009-01-14T00:43:43.000+0100},
+  biburl      = {https://www.bibsonomy.org/bibtex/2585b817b85d7278b868329672ddded96/dret},
   description = {dret'd bibliography},
   interhash   = {d00a180c1c2e7851560c2d51e0fd8f92},
   intrahash   = {585b817b85d7278b868329672ddded96},
@@ -168,7 +167,7 @@
 }
 
 @Article{moffat20,
-  author       = {A. Moffat},
+  author       = {Alistair Moffat},
   date         = {2020-07},
   journaltitle = {{ACM} Computing Surveys},
   title        = {Huffman Coding},
@@ -180,7 +179,7 @@
 }
 
 @Article{moffat_arith,
-  author       = {A. Moffat and R. Neal and I. Witten},
+  author       = {Alistair Moffat and Radford M. Neal and Ian H. Witten},
   date         = {1998-07},
   journaltitle = {{ACM} Transactions on Information Systems},
   title        = {Arithmetic coding revisited},
@@ -192,7 +191,7 @@
 }
 
 @Article{ris76,
-  author       = {J. Rissanen},
+  author       = {J. J. Rissanen},
   date         = {1976-05},
   journaltitle = {{IBM} Journal of Research and Development},
   title        = {Generalized Kraft Inequality and Arithmetic Coding},
@@ -212,7 +211,7 @@
 }
 
 @Article{big-o,
-  author = {M. Firdous and A. Rouf},
+  author = {Mala, Firdous and Ali, Rouf},
   title  = {The Big-O of Mathematics and Computer Science},
   doi    = {10.26855/jamc.2022.03.001},
   pages  = {1-3},
@@ -222,7 +221,7 @@
 }
 
 @Article{sam12,
-  author       = {P. Danecek and J. Bonfield and J. Liddle and J. Marshall and V. Ohan and M. Pollard and A. Whitwham and T. Keane and S. McCarthy and R. Davies and H. Li},
+  author       = {Petr Danecek and James K Bonfield and Jennifer Liddle and John Marshall and Valeriu Ohan and Martin O Pollard and Andrew Whitwham and Thomas Keane and Shane A McCarthy and Robert M Davies and Heng Li},
   date         = {2021-01},
   journaltitle = {{GigaScience}},
   title        = {Twelve years of {SAMtools} and {BCFtools}},
@@ -233,7 +232,7 @@
 }
 
 @Article{cram-origin,
-  author       = {M. Fritz and R. Leinonen and G. Cochrane and E. Birney},
+  author       = {Markus Hsi-Yang Fritz and Rasko Leinonen and Guy Cochrane and Ewan Birney},
   date         = {2011-01},
   journaltitle = {Genome Research},
   title        = {Efficient storage of high throughput {DNA} sequencing data using reference-based compression},
@@ -244,22 +243,21 @@
   publisher    = {Cold Spring Harbor Laboratory},
 }
 
+
+
 @TechReport{rfcansi,
-    author =    {K. Simonsen},
-    series =    {Request for Comments},
-    number =    {1345}, 
-    howpublished =  {RFC 1345},
-    publisher = {RFC Editor},
-    doi =       {10.17487/RFC1345},
-    url =       {https://www.rfc-editor.org/info/rfc1345},
-    title =     {{Character Mnemonics and Character Sets}},
-    pagetotal = {103},
-    year =      {1992},
-    month =     {jun},
+  author       = {K. Simonsen},
+  title        = {Character Mnemonics and Character Sets},
+  number       = {1345},
+  type         = {RFC},
+  howpublished = {Internet Requests for Comments},
+  issn         = {2070-1721},
+  month        = jun,
+  year         = {1992},
 }
 
 @Article{witten87,
-  author       = {I. Witten and R. Neal and J. Cleary},
+  author       = {Ian H. Witten and Radford M. Neal and John G. Cleary},
   date         = {1987-06},
   journaltitle = {Communications of the {ACM}},
   title        = {Arithmetic coding for data compression},
@@ -280,7 +278,7 @@
 }
 
 @InProceedings{geco,
-  author    = {D. Pratas and A. Pinho and P. Ferreira},
+  author    = {Diogo Pratas and Armando J. Pinho and Paulo J. S. G. Ferreira},
   booktitle = {2016 Data Compression Conference ({DCC})},
   date      = {2016-03},
   title     = {Efficient Compression of Genomic Sequences},
@@ -289,7 +287,7 @@
 }
 
 @Article{survey,
-  author       = {M. Hosseini and D. Pratas and A. Pinho},
+  author       = {Morteza Hosseini and Diogo Pratas and Armando Pinho},
   date         = {2016-10},
   journaltitle = {Information},
   title        = {A Survey on Data Compression Methods for Biological Sequences},
@@ -301,7 +299,7 @@
 }
 
 @Article{vertical,
-  author       = {K. Kredens and J. Martins and O. Dordal and M. Ferrandin and R. Herai and E. Scalabrin and B. {\'{A}}vila},
+  author       = {Kelvin V. Kredens and Juliano V. Martins and Osmar B. Dordal and Mauri Ferrandin and Roberto H. Herai and Edson E. Scalabrin and Br{\'{a}}ulio C. {\'{A}}vila},
   date         = {2020-05},
   journaltitle = {{PLOS} {ONE}},
   title        = {Vertical lossless genomic data compression tools for assembled genomes: A systematic literature review},
@@ -312,8 +310,6 @@
   volume       = {15},
   publisher    = {Public Library of Science ({PLoS})},
 }
-
-
 @TechReport{isompeg,
   author      = {{ISO Central Secretary}},
   date        = {2020-10},
@@ -326,19 +322,16 @@
   year        = {2019},
 }
 
-@ARTICLE{9455132,
-  author={J. Voges and M. Hernaez and M. Mattavelli and J. Ostermann},
-  journal={Proceedings of the IEEE}, 
-  title={An Introduction to MPEG-G: The First Open ISO/IEC Standard for the Compression and Exchange of Genomic Sequencing Data}, 
-  year={2021},
-  volume={109},
-  number={9},
-  pages={1607-1622},
-  doi={10.1109/JPROC.2021.3082027}
+@Article{mpeg,
+  author    = {Claudio Albert and Tom Paridaens and Jan Voges and Daniel Naro and Junaid J. Ahmad and Massimo Ravasi and Daniele Renzi and Giorgio Zoia and Paolo Ribeca and Idoia Ochoa and Marco Mattavelli and Jaime Delgado and Mikel Hernaez},
+  date      = {2018-09},
+  title     = {An introduction to {MPEG}-G, the new {ISO} standard for genomic information representation},
+  doi       = {10.1101/426353},
+  publisher = {Cold Spring Harbor Laboratory},
 }
 
 @Article{haplo,
-  author       = {W. Low and R. Tearle and R. Liu and S. Koren and A. Rhie and D. Bickhart and B. Rosen and Z. Kronenberg and S. Kingan and E. Tseng and F. Thibaud-Nissen and F. Martin and K. Billis and J. Ghurye and A. Hastie and J. Lee and A. Pang and M. Heaton and A. Phillippy and S. Hiendleder and T. Smith and J. Williams},
+  author       = {Wai Yee Low and Rick Tearle and Ruijie Liu and Sergey Koren and Arang Rhie and Derek M. Bickhart and Benjamin D. Rosen and Zev N. Kronenberg and Sarah B. Kingan and Elizabeth Tseng and Fran{\c{c}}oise Thibaud-Nissen and Fergal J. Martin and Konstantinos Billis and Jay Ghurye and Alex R. Hastie and Joyce Lee and Andy W. C. Pang and Michael P. Heaton and Adam M. Phillippy and Stefan Hiendleder and Timothy P. L. Smith and John L. Williams},
   date         = {2020-04},
   journaltitle = {Nature Communications},
   title        = {Haplotype-resolved genomes provide insights into structural variation and gene content in Angus and Brahman cattle},
@@ -348,8 +341,18 @@
   publisher    = {Springer Science and Business Media {LLC}},
 }
 
+
+@Book{cthreading,
+  author    = {Quinn, Michael J.},
+  title     = {Parallel Programming in {C} with {MPI} and {OpenMP}},
+  isbn      = {0071232656},
+  publisher = {McGraw-Hill Education Group},
+  year      = {2003},
+}
+
+
 @Article{pet21,
-  author    = {S. Petoukhov},
+  author    = {Sergey V. Petoukhov},
   date      = {2021-10},
   title     = {Tensor Rules in the Stochastic Organization of Genomes and Genetic Stochastic Resonance in Algebraic Biology},
   doi       = {10.20944/preprints202110.0093.v1},
@@ -369,7 +372,7 @@
 }
 
 @Book{dict,
-  author    = {C. McIntosh},
+  author    = {McIntosh, Colin},
   date      = {2013},
   title     = {Cambridge International Dictionary of English},
   isbn      = {9781107035157},
@@ -387,21 +390,15 @@
   pagetotal    = {3},
   url          = {https://www.rfc-editor.org/info/rfc768},
   howpublished = {RFC 768},
-  month        = {aug},
+  month        = aug,
   publisher    = {RFC Editor},
   series       = {Request for Comments},
   year         = {1980},
 }
 
 @TechReport{isoutf,
-  author      = {ISO/IEC JTC 1/SC 2 Coded character sets},
+  author = {ISO},
   title  = {ISO/IEC 10646:2020 UTF},
-  date        = {2020-12},
-  institution = {International Organization for Standardization},
-  title       = {Information technology — Universal coded character set (UCS)},
-  type        = {Standard},
-  address     = {Geneva, CH},
-  key         = {ISO10646:2020},
 }
 
 @Article{lz77,
@@ -416,7 +413,7 @@
 }
 
 @Article{wang_22,
-  author       = {S. Wang and C. Gao and Y. Zheng and L. Yi and J. Lu and X. Huang and J. Cai and P. Zhang and Y. Cui and A. Ke},
+  author       = {Si-Wei Wang and Chao Gao and Yi-Min Zheng and Li Yi and Jia-Cheng Lu and Xiao-Yong Huang and Jia-Bin Cai and Peng-Fei Zhang and Yue-Hong Cui and Ai-Wu Ke},
   date         = {2022-02},
   journaltitle = {Molecular Cancer},
   title        = {Current applications and future perspective of {CRISPR}/Cas9 gene editing in cancer},
@@ -427,7 +424,7 @@
 }
 
 @Article{ju_21,
-  author       = {Philomin, J. and R. Singh and J. Poland and S. Shrestha and J. Huerta-Espino and V. Govindan and S. Mondal and L. Crespo-Herrera and U. Kumar and A. Joshi and T. Payne and P. Bhati and V. Tomar and F. Consolacion and J. Serna},
+  author       = {Philomin Juliana and Ravi Prakash Singh and Jesse Poland and Sandesh Shrestha and Julio Huerta-Espino and Velu Govindan and Suchismita Mondal and Leonardo Abdiel Crespo-Herrera and Uttam Kumar and Arun Kumar Joshi and Thomas Payne and Pradeep Kumar Bhati and Vipin Tomar and Franjel Consolacion and Jaime Amador Campos Serna},
   date         = {2021-03},
   journaltitle = {Scientific Reports},
   title        = {Elucidating the genetics of grain yield and stress-resilience in bread wheat using a large-scale genome-wide association mapping study with 55,568 lines},
@@ -438,7 +435,7 @@
 }
 
 @Article{mo_83,
-  author       = {A. Motulsky},
+  author       = {Arno G. Motulsky},
   date         = {1983-01},
   journaltitle = {Science},
   title        = {Impact of Genetic Manipulation on Society and Medicine},
@@ -449,107 +446,90 @@
   publisher    = {American Association for the Advancement of Science ({AAAS})},
 }
 
-
-@Online{ftp-igsr,
-  date  = {2022-11-10},
-  title = {IGSR: The International Genome Sample Resource},
-  url   = {https://ftp.1000genomes.ebi.ac.uk},
-}
-
-@Online{ftp-ncbi,
-  date  = {2022-11-01},
-  title = {NCBI National Center for Biotechnology Information},
-  url   = {https://ftp.ncbi.nlm.nih.gov/genomes/},
-}
-
-@Online{ftp-ensembl,
-  date  = {2022-10-15},
-  title = {ENSEMBL Rapid Release},
-  url   = {https://ftp.ensembl.org},
-}
-
-@Book{cthreading,
-  author    = {Quinn, Michael J.},
-  title     = {Parallel Programming in C with MPI and OpenMP},
-  isbn      = {0071232656},
-  publisher = {McGraw-Hill Education Group},
-  year      = {2003},
-}
-
-@Online{geco-repo,
-  author = {Cobilab},
-  date   = {2022-11-19},
-  title  = {Repositories for the three versions of GeCo},
-  url    = {https://github.com/cobilab},
-}
-
-@Online{code-analysis,
-  author = {Ryan Dewhurst},
-  date   = {2022-11-20},
-  editor = {Kirsten S and Nick Bloor and Sarah Baso and James Bowie and Evgeniy Ryzhkov and Iberiam and Ann Campbell and Jonathan Marcil and Christina Schelin and Jie Wang and Fabian and Achim and Dirk Wetter},
-  title  = {Static Code Analysis},
-  url    = {https://owasp.org/www-community/controls/Static_Code_Analysis},
-}
-
-@Online{gpl,
-  title = {GNU Public License},
-  url   = {http://www.gnu.org/licenses/gpl-3.0.html},
-}
-
-@Online{mitlic,
-  title = {MIT License},
-  url   = {https://spdx.org/licenses/MIT.html},
-}
-
 @Online{bam,
-  author  = {The SAM/BAM Format Specification Working Group},
-  date    = {2022-08-22},
   title   = {Sequence Alignment/Map Format Specification},
   url     = {https://github.com/samtools/hts-specs},
   urldate = {2022-09-12},
-  version = {44b4167},
 }
 
 @Online{ucsc,
-  author  = {UCSC - University of California, Santa Cruz},
-  date    = {2022-10-28},
   title   = {UCSC Genome Browser},
   url     = {https://genome.ucsc.edu/},
   urldate = {2022-10-28},
 }
 
 @Online{ensembl,
-  author = {P. Flicek},
-  date   = {2022-10-24},
   title  = {ENSEMBL Project},
   url    = {http://www.ensembl.org/},
+  urldate   = {2022-10-24},
 }
 
 @Online{ga4gh,
-  date  = {2022-10-10},
   title = {Global Alliance for Genomics and Health},
   url   = {https://github.com/samtools/hts-specs},
+	urldate = {2022-10-04},
 }
 
 @Online{bed,
-  author = {Sanger Institute, Genome Research Limited},
-  date   = {2022-10-20},
   title  = {BED Browser Extensible Data},
   url    = {https://samtools.github.io/hts-specs/BEDv1.pdf},
+  urldate   = {2022-10-20},
 }
 
 @Online{illufastq,
-  author = {Illumina},
-  date   = {2022-11-17},
   title  = {Illumina FASTq file structure explained},
   url    = {https://support.illumina.com/bulletins/2016/04/fastq-files-explained.html},
+  urldate   = {2022-11-17},
 }
 
 @Online{twobit,
-  date   = {2022-09-22},
   editor = {{UCSC University of California Santa Cruz}},
   title  = {TwoBit File Format},
   url    = {https://genome-source.gi.ucsc.edu/gitlist/kent.git/raw/master/src/inc/twoBit.h},
+  urldate   = {2022-09-22},
+}
+
+@Online{ftp-igsr,
+  title = {IGSR: The International Genome Sample Resource},
+  url   = {https://ftp.1000genomes.ebi.ac.uk},
+  urldate  = {2022-11-10},
+}
+
+@Online{ftp-ncbi,
+  title = {NCBI National Center for Biotechnology Information},
+  url   = {https://ftp.ncbi.nlm.nih.gov/genomes/},
+  urldate  = {2022-11-01},
 }
-@Comment{jabref-meta: databaseType:biblatex;}
 
+@Online{ftp-ensembl,
+  title = {ENSEMBL Rapid Release},
+  url   = {https://ftp.ensembl.org},
+  urldate  = {2022-10-15},
+}
+
+@Online{geco-repo,
+  title  = {Repositories for the three versions of GeCo},
+  url    = {https://github.com/cobilab},
+  urldate   = {2022-11-19},
+}
+
+@Online{code-analysis,
+  editor = {Kirsten, S and Nick Bloor and Sarah Baso and James Bowie and Evgeniy Ryzhkov and Iberiam and Ann Campbell and Jonathan Marcil and Christina Schelin and Jie Wang and Fabian and Achim and Dirk Wetter},
+  title  = {Static Code Analysis},
+  url    = {https://owasp.org/www-community/controls/Static_Code_Analysis},
+  urldate   = {2022-11-20},
+}
+
+@Online{gpl,
+  title   = {{GNU} General Public License, Version 3},
+  url     = {https://www.gnu.org/licenses/gpl-3.0.html},
+  urldate = {2022-11-20},
+}
+
+@Online{mitlic,
+  url     = {https://spdx.org/licenses/MIT.html},
+  urldate = {2022-11-23},
+  title   = {MIT License},
+}
+
+@Comment{jabref-meta: databaseType:biblatex;}

+ 10 - 2
latex/tex/preambel.tex

@@ -76,14 +76,22 @@
                                   %      inline: Zitat in Klammern (\parancite)
                                   %      footnote: Zitat in Fußnoten (\footcite)
                                   %      plain: Zitat direkt ohne Klammern (\cite)
-  style=ieee,		              % Legt den Stil für die Zitate fest
+  style=ieee,		              		% Legt den Stil für die Zitate fest
                                   %      ieee: Zitate als Zahlen [1]
                                   %      alphabetic: Zitate als Kürzel und Jahr [Ein05]
                                   %      authoryear: Zitate Author und Jahr [Einstein (1905)]
   hyperref=true,                  % Hyperlinks für Zitate
-  firstinits=false                % Vornamen abkürzen (Maier, M. anstatt Maier, Markus)?
+  firstinits=true, 								% Vornamen abkürzen (Maier, M. anstatt Maier, Markus)?
                                   %      true: abkürzen
                                   %      false: nicht abkürzen
+	sorting=anyt										% Legt Reihenfolge fest, in der Zitate angezeigt werden
+																	% nty:		name, title, year
+																	% nyt:		name, year, title
+																	% nyvt:		name, year, volume, title
+																	% anyt:		alphabetic label, name, year, title
+																	% anyvt:	alphabetic label, name, year, volume, title
+																	% ydnt:		sort by year (descending), name, title
+																	% none:		entries are processed in citation order 
 ]{biblatex}                       % Literaturverwaltung mit BibLaTeX
 \usepackage{rotating}             % Seiten drehen
 \usepackage{harveyballs}          % Harveyballs

+ 1 - 0
latex/tex/thesis.tex

@@ -183,6 +183,7 @@
 \begin{flushleft}
 \let\clearpage\relax % Fix für leere Seiten (issue #25)
 \printbibliography
+\printbibliography[type=online, sorting=nud, title={Online Sources}] % NOTE(review): "nud" is not one of the sorting schemes listed in preambel.tex -- confirm it is declared elsewhere
 \end{flushleft}
 \endgroup