% Conference paper from the UCMedia 2009 revised selected papers.
% Proper nouns and acronyms are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{466943782e434384b34775522f16e341,
  author    = {Alparslan, Erdem and Bahsi, Hayretdin},
  title     = {Security level classification of confidential documents written in {Turkish}},
  booktitle = {User Centric Media - First International Conference, {UCMedia} 2009, Revised Selected Papers},
  series    = {Lecture Notes of the Institute for Computer Sciences, Social-Informatics and Telecommunications Engineering},
  pages     = {329--334},
  year      = {2010},
  doi       = {10.1007/978-3-642-12630-7_41},
  isbn      = {3642126294},
  language  = {English (US)},
  keywords  = {Data loss prevention, Document classification, Na{\"i}ve bayes, Security, Stemming, Support vector machine, TF-IDF, Turkish},
  abstract  = {This article introduces a security level classification methodology of confidential documents written in Turkish language. Internal documents of TUBITAK UEKAE, holding various security levels (unclassified-restricted-secret) were classified within a methodology using Support Vector Machines (SVM's) [1] and na{\"i}ve bayes classifiers [3][9]. To represent term-document relations a recommended metric ``TF-IDF'' [2] was chosen to construct a weight matrix. Turkic languages provide a very difficult natural language processing problem in comparison with English: ``Stemming''. A Turkish stemming tool ``zemberek'' was used to find out the features without suffix. At the end of the article some experimental results and success metrics are projected.},
  note      = {1st International Conference on User Centric Media, UCMedia 2009 ; Conference date: 09-12-2009 Through 11-12-2009},
}