vik-174: added min string length to language detection

2025-07-19 19:14:55 +00:00 · 2013-09-13 17:12:50 -04:00 · 2013-09-13 17:12:50 -04:00 · 6b7df1ae47
commit 6b7df1ae47
parent 47ce10ad90
2 changed files with 52 additions and 45 deletions
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java
@ -13,8 +13,9 @@ import org.sleuthkit.datamodel.AbstractFile;
 public interface TextLanguageIdentifier {
    /**
-     * attempts to identify the language of the given String and add it to the black board for the given {@code AbstractFile}
+     * attempts to identify the language of the given String and add it to the
-     * as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact.
+     * black board for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
     * attribute on a TSK_GEN_INFO artifact.
     *
     * @param extracted the String whose language is to be identified
     * @param sourceFile the AbstractFile the string is extracted from.
--- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java
@ -12,7 +12,8 @@ import org.sleuthkit.datamodel.BlackboardAttribute;
 import org.sleuthkit.datamodel.TskCoreException;
 /**
- *TextLanguageIdentifier implementation based on a wrapped Tike LanguageIdentifier
+ * TextLanguageIdentifier implementation based on a wrapped Tika
 * LanguageIdentifier
 *
 *
 * @author jmillman
@ -20,10 +21,11 @@ import org.sleuthkit.datamodel.TskCoreException;
 public class TikaLanguageIdentifier implements TextLanguageIdentifier {
    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
    private static final int MIN_STRING_LENGTH = 1000;
    @Override
    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
-
+        if (extracted.length() > MIN_STRING_LENGTH) {
            org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
            logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
@ -46,7 +48,11 @@ public class TikaLanguageIdentifier implements TextLanguageIdentifier {
-              /*  //attempt to verify that artifact with attribute was created
+
        }else{
            logger.info("extracted text too short, skipping language detection on " + sourceFile.getName());
        }
    } /*  //attempt to verify that artifact with attribute was created
     ArrayList<BlackboardArtifact> arts;
@ -66,5 +72,5 @@ public class TikaLanguageIdentifier implements TextLanguageIdentifier {
     } catch (TskCoreException ex) {
     Exceptions.printStackTrace(ex);
     }*/
-    }
+
 }