From b8c4597c5572f5a37d974f60a9dff3255d83530f Mon Sep 17 00:00:00 2001 From: jmillman Date: Fri, 13 Sep 2013 10:06:45 -0400 Subject: [PATCH] vik-174 : used Tika's LanguageIdentifier to add language identification info to blackboard --- .../AbstractFileHtmlExtract.java | 14 ++-- .../AbstractFileTikaTextExtract.java | 20 ++++-- .../keywordsearch/TextLanguageIdentifier.java | 24 +++++++ .../keywordsearch/TikaLanguageIdentifier.java | 70 +++++++++++++++++++ 4 files changed, 117 insertions(+), 11 deletions(-) create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java create mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java index 1f5d9a57e8..530c85eaff 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java @@ -148,6 +148,7 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract { //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName()); //encode to bytes to index as byte stream String extracted; + //add BOM and trim the 0 bytes //set initial size to chars read + bom - try to prevent from resizing StringBuilder sb = new StringBuilder((int) totalRead + 1000); @@ -163,6 +164,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract { totalRead = 0; extracted = sb.toString(); + + //attempt to identify language of extracted text and post it to the blackboard + new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile); + + //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(outCharset); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); @@ 
-216,13 +222,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract { public boolean isSupported(AbstractFile file, String detectedFormat) { if (detectedFormat == null) { return false; - } - else if (WEB_MIME_TYPES.contains(detectedFormat) ) { + } else if (WEB_MIME_TYPES.contains(detectedFormat)) { return true; - } - else { + } else { return false; } - + } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java index 9ad1139ce0..92397829a2 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java @@ -39,11 +39,16 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; import org.apache.tika.Tika; +import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; +import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; +import org.sleuthkit.datamodel.BlackboardArtifact; +import org.sleuthkit.datamodel.BlackboardAttribute; +import org.sleuthkit.datamodel.TskCoreException; /** * Extractor of text from TIKA supported AbstractFile content. 
Extracted text is @@ -123,7 +128,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract { final InputStream stream = new ReadContentInputStream(sourceFile); try { Metadata meta = new Metadata(); - + //Parse the file in a task Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile); @@ -152,7 +157,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract { return false; } - + // break the results into chunks and index success = true; long readSize; @@ -213,7 +218,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract { } extracted = sb.toString(); - + + //attempt to identify language of extracted text and post it to the blackboard + new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile); + //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); @@ -272,7 +280,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract { if (detectedFormat == null) { return false; } else if (detectedFormat.equals("application/octet-stream") - || detectedFormat.equals("application/x-msdownload") ) { + || detectedFormat.equals("application/x-msdownload")) { //any binary unstructured blobs (string extraction will be used) return false; } else if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) { @@ -292,8 +300,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract { } /** - * Runnable task that calls tika to parse the content using - * the input stream. Provides reader for results. + * Runnable task that calls tika to parse the content using the input + * stream. Provides reader for results. 
*/ private static class ParseRequestTask implements Runnable {
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java
new file mode 100755
index 0000000000..ade1be040a
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java
@@ -0,0 +1,24 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ * Identifies the language of extracted text and records it on the blackboard.
+ *
+ * @author jmillman
+ */
+public interface TextLanguageIdentifier {
+
+    /**
+     * Attempts to identify the language of the given text and post it for the
+     * given file as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact.
+     *
+     * @param extracted the String whose language is to be identified
+     * @param sourceFile the AbstractFile the string is extracted from
+     */
+    void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
+}
diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java
new file mode 100755
index 0000000000..011da893c0
--- /dev/null
+++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java
@@ -0,0 +1,70 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.util.logging.Level;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.TskCoreException;
+
+/**
+ * TextLanguageIdentifier implementation based on a wrapped Tika
+ * LanguageIdentifier.
+ *
+ * @author jmillman
+ */
+public class TikaLanguageIdentifier implements TextLanguageIdentifier {
+
+    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
+
+    /**
+     * Detects the language of the extracted text with Tika's
+     * LanguageIdentifier, logs the result, and posts the detected language as
+     * a TSK_TEXT_LANGUAGE attribute on a new TSK_GEN_INFO artifact of the
+     * source file.
+     *
+     * A TskCoreException raised while writing to the blackboard is logged as
+     * a warning and not propagated to the caller.
+     *
+     * @param extracted the text whose language is to be identified; null or
+     * empty text is skipped because there is nothing to identify
+     * @param sourceFile the file the text was extracted from
+     */
+    @Override
+    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
+        if (extracted == null || extracted.isEmpty()) {
+            //nothing to identify, so do not post a meaningless attribute
+            return;
+        }
+
+        final org.apache.tika.language.LanguageIdentifier li =
+                new org.apache.tika.language.LanguageIdentifier(extracted);
+        final String language = li.getLanguage();
+        final String confidence = li.isReasonablyCertain() ? "HIGH" : "LOW";
+
+        //record what was detected and how confident Tika is in the result
+        logger.log(Level.INFO, sourceFile.getName() + " detected language: " + language
+                + " with " + confidence + " confidence");
+
+        try {
+            //NOTE(review): newArtifact presumably creates a fresh TSK_GEN_INFO
+            //artifact on every call, and the extractors appear to call this once
+            //per chunk - confirm whether one shared artifact per file is intended
+            BlackboardArtifact genInfo =
+                    sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
+
+            BlackboardAttribute textLang = new BlackboardAttribute(
+                    BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
+                    KeywordSearchIngestModule.MODULE_NAME, language);
+
+            genInfo.addAttribute(textLang);
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: "
+                    + sourceFile.getName(), ex);
+        }
+    }
+}