mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 18:17:43 +00:00)
vik-174 : used Tika's LanguageIdentifier to add language identification info to blackboard
parent 9d06ae8c41
commit b8c4597c55
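This change wraps Tika's statistical language detector behind a small TextLanguageIdentifier interface and posts the detected language to the blackboard as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact. For reference, a minimal standalone sketch of the Tika API the new class relies on (outside Autopsy; the sample string and class name are purely illustrative):

import org.apache.tika.language.LanguageIdentifier;

public class LanguageIdSketch {

    public static void main(String[] args) {
        // Any extracted text would do; this sample is purely illustrative.
        String extracted = "Ceci est un exemple de texte extrait d'un fichier.";

        // Tika compares character n-gram profiles of the text against its
        // built-in language profiles.
        LanguageIdentifier li = new LanguageIdentifier(extracted);

        // getLanguage() returns an ISO 639 code (e.g. "fr");
        // isReasonablyCertain() reports whether the profile match was strong.
        System.out.println("language: " + li.getLanguage()
                + ", reasonably certain: " + li.isReasonablyCertain());
    }
}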
@@ -148,6 +148,7 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
 //encode to bytes to index as byte stream
 String extracted;

 //add BOM and trim the 0 bytes
 //set initial size to chars read + bom - try to prevent from resizing
 StringBuilder sb = new StringBuilder((int) totalRead + 1000);
@@ -163,6 +164,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 totalRead = 0;
 extracted = sb.toString();

+//attempt to identify language of extracted text and post it to the blackboard
+new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
+
 //converts BOM automatically to charSet encoding
 byte[] encodedBytes = extracted.getBytes(outCharset);
 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@@ -216,11 +222,9 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 public boolean isSupported(AbstractFile file, String detectedFormat) {
     if (detectedFormat == null) {
         return false;
-    }
-    else if (WEB_MIME_TYPES.contains(detectedFormat) ) {
+    } else if (WEB_MIME_TYPES.contains(detectedFormat)) {
         return true;
-    }
-    else {
+    } else {
         return false;
     }

@@ -39,11 +39,16 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
+import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.TskCoreException;

 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@@ -214,6 +219,9 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {

 extracted = sb.toString();

+//attempt to identify language of extracted text and post it to the blackboard
+new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
+
 //converts BOM automatically to charSet encoding
 byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@@ -272,7 +280,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 if (detectedFormat == null) {
     return false;
 } else if (detectedFormat.equals("application/octet-stream")
-        || detectedFormat.equals("application/x-msdownload") ) {
+        || detectedFormat.equals("application/x-msdownload")) {
     //any binary unstructured blobs (string extraction will be used)
     return false;
 } else if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
@@ -292,8 +300,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 }

 /**
- * Runnable task that calls tika to parse the content using
- * the input stream. Provides reader for results.
+ * Runnable task that calls tika to parse the content using the input
+ * stream. Provides reader for results.
  */
 private static class ParseRequestTask implements Runnable {

new file: TextLanguageIdentifier.java
@@ -0,0 +1,24 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ *
+ * @author jmillman
+ */
+public interface TextLanguageIdentifier {
+
+    /**
+     * Attempts to identify the language of the given String and add it to the
+     * blackboard for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
+     * attribute on a TSK_GEN_INFO artifact.
+     *
+     * @param extracted  the String whose language is to be identified
+     * @param sourceFile the AbstractFile the string was extracted from
+     */
+    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
+}
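The interface keeps the extractors decoupled from Tika, so any detector that maps extracted text to a language code can be dropped in. As a hypothetical illustration of that contract (not part of this commit), an implementation could log instead of writing to the blackboard, which is handy when exercising the call sites without touching the case database:

package org.sleuthkit.autopsy.keywordsearch;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.sleuthkit.datamodel.AbstractFile;

// Hypothetical implementation used only as an example of the interface contract.
public class LoggingLanguageIdentifier implements TextLanguageIdentifier {

    private static final Logger logger = Logger.getLogger(LoggingLanguageIdentifier.class.getName());

    @Override
    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
        // Report what would be identified; no artifact or attribute is created.
        logger.log(Level.INFO, "would identify language of {0} characters extracted from {1}",
                new Object[]{extracted.length(), sourceFile.getName()});
    }
}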
new file: TikaLanguageIdentifier.java
@@ -0,0 +1,70 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.util.logging.Level;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.TskCoreException;
+
+/**
+ * TextLanguageIdentifier implementation based on a wrapped Tika LanguageIdentifier.
+ *
+ * @author jmillman
+ */
+public class TikaLanguageIdentifier implements TextLanguageIdentifier {
+
+    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
+
+    @Override
+    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
+
+        org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
+
+        logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
+                + " with " + (li.isReasonablyCertain() ? "HIGH" : "LOW") + " confidence");
+
+        BlackboardArtifact genInfo;
+        try {
+            genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
+
+            BlackboardAttribute textLang = new BlackboardAttribute(
+                    BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
+                    KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
+
+            genInfo.addAttribute(textLang);
+
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
+        }
+
+        /* //attempt to verify that artifact with attribute was created
+        ArrayList<BlackboardArtifact> arts;
+        try {
+            arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
+            for (BlackboardArtifact art : arts) {
+                List<BlackboardAttribute> attrs = art.getAttributes();
+                for (BlackboardAttribute attr : attrs) {
+                    if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
+                        logger.log(Level.INFO, "successfully added " + attr.getValueString() + " to gen info for: " + sourceFile.getName());
+                        break;
+                    }
+                }
+            }
+        } catch (TskCoreException ex) {
+            Exceptions.printStackTrace(ex);
+        }*/
+    }
+}