vik-174 : used Tika's LanguageIdentifier to add language identification info to blackboard

This commit is contained in:
jmillman 2013-09-13 10:06:45 -04:00
parent 9d06ae8c41
commit b8c4597c55
4 changed files with 117 additions and 11 deletions

View File

@ -148,6 +148,7 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
//logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
//encode to bytes to index as byte stream
String extracted;
//add BOM and trim the 0 bytes
//set initial size to chars read + bom - try to prevent from resizing
StringBuilder sb = new StringBuilder((int) totalRead + 1000);
@ -163,6 +164,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
totalRead = 0;
extracted = sb.toString();
//attempt to identify language of extracted text and post it to the blackboard
new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
//converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(outCharset);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@ -216,11 +222,9 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
public boolean isSupported(AbstractFile file, String detectedFormat) {
if (detectedFormat == null) {
return false;
}
else if (WEB_MIME_TYPES.contains(detectedFormat) ) {
} else if (WEB_MIME_TYPES.contains(detectedFormat)) {
return true;
}
else {
} else {
return false;
}

View File

@ -39,11 +39,16 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
@ -214,6 +219,9 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
extracted = sb.toString();
//attempt to identify language of extracted text and post it to the blackboard
new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
//converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@ -272,7 +280,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
if (detectedFormat == null) {
return false;
} else if (detectedFormat.equals("application/octet-stream")
|| detectedFormat.equals("application/x-msdownload") ) {
|| detectedFormat.equals("application/x-msdownload")) {
//any binary unstructured blobs (string extraction will be used)
return false;
} else if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
@ -292,8 +300,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
}
/**
* Runnable task that calls tika to parse the content using
* the input stream. Provides reader for results.
* Runnable task that calls tika to parse the content using the input
* stream. Provides reader for results.
*/
private static class ParseRequestTask implements Runnable {

View File

@ -0,0 +1,24 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.sleuthkit.autopsy.keywordsearch;
import org.sleuthkit.datamodel.AbstractFile;
/**
 * Contract for components that identify the language of extracted text and
 * record the result on the blackboard.
 *
 * @author jmillman
 */
public interface TextLanguageIdentifier {

    /**
     * Attempts to identify the language of the given String and add it to the
     * blackboard for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
     * attribute on a TSK_GEN_INFO artifact.
     *
     * @param extracted  the String whose language is to be identified
     * @param sourceFile the AbstractFile the string was extracted from
     */
    void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
}

View File

@ -0,0 +1,70 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
/**
 * TextLanguageIdentifier implementation based on a wrapped Tika
 * LanguageIdentifier.
 *
 * @author jmillman
 */
public class TikaLanguageIdentifier implements TextLanguageIdentifier {

    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());

    /**
     * Runs Tika's language identification over the extracted text, logs the
     * detected language together with a HIGH/LOW confidence label, and posts
     * the language as a TSK_TEXT_LANGUAGE attribute on a new TSK_GEN_INFO
     * artifact of the source file.
     *
     * Failures while writing to the blackboard are logged at WARNING level and
     * not propagated, so language identification never aborts text extraction.
     *
     * @param extracted  the text whose language is to be identified; if null,
     *                   nothing is posted
     * @param sourceFile the file the text was extracted from; if null, nothing
     *                   is posted
     */
    @Override
    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
        // Best-effort feature: skip quietly instead of throwing a
        // NullPointerException into the caller's extraction loop.
        if (extracted == null || sourceFile == null) {
            logger.log(Level.WARNING, "language identification skipped: missing extracted text or source file");
            return;
        }

        org.apache.tika.language.LanguageIdentifier identifier =
                new org.apache.tika.language.LanguageIdentifier(extracted);
        // Read the result once so the logged value and the posted value cannot differ.
        String language = identifier.getLanguage();
        String confidence = identifier.isReasonablyCertain() ? "HIGH" : "LOW";

        logger.log(Level.INFO, sourceFile.getName() + " detected language: " + language
                + " with " + confidence + " confidence");

        try {
            // NOTE(review): a new TSK_GEN_INFO artifact is created on every call;
            // if callers invoke this more than once per file, the file presumably
            // ends up with several such artifacts - confirm whether an existing
            // general-info artifact should be reused instead.
            BlackboardArtifact genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
            BlackboardAttribute textLang = new BlackboardAttribute(
                    BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
                    KeywordSearchIngestModule.MODULE_NAME, language);
            genInfo.addAttribute(textLang);
        } catch (TskCoreException ex) {
            logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
        }
    }
}