mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 02:07:42 +00:00
vik-174 : used Tika's LanguageIdentifier to add language identification info to blackboard
This commit is contained in:
parent
9d06ae8c41
commit
b8c4597c55
@ -148,6 +148,7 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
|
||||
//logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
|
||||
//encode to bytes to index as byte stream
|
||||
String extracted;
|
||||
|
||||
//add BOM and trim the 0 bytes
|
||||
//set initial size to chars read + bom - try to prevent from resizing
|
||||
StringBuilder sb = new StringBuilder((int) totalRead + 1000);
|
||||
@ -163,6 +164,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
|
||||
totalRead = 0;
|
||||
extracted = sb.toString();
|
||||
|
||||
|
||||
//attempt to identify language of extracted text and post it to the blackboard
|
||||
new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
|
||||
|
||||
|
||||
//converts BOM automatically to charSet encoding
|
||||
byte[] encodedBytes = extracted.getBytes(outCharset);
|
||||
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
|
||||
@ -216,13 +222,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
|
||||
public boolean isSupported(AbstractFile file, String detectedFormat) {
|
||||
if (detectedFormat == null) {
|
||||
return false;
|
||||
}
|
||||
else if (WEB_MIME_TYPES.contains(detectedFormat) ) {
|
||||
} else if (WEB_MIME_TYPES.contains(detectedFormat)) {
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -39,11 +39,16 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.language.LanguageIdentifier;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.sleuthkit.autopsy.casemodule.Case;
|
||||
import org.sleuthkit.autopsy.coreutils.StringExtract;
|
||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.BlackboardAttribute;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
|
||||
/**
|
||||
* Extractor of text from TIKA supported AbstractFile content. Extracted text is
|
||||
@ -123,7 +128,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
final InputStream stream = new ReadContentInputStream(sourceFile);
|
||||
try {
|
||||
Metadata meta = new Metadata();
|
||||
|
||||
|
||||
//Parse the file in a task
|
||||
Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
|
||||
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
|
||||
@ -152,7 +157,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// break the results into chunks and index
|
||||
success = true;
|
||||
long readSize;
|
||||
@ -213,7 +218,10 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
}
|
||||
|
||||
extracted = sb.toString();
|
||||
|
||||
|
||||
//attempt to identify language of extracted text and post it to the blackboard
|
||||
new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
|
||||
|
||||
//converts BOM automatically to charSet encoding
|
||||
byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
|
||||
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
|
||||
@ -272,7 +280,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
if (detectedFormat == null) {
|
||||
return false;
|
||||
} else if (detectedFormat.equals("application/octet-stream")
|
||||
|| detectedFormat.equals("application/x-msdownload") ) {
|
||||
|| detectedFormat.equals("application/x-msdownload")) {
|
||||
//any binary unstructured blobs (string extraction will be used)
|
||||
return false;
|
||||
} else if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
|
||||
@ -292,8 +300,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
|
||||
}
|
||||
|
||||
/**
|
||||
* Runnable task that calls tika to parse the content using
|
||||
* the input stream. Provides reader for results.
|
||||
* Runnable task that calls tika to parse the content using the input
|
||||
* stream. Provides reader for results.
|
||||
*/
|
||||
private static class ParseRequestTask implements Runnable {
|
||||
|
||||
|
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author jmillman
|
||||
*/
|
||||
public interface TextLanguageIdentifier {
|
||||
|
||||
/**
|
||||
* attempts to identify the language of the given String and add it to the black board for the given {@code AbstractFile}
|
||||
* as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact.
|
||||
*
|
||||
* @param extracted the String whose language is to be identified
|
||||
* @param sourceFile the AbstractFile the string is extracted from.
|
||||
* @return
|
||||
*/
|
||||
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
|
||||
}
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
package org.sleuthkit.autopsy.keywordsearch;
|
||||
|
||||
import java.util.logging.Level;
|
||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||
import org.sleuthkit.datamodel.AbstractFile;
|
||||
import org.sleuthkit.datamodel.BlackboardArtifact;
|
||||
import org.sleuthkit.datamodel.BlackboardAttribute;
|
||||
import org.sleuthkit.datamodel.TskCoreException;
|
||||
|
||||
/**
|
||||
*TextLanguageIdentifier implementation based on a wrapped Tike LanguageIdentifier
|
||||
*
|
||||
*
|
||||
* @author jmillman
|
||||
*/
|
||||
public class TikaLanguageIdentifier implements TextLanguageIdentifier {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
|
||||
|
||||
@Override
|
||||
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
|
||||
|
||||
org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
|
||||
|
||||
logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
|
||||
+ "with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + "confidence");
|
||||
|
||||
BlackboardArtifact genInfo;
|
||||
try {
|
||||
genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
|
||||
|
||||
BlackboardAttribute textLang = new BlackboardAttribute(
|
||||
BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
|
||||
KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
|
||||
|
||||
|
||||
genInfo.addAttribute(textLang);
|
||||
|
||||
} catch (TskCoreException ex) {
|
||||
logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* //attempt to verify that artifact with attribute was created
|
||||
ArrayList<BlackboardArtifact> arts;
|
||||
|
||||
|
||||
try {
|
||||
arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
|
||||
|
||||
for (BlackboardArtifact art : arts) {
|
||||
|
||||
List<BlackboardAttribute> attrs = art.getAttributes();
|
||||
for (BlackboardAttribute attr : attrs) {
|
||||
if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
|
||||
logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (TskCoreException ex) {
|
||||
Exceptions.printStackTrace(ex);
|
||||
}*/
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user