mirror of https://github.com/overcuriousity/autopsy-flatpak.git (synced 2025-07-17 18:17:43 +00:00)
vik-174 : used Tika's LanguageIdentifier to add language identification info to blackboard
parent 9d06ae8c41
commit b8c4597c55
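This change wraps Tika's statistical language detector behind a small TextLanguageIdentifier interface and posts the detected language to the blackboard as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact. For reference, a minimal standalone sketch of the Tika API the new class relies on (outside Autopsy; the sample string and class name are purely illustrative):

import org.apache.tika.language.LanguageIdentifier;

public class LanguageIdSketch {

    public static void main(String[] args) {
        // Any extracted text would do; this sample is purely illustrative.
        String extracted = "Ceci est un exemple de texte extrait d'un fichier.";

        // Tika compares character n-gram profiles of the text against its
        // built-in language profiles.
        LanguageIdentifier li = new LanguageIdentifier(extracted);

        // getLanguage() returns an ISO 639 code (e.g. "fr");
        // isReasonablyCertain() reports whether the profile match was strong.
        System.out.println("language: " + li.getLanguage()
                + ", reasonably certain: " + li.isReasonablyCertain());
    }
}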
@@ -148,6 +148,7 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
 //encode to bytes to index as byte stream
 String extracted;

 //add BOM and trim the 0 bytes
 //set initial size to chars read + bom - try to prevent from resizing
 StringBuilder sb = new StringBuilder((int) totalRead + 1000);
@@ -163,6 +164,11 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 totalRead = 0;
 extracted = sb.toString();

+//attempt to identify language of extracted text and post it to the blackboard
+new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
+
 //converts BOM automatically to charSet encoding
 byte[] encodedBytes = extracted.getBytes(outCharset);
 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@@ -216,11 +222,9 @@ public class AbstractFileHtmlExtract implements AbstractFileExtract {
 public boolean isSupported(AbstractFile file, String detectedFormat) {
     if (detectedFormat == null) {
         return false;
-    }
-    else if (WEB_MIME_TYPES.contains(detectedFormat) ) {
+    } else if (WEB_MIME_TYPES.contains(detectedFormat)) {
         return true;
-    }
-    else {
+    } else {
         return false;
     }

@@ -39,11 +39,16 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile;
 import org.sleuthkit.datamodel.AbstractFile;
 import org.sleuthkit.datamodel.ReadContentInputStream;
 import org.apache.tika.Tika;
+import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.sleuthkit.autopsy.casemodule.Case;
 import org.sleuthkit.autopsy.coreutils.StringExtract;
 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.TskCoreException;

 /**
  * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@@ -214,6 +219,9 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {

 extracted = sb.toString();

+//attempt to identify language of extracted text and post it to the blackboard
+new TikaLanguageIdentifier().addLanguageToBlackBoard(extracted, sourceFile);
+
 //converts BOM automatically to charSet encoding
 byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
 AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
@@ -272,7 +280,7 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 if (detectedFormat == null) {
     return false;
 } else if (detectedFormat.equals("application/octet-stream")
-        || detectedFormat.equals("application/x-msdownload") ) {
+        || detectedFormat.equals("application/x-msdownload")) {
     //any binary unstructured blobs (string extraction will be used)
     return false;
 } else if (AbstractFileExtract.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
@@ -292,8 +300,8 @@ public class AbstractFileTikaTextExtract implements AbstractFileExtract {
 }

 /**
- * Runnable task that calls tika to parse the content using
- * the input stream. Provides reader for results.
+ * Runnable task that calls tika to parse the content using the input
+ * stream. Provides reader for results.
  */
 private static class ParseRequestTask implements Runnable {

new file: TextLanguageIdentifier.java
@@ -0,0 +1,24 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import org.sleuthkit.datamodel.AbstractFile;
+
+/**
+ *
+ * @author jmillman
+ */
+public interface TextLanguageIdentifier {
+
+    /**
+     * Attempts to identify the language of the given String and add it to the
+     * blackboard for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
+     * attribute on a TSK_GEN_INFO artifact.
+     *
+     * @param extracted  the String whose language is to be identified
+     * @param sourceFile the AbstractFile the string was extracted from
+     */
+    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
+}
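The interface keeps the extractors decoupled from Tika, so any detector that maps extracted text to a language code can be dropped in. As a hypothetical illustration of that contract (not part of this commit), an implementation could log instead of writing to the blackboard, which is handy when exercising the call sites without touching the case database:

package org.sleuthkit.autopsy.keywordsearch;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.sleuthkit.datamodel.AbstractFile;

// Hypothetical implementation used only as an example of the interface contract.
public class LoggingLanguageIdentifier implements TextLanguageIdentifier {

    private static final Logger logger = Logger.getLogger(LoggingLanguageIdentifier.class.getName());

    @Override
    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
        // Report what would be identified; no artifact or attribute is created.
        logger.log(Level.INFO, "would identify language of {0} characters extracted from {1}",
                new Object[]{extracted.length(), sourceFile.getName()});
    }
}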
new file: TikaLanguageIdentifier.java
@@ -0,0 +1,70 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.sleuthkit.autopsy.keywordsearch;
+
+import java.util.logging.Level;
+import org.sleuthkit.autopsy.coreutils.Logger;
+import org.sleuthkit.datamodel.AbstractFile;
+import org.sleuthkit.datamodel.BlackboardArtifact;
+import org.sleuthkit.datamodel.BlackboardAttribute;
+import org.sleuthkit.datamodel.TskCoreException;
+
+/**
+ * TextLanguageIdentifier implementation based on a wrapped Tika LanguageIdentifier.
+ *
+ * @author jmillman
+ */
+public class TikaLanguageIdentifier implements TextLanguageIdentifier {
+
+    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
+
+    @Override
+    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
+
+        org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
+
+        logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
+                + " with " + (li.isReasonablyCertain() ? "HIGH" : "LOW") + " confidence");
+
+        BlackboardArtifact genInfo;
+        try {
+            genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
+
+            BlackboardAttribute textLang = new BlackboardAttribute(
+                    BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
+                    KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
+
+            genInfo.addAttribute(textLang);
+
+        } catch (TskCoreException ex) {
+            logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
+        }
+
+        /* //attempt to verify that artifact with attribute was created
+        ArrayList<BlackboardArtifact> arts;
+        try {
+            arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
+            for (BlackboardArtifact art : arts) {
+                List<BlackboardAttribute> attrs = art.getAttributes();
+                for (BlackboardAttribute attr : attrs) {
+                    if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
+                        logger.log(Level.INFO, "successfully added " + attr.getValueString() + " to gen info for: " + sourceFile.getName());
+                        break;
+                    }
+                }
+            }
+        } catch (TskCoreException ex) {
+            Exceptions.printStackTrace(ex);
+        }*/
+    }
+}