mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-19 19:14:55 +00:00
vik- 174 : added min string length to language detection
This commit is contained in:
parent
47ce10ad90
commit
6b7df1ae47
@ -13,12 +13,13 @@ import org.sleuthkit.datamodel.AbstractFile;
|
|||||||
public interface TextLanguageIdentifier {
|
public interface TextLanguageIdentifier {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attempts to identify the language of the given String and add it to the blackboard for the given {@code AbstractFile}
|
* attempts to identify the language of the given String and add it to the
|
||||||
* as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact.
|
* blackboard for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
|
||||||
|
* attribute on a TSK_GEN_INFO artifact.
|
||||||
*
|
*
|
||||||
* @param extracted the String whose language is to be identified
|
* @param extracted the String whose language is to be identified
|
||||||
* @param sourceFile the AbstractFile the string is extracted from.
|
* @param sourceFile the AbstractFile the string is extracted from.
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
|
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,8 @@ import org.sleuthkit.datamodel.BlackboardAttribute;
|
|||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TextLanguageIdentifier implementation based on a wrapped Tika LanguageIdentifier
|
* TextLanguageIdentifier implementation based on a wrapped Tika
|
||||||
|
* LanguageIdentifier
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* @author jmillman
|
* @author jmillman
|
||||||
@ -20,51 +21,56 @@ import org.sleuthkit.datamodel.TskCoreException;
|
|||||||
public class TikaLanguageIdentifier implements TextLanguageIdentifier {
|
public class TikaLanguageIdentifier implements TextLanguageIdentifier {
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
|
private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
|
||||||
|
private static final int MIN_STRING_LENGTH = 1000;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
|
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
|
||||||
|
if (extracted.length() > MIN_STRING_LENGTH) {
|
||||||
|
org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
|
||||||
|
|
||||||
org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
|
logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
|
||||||
|
+ " with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + " confidence");
|
||||||
|
|
||||||
logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
|
BlackboardArtifact genInfo;
|
||||||
+ "with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + "confidence");
|
try {
|
||||||
|
genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
|
||||||
|
|
||||||
BlackboardArtifact genInfo;
|
BlackboardAttribute textLang = new BlackboardAttribute(
|
||||||
try {
|
BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
|
||||||
genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
|
KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
|
||||||
|
|
||||||
BlackboardAttribute textLang = new BlackboardAttribute(
|
|
||||||
BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
|
|
||||||
KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
|
|
||||||
|
|
||||||
|
|
||||||
genInfo.addAttribute(textLang);
|
genInfo.addAttribute(textLang);
|
||||||
|
|
||||||
} catch (TskCoreException ex) {
|
} catch (TskCoreException ex) {
|
||||||
logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
|
logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}else{
|
||||||
|
logger.info("extracted text too short, skipping language detection on " + sourceFile.getName());
|
||||||
}
|
}
|
||||||
|
} /* //attempt to verify that artifact with attribute was created
|
||||||
|
ArrayList<BlackboardArtifact> arts;
|
||||||
|
|
||||||
|
|
||||||
|
try {
|
||||||
|
arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
|
||||||
|
|
||||||
/* //attempt to verify that artifact with attribute was created
|
for (BlackboardArtifact art : arts) {
|
||||||
ArrayList<BlackboardArtifact> arts;
|
|
||||||
|
|
||||||
|
List<BlackboardAttribute> attrs = art.getAttributes();
|
||||||
|
for (BlackboardAttribute attr : attrs) {
|
||||||
|
if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
|
||||||
|
logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (TskCoreException ex) {
|
||||||
|
Exceptions.printStackTrace(ex);
|
||||||
|
}*/
|
||||||
|
|
||||||
try {
|
|
||||||
arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
|
|
||||||
|
|
||||||
for (BlackboardArtifact art : arts) {
|
|
||||||
|
|
||||||
List<BlackboardAttribute> attrs = art.getAttributes();
|
|
||||||
for (BlackboardAttribute attr : attrs) {
|
|
||||||
if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
|
|
||||||
logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (TskCoreException ex) {
|
|
||||||
Exceptions.printStackTrace(ex);
|
|
||||||
}*/
|
|
||||||
}
|
|
||||||
}
|
}
|
Loading…
x
Reference in New Issue
Block a user