mirror of
https://github.com/overcuriousity/autopsy-flatpak.git
synced 2025-07-17 18:17:43 +00:00
Merge pull request #5125 from ethteck/better-encoding-detection
Added decodetect library for better encoding detection on text files
This commit is contained in:
commit
754fd8c9d0
@ -42,6 +42,9 @@
|
|||||||
|
|
||||||
<dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
|
<dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>
|
||||||
<dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
|
<dependency conf="core->default" org="org.apache.opennlp" name="opennlp-tools" rev="1.9.1"/>
|
||||||
|
|
||||||
|
<dependency conf="core->default" org="com.ethteck.decodetect" name="decodetect-core" rev="0.3"/>
|
||||||
|
<dependency conf="core->default" org="com.beetstra.jutf7" name="jutf7" rev="1.0.0"/>
|
||||||
|
|
||||||
<dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
|
<dependency org="org.sejda.webp-imageio" name="webp-imageio-sejda" rev="0.1.0"/>
|
||||||
<dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
|
<dependency org="com.googlecode.libphonenumber" name="libphonenumber" rev="3.5" />
|
||||||
@ -54,6 +57,6 @@
|
|||||||
|
|
||||||
<!-- https://mvnrepository.com/artifact/javax.ws.rs/javax.ws.rs-api -->
|
<!-- https://mvnrepository.com/artifact/javax.ws.rs/javax.ws.rs-api -->
|
||||||
<dependency org="javax.ws.rs" name="javax.ws.rs-api" rev="2.0"/>
|
<dependency org="javax.ws.rs" name="javax.ws.rs-api" rev="2.0"/>
|
||||||
<override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>
|
<override org="jakarta.ws.rs" module="jakarta.ws.rs-api" rev="2.1.5"/>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
@ -18,6 +18,7 @@ file.reference.commons-lang3-3.8.1.jar=release\\modules\\ext\\commons-lang3-3.8.
|
|||||||
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
|
file.reference.commons-pool2-2.4.2.jar=release/modules/ext/commons-pool2-2.4.2.jar
|
||||||
file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
|
file.reference.cxf-rt-rs-client-3.3.0.jar=release\\modules\\ext\\cxf-rt-rs-client-3.3.0.jar
|
||||||
file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
|
file.reference.dec-0.1.2.jar=release\\modules\\ext\\dec-0.1.2.jar
|
||||||
|
file.reference.decodetect-core-0.3.jar=release\\modules\\ext\\decodetect-core-0.3.jar
|
||||||
file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
|
file.reference.fontbox-2.0.13.jar=release\\modules\\ext\\fontbox-2.0.13.jar
|
||||||
file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
|
file.reference.geoapi-3.0.1.jar=release\\modules\\ext\\geoapi-3.0.1.jar
|
||||||
file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
|
file.reference.grib-4.5.5.jar=release\\modules\\ext\\grib-4.5.5.jar
|
||||||
@ -50,6 +51,7 @@ file.reference.jsoup-1.11.3.jar=release\\modules\\ext\\jsoup-1.11.3.jar
|
|||||||
file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
|
file.reference.jul-to-slf4j-1.7.25.jar=release\\modules\\ext\\jul-to-slf4j-1.7.25.jar
|
||||||
file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
|
file.reference.juniversalchardet-1.0.3.jar=release\\modules\\ext\\juniversalchardet-1.0.3.jar
|
||||||
file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
|
file.reference.junrar-2.0.0.jar=release\\modules\\ext\\junrar-2.0.0.jar
|
||||||
|
file.reference.jutf7-1.0.0.jar=release\\modules\\ext\\jutf7-1.0.0.jar
|
||||||
file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
|
file.reference.jxmapviewer2-2.4.jar=release/modules/ext/jxmapviewer2-2.4.jar
|
||||||
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
|
file.reference.jython-standalone-2.7.0.jar=release/modules/ext/jython-standalone-2.7.0.jar
|
||||||
file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
|
file.reference.libphonenumber-3.5.jar=release/modules/ext/libphonenumber-3.5.jar
|
||||||
|
@ -794,6 +794,14 @@
|
|||||||
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
|
<runtime-relative-path>ext/vorbis-java-tika-0.8.jar</runtime-relative-path>
|
||||||
<binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
|
<binary-origin>release\modules\ext\vorbis-java-tika-0.8.jar</binary-origin>
|
||||||
</class-path-extension>
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/decodetect-core-0.3.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/decodetect-core-0.3.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
|
<class-path-extension>
|
||||||
|
<runtime-relative-path>ext/jutf7-1.0.0.jar</runtime-relative-path>
|
||||||
|
<binary-origin>release/modules/ext/jutf7-1.0.0.jar</binary-origin>
|
||||||
|
</class-path-extension>
|
||||||
</data>
|
</data>
|
||||||
</configuration>
|
</configuration>
|
||||||
</project>
|
</project>
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
*/
|
*/
|
||||||
package org.sleuthkit.autopsy.modules.filetypeid;
|
package org.sleuthkit.autopsy.modules.filetypeid;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -29,6 +30,7 @@ import org.apache.tika.Tika;
|
|||||||
import org.apache.tika.io.TikaInputStream;
|
import org.apache.tika.io.TikaInputStream;
|
||||||
import org.apache.tika.mime.MimeTypes;
|
import org.apache.tika.mime.MimeTypes;
|
||||||
import org.sleuthkit.autopsy.coreutils.Logger;
|
import org.sleuthkit.autopsy.coreutils.Logger;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
import org.sleuthkit.datamodel.TskCoreException;
|
import org.sleuthkit.datamodel.TskCoreException;
|
||||||
@ -249,6 +251,17 @@ public class FileTypeDetector {
|
|||||||
mimeType = tikaType.replace("tika-", ""); //NON-NLS
|
mimeType = tikaType.replace("tika-", ""); //NON-NLS
|
||||||
mimeType = removeOptionalParameter(mimeType);
|
mimeType = removeOptionalParameter(mimeType);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* If the file was marked as an octet stream and the extension is .txt, try to detect a text
|
||||||
|
* encoding with Decodetect.
|
||||||
|
*/
|
||||||
|
if (file.getNameExtension().equals("txt")) {
|
||||||
|
Charset detectedCharset = TextFileExtractor.getEncoding(file);
|
||||||
|
if (detectedCharset != TextFileExtractor.UNKNOWN_CHARSET) {
|
||||||
|
mimeType = MimeTypes.PLAIN_TEXT;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -89,6 +89,7 @@ public class TextExtractorFactory {
|
|||||||
*/
|
*/
|
||||||
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
|
private static List<TextExtractor> getFileExtractors(AbstractFile content, Lookup context) {
|
||||||
List<TextExtractor> fileExtractors = Arrays.asList(
|
List<TextExtractor> fileExtractors = Arrays.asList(
|
||||||
|
new TextFileExtractor(content),
|
||||||
new HtmlTextExtractor(content),
|
new HtmlTextExtractor(content),
|
||||||
new SqliteTextExtractor(content),
|
new SqliteTextExtractor(content),
|
||||||
new TikaTextExtractor(content));
|
new TikaTextExtractor(content));
|
||||||
|
@ -0,0 +1,135 @@
|
|||||||
|
/*
|
||||||
|
* Autopsy Forensic Browser
|
||||||
|
*
|
||||||
|
* Copyright 2018-2019 Basis Technology Corp.
|
||||||
|
* Contact: carrier <at> sleuthkit <dot> org
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.sleuthkit.autopsy.textextractors;
|
||||||
|
|
||||||
|
import com.ethteck.decodetect.core.Decodetect;
|
||||||
|
import com.ethteck.decodetect.core.DecodetectResult;
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.CharsetDecoder;
|
||||||
|
import java.nio.charset.CharsetEncoder;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.charset.UnsupportedCharsetException;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.tika.parser.txt.CharsetDetector;
|
||||||
|
import org.apache.tika.parser.txt.CharsetMatch;
|
||||||
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
|
import org.sleuthkit.datamodel.Content;
|
||||||
|
import org.sleuthkit.datamodel.ReadContentInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract text from text files
|
||||||
|
*/
|
||||||
|
public final class TextFileExtractor implements TextExtractor {
|
||||||
|
public static Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
|
||||||
|
@Override
|
||||||
|
public boolean contains(Charset cs) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CharsetDecoder newDecoder() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CharsetEncoder newEncoder() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// This value will be used as a threshold for determining which encoding
|
||||||
|
// detection library to use. If Tika's own confidence is at least
|
||||||
|
// MIN_MATCH_CONFIDENCE, Tika's result will be used for decoding.
|
||||||
|
// Otherwise, Decodetect will be used.
|
||||||
|
static final private int MIN_TIKA_MATCH_CONFIDENCE = 35;
|
||||||
|
|
||||||
|
// This value determines whether we will consider Decodetect's top-scoring
|
||||||
|
// result a legitimate match or if we will disregard its findings
|
||||||
|
//
|
||||||
|
// Possible values are 0 to 1, inclusive
|
||||||
|
static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
|
||||||
|
|
||||||
|
private final AbstractFile file;
|
||||||
|
|
||||||
|
public TextFileExtractor(AbstractFile file) {
|
||||||
|
this.file = file;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reader getReader() {
|
||||||
|
Charset encoding = getEncoding(file);
|
||||||
|
if (encoding.equals(UNKNOWN_CHARSET)) {
|
||||||
|
encoding = StandardCharsets.UTF_8;
|
||||||
|
}
|
||||||
|
return getReader(encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader(Charset encoding) {
|
||||||
|
return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isSupported() {
|
||||||
|
return file.getMIMEType().equals("text/plain");
|
||||||
|
}
|
||||||
|
|
||||||
|
public class TextFileExtractorException extends Exception {
|
||||||
|
public TextFileExtractorException(String msg, Throwable ex) {
|
||||||
|
super(msg, ex);
|
||||||
|
}
|
||||||
|
public TextFileExtractorException(String msg) {
|
||||||
|
super(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Charset getEncoding(Content content) {
|
||||||
|
try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(content))) {
|
||||||
|
// Tika first
|
||||||
|
CharsetDetector detector = new CharsetDetector();
|
||||||
|
detector.setText(stream);
|
||||||
|
CharsetMatch tikaResult = detector.detect();
|
||||||
|
if (tikaResult != null && tikaResult.getConfidence() >= MIN_TIKA_MATCH_CONFIDENCE) {
|
||||||
|
try {
|
||||||
|
return Charset.forName(tikaResult.getName());
|
||||||
|
} catch (UnsupportedCharsetException ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodetect if Tika fails or falls below confidence threshold
|
||||||
|
int maxBytes = 100000;
|
||||||
|
int numBytes = Math.min(stream.available(), maxBytes);
|
||||||
|
byte[] targetArray = new byte[numBytes];
|
||||||
|
stream.read(targetArray);
|
||||||
|
List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
|
||||||
|
if (!results.isEmpty()) {
|
||||||
|
DecodetectResult topResult = results.get(0);
|
||||||
|
if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
|
||||||
|
return topResult.getEncoding();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException ignored) {
|
||||||
|
}
|
||||||
|
return UNKNOWN_CHARSET;
|
||||||
|
}
|
||||||
|
}
|
@ -28,6 +28,7 @@ import java.util.Map;
|
|||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.tika.mime.MimeTypes;
|
||||||
import org.openide.util.Lookup;
|
import org.openide.util.Lookup;
|
||||||
import org.openide.util.NbBundle;
|
import org.openide.util.NbBundle;
|
||||||
import org.openide.util.NbBundle.Messages;
|
import org.openide.util.NbBundle.Messages;
|
||||||
@ -44,12 +45,12 @@ import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
|
|||||||
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
|
||||||
import org.sleuthkit.autopsy.ingest.IngestServices;
|
import org.sleuthkit.autopsy.ingest.IngestServices;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
|
||||||
import org.sleuthkit.autopsy.keywordsearch.TextFileExtractor.TextFileExtractorException;
|
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
|
||||||
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
|
||||||
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
import org.sleuthkit.autopsy.textextractors.TextExtractor;
|
||||||
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
|
||||||
|
import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
|
||||||
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
|
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
|
||||||
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
|
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
import org.sleuthkit.datamodel.AbstractFile;
|
||||||
@ -632,7 +633,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
if (context.fileIngestIsCancelled()) {
|
if (context.fileIngestIsCancelled()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (fileType.equals("application/octet-stream")) {
|
if (fileType.equals(MimeTypes.OCTET_STREAM)) {
|
||||||
extractStringsAndIndex(aFile);
|
extractStringsAndIndex(aFile);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -657,20 +658,7 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
|
if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
|
||||||
//Carved Files should be the only type of unallocated files capable of a txt extension and
|
//Carved Files should be the only type of unallocated files capable of a txt extension and
|
||||||
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
//should be ignored by the TextFileExtractor because they may contain more than one text encoding
|
||||||
try {
|
wasTextAdded = indexTextFile(aFile);
|
||||||
TextFileExtractor textFileExtractor = new TextFileExtractor();
|
|
||||||
Reader textReader = textFileExtractor.getReader(aFile);
|
|
||||||
if (textReader == null) {
|
|
||||||
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
|
|
||||||
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
|
||||||
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
|
||||||
wasTextAdded = true;
|
|
||||||
}
|
|
||||||
} catch (IngesterException ex) {
|
|
||||||
logger.log(Level.WARNING, "Unable to index as unicode", ex);
|
|
||||||
} catch (TextFileExtractorException ex) {
|
|
||||||
logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if it wasn't supported or had an error, default to strings
|
// if it wasn't supported or had an error, default to strings
|
||||||
@ -678,5 +666,29 @@ public final class KeywordSearchIngestModule implements FileIngestModule {
|
|||||||
extractStringsAndIndex(aFile);
|
extractStringsAndIndex(aFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds the text file to the index given an encoding.
|
||||||
|
* Returns true if indexing was successful and false otherwise.
|
||||||
|
*
|
||||||
|
* @param aFile Text file to analyze
|
||||||
|
* @param detectedCharset the encoding of the file
|
||||||
|
*/
|
||||||
|
private boolean indexTextFile(AbstractFile aFile) {
|
||||||
|
try {
|
||||||
|
TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
|
||||||
|
Reader textReader = textFileExtractor.getReader();
|
||||||
|
if (textReader == null) {
|
||||||
|
logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
|
||||||
|
} else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
|
||||||
|
textReader.close();
|
||||||
|
putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} catch (IngesterException | IOException ex) {
|
||||||
|
logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,68 +0,0 @@
|
|||||||
/*
|
|
||||||
* Autopsy Forensic Browser
|
|
||||||
*
|
|
||||||
* Copyright 2018-2019 Basis Technology Corp.
|
|
||||||
* Contact: carrier <at> sleuthkit <dot> org
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.sleuthkit.autopsy.keywordsearch;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.Reader;
|
|
||||||
import org.apache.tika.parser.txt.CharsetDetector;
|
|
||||||
import org.apache.tika.parser.txt.CharsetMatch;
|
|
||||||
import org.sleuthkit.datamodel.AbstractFile;
|
|
||||||
import org.sleuthkit.datamodel.ReadContentInputStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract text from .txt files
|
|
||||||
*/
|
|
||||||
final class TextFileExtractor {
|
|
||||||
|
|
||||||
//Set a Minimum confidence value to reject matches that may not have a valid text encoding
|
|
||||||
//Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
|
|
||||||
//and pictures and other files with a .txt extension were showing up with a value of 5 or less in limited testing.
|
|
||||||
//This limited information was used to select the current value as one that would filter out clearly non-text
|
|
||||||
//files while hopefully working on all files with a valid text encoding
|
|
||||||
static final private int MIN_MATCH_CONFIDENCE = 20;
|
|
||||||
|
|
||||||
public Reader getReader(AbstractFile source) throws TextFileExtractorException {
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
|
||||||
//wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
|
|
||||||
InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
|
|
||||||
try {
|
|
||||||
detector.setText(stream);
|
|
||||||
} catch (IOException ex) {
|
|
||||||
throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
|
|
||||||
}
|
|
||||||
CharsetMatch match = detector.detect();
|
|
||||||
if (match == null) {
|
|
||||||
throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
|
|
||||||
} else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
|
|
||||||
throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
|
|
||||||
}
|
|
||||||
|
|
||||||
return match.getReader();
|
|
||||||
}
|
|
||||||
|
|
||||||
public class TextFileExtractorException extends Exception {
|
|
||||||
public TextFileExtractorException(String msg, Throwable ex) {
|
|
||||||
super(msg, ex);
|
|
||||||
}
|
|
||||||
public TextFileExtractorException(String msg) {
|
|
||||||
super(msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user