external tsv parsing library

This commit is contained in:
Greg DiCristofaro 2021-01-15 15:54:07 -05:00
parent 02e0959eb7
commit 098911bb1e
3 changed files with 141 additions and 190 deletions

View File

@ -28,6 +28,8 @@
<dependency conf="core->default" org="org.jsoup" name="jsoup" rev="1.10.3"/>
<dependency conf="core->default" org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7"/>
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-csv" rev="2.9.7"/>
<dependency conf="core->default" org="com.drewnoakes" name="metadata-extractor" rev="2.11.0"/>
<dependency conf="core->default" org="com.google.cloud" name="google-cloud-translate" rev="1.70.0"/>

View File

@ -393,6 +393,10 @@
<runtime-relative-path>ext/jackson-databind-2.9.7.jar</runtime-relative-path>
<binary-origin>release\modules\ext\jackson-databind-2.9.7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/jackson-dataformat-csv-2.9.7.jar</runtime-relative-path>
<binary-origin>release\modules\ext\jackson-dataformat-csv-2.9.7.jar</binary-origin>
</class-path-extension>
<class-path-extension>
<runtime-relative-path>ext/okhttp-2.7.5.jar</runtime-relative-path>
<binary-origin>release\modules\ext\okhttp-2.7.5.jar</binary-origin>

View File

@ -18,28 +18,27 @@
*/
package org.sleuthkit.autopsy.modules.leappanalyzers;
import java.io.BufferedReader;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import static java.util.Locale.US;
import java.util.Map;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
@ -57,7 +56,6 @@ import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;
@ -76,30 +74,30 @@ public final class LeappFileProcessor {
*/
private static class TsvColumn {
private final String attributeName;
private final BlackboardAttribute.Type attributeType;
private final String columnName;
private final boolean required;
/**
* Main constructor.
*
* @param attributeName The BlackboardAttribute name or null if not
* used.
* @param attributeType The BlackboardAttribute type or null if not
used.
* @param columnName The name of the column in the tsv file.
* @param required Whether or not this attribute is required to be
* present.
*/
TsvColumn(String attributeName, String columnName, boolean required) {
this.attributeName = attributeName;
TsvColumn(BlackboardAttribute.Type attributeType, String columnName, boolean required) {
this.attributeType = attributeType;
this.columnName = columnName;
this.required = required;
}
/**
* @return The BlackboardAttribute name or null if not used.
* @return The BlackboardAttribute type or null if not used.
*/
String getAttributeName() {
return attributeName;
BlackboardAttribute.Type getAttributeType() {
return attributeType;
}
/**
@ -123,7 +121,7 @@ public final class LeappFileProcessor {
private final String xmlFile; //NON-NLS
private final Map<String, String> tsvFiles;
private final Map<String, String> tsvFileArtifacts;
private final Map<String, BlackboardArtifact.Type> tsvFileArtifacts;
private final Map<String, String> tsvFileArtifactComments;
private final Map<String, List<TsvColumn>> tsvFileAttributes;
@ -220,17 +218,15 @@ public final class LeappFileProcessor {
String fileName = FilenameUtils.getName(LeappFileName);
File LeappFile = new File(LeappFileName);
if (tsvFileAttributes.containsKey(fileName)) {
List<TsvColumn> attrList = tsvFileAttributes.get(fileName);
BlackboardArtifact.Type artifactType = null;
try {
BlackboardArtifact.Type artifactType = Case.getCurrentCase().getSleuthkitCase().getArtifactType(tsvFileArtifacts.get(fileName));
List<TsvColumn> attrList = tsvFileAttributes.get(fileName);
artifactType = tsvFileArtifacts.get(fileName);
processFile(LeappFile, attrList, fileName, artifactType, bbartifacts, LeappImageFile);
} catch (TskCoreException ex) {
throw new IngestModuleException(String.format("Error getting Blackboard Artifact Type for %s", tsvFileArtifacts.get(fileName)), ex);
throw new IngestModuleException(String.format("Error getting Blackboard Artifact Type for %s", artifactType == null ? "<null>" : artifactType.toString()), ex);
}
}
}
if (!bbartifacts.isEmpty()) {
@ -256,16 +252,7 @@ public final class LeappFileProcessor {
File LeappFile = new File(LeappFileName);
if (tsvFileAttributes.containsKey(fileName)) {
List<TsvColumn> attrList = tsvFileAttributes.get(fileName);
BlackboardArtifact.Type artifactType = null;
try {
artifactType = Case.getCurrentCase().getSleuthkitCase().getArtifactType(tsvFileArtifacts.get(fileName));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, String.format("Error getting Blackboard Artifact Type for %s", tsvFileArtifacts.get(fileName)), ex);
}
if (artifactType == null) {
continue;
}
BlackboardArtifact.Type artifactType = tsvFileArtifacts.get(fileName);
try {
processFile(LeappFile, attrList, fileName, artifactType, bbartifacts, dataSource);
@ -294,191 +281,149 @@ public final class LeappFileProcessor {
return;
}
try (BufferedReader reader = new BufferedReader(new FileReader(LeappFile))) {
String header = reader.readLine();
// Check first line, if it is null then no heading so nothing to match to, close and go to next file.
if (header != null) {
Map<Integer, String> columnNumberToProcess = findColumnsToProcess(fileName, header, attrList);
String line = reader.readLine();
while (line != null) {
Collection<BlackboardAttribute> bbattributes = processReadLine(line, columnNumberToProcess, fileName);
try (MappingIterator<Map<String, String>> iterator = new CsvMapper()
.readerFor(Map.class)
.with(CsvSchema.emptySchema().withHeader().withColumnSeparator('\t'))
.readValues(LeappFile)) {
if (!bbattributes.isEmpty() && !blkBoard.artifactExists(dataSource, BlackboardArtifact.ARTIFACT_TYPE.fromID(artifactType.getTypeID()), bbattributes)) {
BlackboardArtifact bbartifact = createArtifactWithAttributes(artifactType.getTypeID(), dataSource, bbattributes);
if (bbartifact != null) {
bbartifacts.add(bbartifact);
}
int lineNum = 1;
while (iterator.hasNext()) {
Map<String, String> keyVals = iterator.next();
Collection<BlackboardAttribute> bbattributes = processReadLine(keyVals, attrList, fileName, lineNum++);
if (!bbattributes.isEmpty() && !blkBoard.artifactExists(dataSource, BlackboardArtifact.ARTIFACT_TYPE.fromID(artifactType.getTypeID()), bbattributes)) {
BlackboardArtifact bbartifact = createArtifactWithAttributes(artifactType.getTypeID(), dataSource, bbattributes);
if (bbartifact != null) {
bbartifacts.add(bbartifact);
}
line = reader.readLine();
}
}
}
}
/**
* Process the line read and create the necessary attributes for it
* Process the line read and create the necessary attributes for it.
*
* @param line a tsv line to process that was read
* @param columnNumberToProcess Which columns to process in the tsv line
@param fileName name of file being processed
*
* @return
* @param lineKeyValues A mapping of column names to values for the line.
* @param attrList The list of attributes as specified for the schema of
* this file.
* @param fileName The name of the file being processed.
* @param lineNum The line number in the file.
* @return The collection of blackboard attributes for the artifact created
* from this line.
* @throws IngestModuleException
*/
private Collection<BlackboardAttribute> processReadLine(String line, Map<Integer, String> columnNumberToProcess, String fileName) throws IngestModuleException {
if (MapUtils.isEmpty(columnNumberToProcess)) {
private Collection<BlackboardAttribute> processReadLine(Map<String, String> lineKeyValues, List<TsvColumn> attrList, String fileName, int lineNum) throws IngestModuleException {
if (MapUtils.isEmpty(lineKeyValues)) {
return Collections.emptyList();
} else if (line == null) {
} else if (lineKeyValues == null) {
logger.log(Level.WARNING, "Line is null. Returning empty list for attributes.");
return Collections.emptyList();
}
String[] columnValues;
List<BlackboardAttribute> attrsToRet = new ArrayList<>();
for (TsvColumn colAttr : attrList) {
if (colAttr.getAttributeType() == null) {
continue;
}
// Check to see if the 2 values are equal, they may not be equal if there is no corresponding data in the line.
// or if the size of the line to split is not equal to the column numbers we are looking to process. This
// can happen when the last value of the tsv line has no data in it.
// If this happens then adding an empty value(s) for each columnValue where data does not exist
Integer maxColumnNumber = Collections.max(columnNumberToProcess.keySet());
if ((maxColumnNumber > line.split("\\t").length) || (columnNumberToProcess.size() > line.split("\\t").length)) {
columnValues = Arrays.copyOf(line.split("\\t"), maxColumnNumber + 1);
} else {
columnValues = line.split("\\t");
}
// TODO error handling
String value = lineKeyValues.get(colAttr.getColumnName());
if (value == null) {
logger.log(Level.WARNING, String.format("No value found for column %s at line %d in file %s.", colAttr.getColumnName(), lineNum, fileName));
continue;
}
Collection<BlackboardAttribute> bbattributes = new ArrayList<BlackboardAttribute>();
for (Map.Entry<Integer, String> columnToProcess : columnNumberToProcess.entrySet()) {
Integer columnNumber = columnToProcess.getKey();
String attributeName = columnToProcess.getValue();
if (columnValues[columnNumber] != null) {
try {
BlackboardAttribute.Type attributeType = Case.getCurrentCase().getSleuthkitCase().getAttributeType(attributeName.toUpperCase());
if (attributeType == null) {
continue;
}
String attrType = attributeType.getValueType().getLabel().toUpperCase();
checkAttributeType(bbattributes, attrType, columnValues, columnNumber, attributeType, fileName);
} catch (TskCoreException ex) {
throw new IngestModuleException(String.format("Error getting Attribute type for Attribute Name %s", attributeName), ex); //NON-NLS
}
BlackboardAttribute attr = (value == null) ? null : getAttribute(colAttr.getAttributeType(), value, fileName);
if (value != null) {
attrsToRet.add(attr);
}
}
if (tsvFileArtifactComments.containsKey(fileName)) {
bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_COMMENT, MODULE_NAME, tsvFileArtifactComments.get(fileName)));
}
return bbattributes;
}
private void checkAttributeType(Collection<BlackboardAttribute> bbattributes, String attrType, String[] columnValues, int columnNumber, BlackboardAttribute.Type attributeType,
String fileName) {
if (columnValues == null || columnNumber < 0 || columnNumber > columnValues.length || columnValues[columnNumber] == null) {
logger.log(Level.WARNING, String.format("Unable to determine column value at index %d in columnValues: %s",
columnNumber,
columnValues == null ? "<null>" : "[" + String.join(", ", columnValues) + "]"));
return;
}
String columnValue = columnValues[columnNumber];
if (attrType.matches("STRING")) {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, columnValue));
} else if (attrType.matches("INTEGER")) {
try {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, Integer.valueOf(columnValue)));
} catch (NumberFormatException ex) {
logger.log(Level.WARNING, String.format("Unable to format %s as an integer.", columnValue), ex);
}
} else if (attrType.matches("LONG")) {
try {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, Long.valueOf(columnValue)));
} catch (NumberFormatException ex) {
logger.log(Level.WARNING, String.format("Unable to format %s as an long.", columnValue), ex);
}
} else if (attrType.matches("DOUBLE")) {
try {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, Double.valueOf(columnValue)));
} catch (NumberFormatException ex) {
logger.log(Level.WARNING, String.format("Unable to format %s as an double.", columnValue), ex);
}
} else if (attrType.matches("BYTE")) {
try {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, Byte.valueOf(columnValue)));
} catch (NumberFormatException ex) {
logger.log(Level.WARNING, String.format("Unable to format %s as an byte.", columnValue), ex);
}
} else if (attrType.matches("DATETIME")) {
// format of data should be the same in all the data and the format is 2020-03-28 01:00:17
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-d HH:mm:ss", US);
Long dateLong = Long.valueOf(0);
try {
Date newDate = dateFormat.parse(columnValue);
dateLong = newDate.getTime() / 1000;
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, dateLong));
} catch (ParseException ex) {
// catching error and displaying date that could not be parsed
// we set the timestamp to 0 and continue on processing
logger.log(Level.WARNING, String.format("Failed to parse date/time %s for attribute type %s in file %s.", columnValue, attributeType.getDisplayName(), fileName)); //NON-NLS
}
} else if (attrType.matches("JSON")) {
bbattributes.add(new BlackboardAttribute(attributeType, MODULE_NAME, columnValue));
} else {
// Log this and continue on with processing
logger.log(Level.WARNING, String.format("Attribute Type %s not defined.", attrType)); //NON-NLS
}
return attrsToRet;
}
/**
* Process the first line of the tsv file which has the headings. Match the
* headings to the columns in the XML mapping file so we know which columns
* to process.
*
* @param fileName The name of the file in which these column headers exist.
* @param line a tsv heading line of the columns in the file
* @param attrList the list of headings we want to process
*
* @return the numbered column(s) and attribute(s) we want to use for the
* column(s)
* The format of time stamps in tsv.
*/
private Map<Integer, String> findColumnsToProcess(String fileName, String line, List<TsvColumn> attrList) {
String[] columnNames = line.split("\\t");
HashMap<Integer, String> columnsToProcess = new HashMap<>();
private static final DateFormat TIMESTAMP_FORMAT = new SimpleDateFormat("yyyy-MM-d HH:mm:ss", US);
Integer columnPosition = 0;
for (String columnName : columnNames) {
// for some reason the first column of the line has unprintable characters so removing them
String cleanColumnName = columnName.trim().replaceAll("[^\\n\\r\\t\\p{Print}]", "");
for (TsvColumn tsvColumn : attrList) {
if (cleanColumnName.equalsIgnoreCase(tsvColumn.getColumnName())) {
columnsToProcess.put(columnPosition, tsvColumn.getAttributeName());
break;
}
}
columnPosition++;
/**
* Gets an appropriate attribute based on the attribute type and string
* value.
*
* @param attrType The attribute type.
* @param value The string value to be converted to the appropriate data
* type for the attribute type.
* @param fileName The file name that the value comes from.
* @return The generated blackboard attribute.
*/
private BlackboardAttribute getAttribute(BlackboardAttribute.Type attrType, String value, String fileName) {
if (attrType == null || value == null) {
logger.log(Level.WARNING, String.format("Unable to parse attribute type %s for value '%s' in fileName %s",
attrType == null ? "<null>" : attrType.toString(),
value == null ? "<null>" : value,
fileName == null ? "<null>" : fileName));
return null;
}
if (columnsToProcess.size() != attrList.size()) {
String missingColumns = IntStream.range(0, attrList.size())
.filter((idx) -> !columnsToProcess.containsKey(attrList.get(idx).getAttributeName()))
.mapToObj((idx) -> String.format("'%s'", attrList.get(idx).getColumnName() == null ? "<null>" : attrList.get(idx).getColumnName()))
.collect(Collectors.joining(", "));
logger.log(Level.WARNING, String.format("Columns size expected not found in file %s based on xml from %s. Column Keys Missing = [%s]; Header Line = '%s'.",
this.xmlFile == null ? "<null>" : this.xmlFile,
fileName,
missingColumns,
line));
String trimmed = value.trim();
switch (attrType.getValueType()) {
case JSON:
case STRING:
return parseAttrValue(trimmed, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, v));
case INTEGER:
return parseAttrValue(trimmed, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, (int) Double.valueOf(v).intValue()));
case LONG:
return parseAttrValue(trimmed, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, (long) Double.valueOf(v).longValue()));
case DOUBLE:
return parseAttrValue(trimmed, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, (double) Double.valueOf(v)));
case BYTE:
return parseAttrValue(trimmed, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, new byte[]{Byte.valueOf(v)}));
case DATETIME:
return parseAttrValue(value, attrType, fileName, (v) -> new BlackboardAttribute(attrType, MODULE_NAME, TIMESTAMP_FORMAT.parse(v).getTime() / 1000));
default:
// Log this and continue on with processing
logger.log(Level.WARNING, String.format("Attribute Type %s for file %s not defined.", attrType, fileName)); //NON-NLS
return null;
}
}
return columnsToProcess;
/**
 * Converts a raw string value into a blackboard attribute. Implementations
 * are supplied as lambdas and may fail with the parsing exceptions declared
 * on {@link #apply(String)}.
 */
@FunctionalInterface
private interface ParseExceptionFunction {

    /**
     * Converts a string value to a blackboard attribute.
     *
     * @param orig The original string value.
     * @return The generated blackboard attribute.
     * @throws ParseException If the value cannot be parsed (e.g. as a
     * timestamp).
     * @throws NumberFormatException If the value cannot be parsed as a
     * number.
     */
    BlackboardAttribute apply(String orig) throws ParseException, NumberFormatException;
}
/**
 * Applies the given converter to a string value in order to build a
 * blackboard attribute, logging a warning instead of propagating the
 * failure when the value cannot be parsed.
 *
 * @param value The string value to convert.
 * @param attrType The blackboard attribute type; its value type label is
 * used in the warning message.
 * @param fileName The name of the file from which the value comes; used in
 * the warning message.
 * @param valueConverter The conversion from the string value to a
 * blackboard attribute.
 * @return The attribute produced by the converter, or null if the converter
 * threw a NumberFormatException or ParseException.
 */
private BlackboardAttribute parseAttrValue(String value, BlackboardAttribute.Type attrType, String fileName, ParseExceptionFunction valueConverter) {
    BlackboardAttribute converted = null;
    try {
        converted = valueConverter.apply(value);
    } catch (NumberFormatException | ParseException ex) {
        // The converted attribute stays null; the caller decides what to do with it.
        String message = String.format("Unable to format '%s' as value type %s while converting to attributes from %s.",
                value, attrType.getValueType().getLabel(), fileName);
        logger.log(Level.WARNING, message, ex);
    }
    return converted;
}
@NbBundle.Messages({
@ -546,10 +491,10 @@ public final class LeappFileProcessor {
if (foundArtifactType == null) {
logger.log(Level.SEVERE, String.format("No known artifact mapping found for [artifact: %s, %s]",
artifactName, getXmlFileIdentifier(parentName)));
} else {
tsvFileArtifacts.put(parentName, foundArtifactType);
}
tsvFileArtifacts.put(parentName, artifactName);
if (!comment.toLowerCase().matches("null")) {
tsvFileArtifactComments.put(parentName, comment);
}
@ -606,7 +551,7 @@ public final class LeappFileProcessor {
}
TsvColumn thisCol = new TsvColumn(
attributeName.toLowerCase(),
foundAttrType,
columnName.toLowerCase(),
"yes".compareToIgnoreCase(required) == 0);