working on translation dictionary implementation

This commit is contained in:
Greg DiCristofaro 2021-01-05 10:18:59 -05:00
parent d84da21363
commit 48cdcdd602
13 changed files with 89 additions and 26 deletions

View File

@ -4,7 +4,7 @@ from typing import List, Iterable, Tuple
import csv import csv
import os import os
from fileutil import OMITTED_ADDITION, get_filename_addition, DELETED_ADDITION, FOUND_ADDITION from fileutil import get_filename_addition
from outputresult import OutputResult from outputresult import OutputResult
@ -54,6 +54,16 @@ def csv_to_records(input_path: str, header_row: bool) -> Tuple[List[List[str]],
return results, header return results, header
# For use with creating csv filenames for entries that have been omitted.
OMITTED_ADDITION = '-omitted'
# For use with creating csv filenames for entries that have been deleted.
DELETED_ADDITION = '-deleted'
# For use with creating csv filenames for entries where a translation was found.
FOUND_ADDITION = '-found'
def write_results_to_csv(results: OutputResult, output_path: str): def write_results_to_csv(results: OutputResult, output_path: str):
""" """
Writes the result of processing to the output path as a csv file. If omitted values are present, for output_path of Writes the result of processing to the output path as a csv file. If omitted values are present, for output_path of

View File

@ -5,12 +5,14 @@ As a consequence, it also requires git >= 1.7.0 and python >= 3.4.
import sys import sys
from envutil import get_proj_dir from envutil import get_proj_dir
from excelutil import write_results_to_xlsx from excelutil import write_results_to_xlsx
from gitutil import get_property_files_diff, get_git_root, get_commit_id from gitutil import get_property_files_diff, get_git_root, get_commit_id, get_tree
from itemchange import convert_to_output from itemchange import convert_to_output
from csvutil import write_results_to_csv from csvutil import write_results_to_csv
import argparse import argparse
from langpropsutil import get_commit_for_language, LANG_FILENAME from langpropsutil import get_commit_for_language, LANG_FILENAME
from outputtype import OutputType from outputtype import OutputType
from languagedictutil import extract_translations
from propsutil import get_lang_bundle_name, DEFAULT_PROPS_FILENAME
def main(): def main():
@ -41,7 +43,7 @@ def main():
help='Specify the path to the properties file containing key value pairs of language mapped to ' help='Specify the path to the properties file containing key value pairs of language mapped to '
'the commit of when bundles for that language were most recently updated.') 'the commit of when bundles for that language were most recently updated.')
parser.add_argument('-t', '--translation-dict', dest='translation_dict', type=bool, required=False, default=False, parser.add_argument('-td', '--translation-dict', dest='translation_dict', type=bool, required=False, default=False,
help='If this flag is specified, a dictionary mapping original prop key values to translated ' help='If this flag is specified, a dictionary mapping original prop key values to translated '
'values. If this flag is specified, it will ') 'values. If this flag is specified, it will ')
@ -55,7 +57,7 @@ def main():
output_type = args.output_type output_type = args.output_type
show_translated_col = not args.no_translated_col show_translated_col = not args.no_translated_col
language_updates_file = args.language_file language_updates_file = args.language_file
use_translation_dict = args.translation_dict
lang = args.language lang = args.language
if lang is not None: if lang is not None:
commit_1_id = get_commit_for_language(lang, language_updates_file) commit_1_id = get_commit_for_language(lang, language_updates_file)
@ -66,6 +68,13 @@ def main():
parser.print_help(sys.stderr) parser.print_help(sys.stderr)
sys.exit(1) sys.exit(1)
translation_dict = None
if use_translation_dict and lang:
translation_dict = extract_translations(
file_iter=get_tree(repo_path, commit_1_id),
orig_filename=DEFAULT_PROPS_FILENAME,
translated_filename=get_lang_bundle_name(lang))
commit_2_id = args.commit_2_id commit_2_id = args.commit_2_id
show_commits = not args.no_commits show_commits = not args.no_commits
@ -73,6 +82,7 @@ def main():
processing_result = convert_to_output(changes, processing_result = convert_to_output(changes,
commit1_id=get_commit_id(repo_path, commit_1_id) if show_commits else None, commit1_id=get_commit_id(repo_path, commit_1_id) if show_commits else None,
commit2_id=get_commit_id(repo_path, commit_2_id) if show_commits else None, commit2_id=get_commit_id(repo_path, commit_2_id) if show_commits else None,
translation_dict=translation_dict,
show_translated_col=show_translated_col, show_translated_col=show_translated_col,
separate_deleted=True) separate_deleted=True)

View File

@ -8,6 +8,7 @@ from typing import Union
def get_proj_dir(path: Union[pathlib.PurePath, str] = __file__) -> str: def get_proj_dir(path: Union[pathlib.PurePath, str] = __file__) -> str:
""" """
Gets parent directory of this file (and subsequently, the project). Gets parent directory of this file (and subsequently, the project).
Args: Args:
path: Can be overridden to provide a different file. This will return the parent of that file in that instance. path: Can be overridden to provide a different file. This will return the parent of that file in that instance.

View File

@ -6,6 +6,7 @@ from pathlib import Path
def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, None]]: def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, None]]:
"""Retrieves path pieces. This is a naive approach as it determines if a file is present based on the """Retrieves path pieces. This is a naive approach as it determines if a file is present based on the
presence of an extension. presence of an extension.
Args: Args:
orig_path: The original path to deconstruct. orig_path: The original path to deconstruct.
@ -27,6 +28,7 @@ def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, N
def get_joined_path(folder: str, file_name: str) -> str: def get_joined_path(folder: str, file_name: str) -> str:
""" """
Gets a joined folder and filename. Gets a joined folder and filename.
Args: Args:
folder: The folder. folder: The folder.
file_name: The filename. file_name: The filename.
@ -53,16 +55,6 @@ def get_new_path(orig_path: str, new_filename: str) -> str:
return str(Path(parent_dir) / Path(new_filename)) return str(Path(parent_dir) / Path(new_filename))
# For use with creating csv filenames for entries that have been omitted.
OMITTED_ADDITION = '-omitted'
# For use with creating csv filenames for entries that have been deleted.
DELETED_ADDITION = '-deleted'
# For translations where
FOUND_ADDITION = '-found'
def get_filename_addition(orig_path: str, filename_addition: str) -> str: def get_filename_addition(orig_path: str, filename_addition: str) -> str:
"""Gets filename with addition. So if item is '/path/name.ext' and the filename_addition is '-add', the new result """Gets filename with addition. So if item is '/path/name.ext' and the filename_addition is '-add', the new result
would be '/path/name-add.ext'. would be '/path/name-add.ext'.

View File

@ -1,7 +1,7 @@
"""Functions relating to using git and GitPython with an existing repo. """Functions relating to using git and GitPython with an existing repo.
""" """
from git import Repo, Diff, Blob from git import Repo, Diff, Blob, Tree
from typing import List, Union, Iterator, Tuple, Any from typing import List, Union, Iterator, Tuple, Any
from itemchange import ItemChange, get_changed from itemchange import ItemChange, get_changed
from pathlib import Path from pathlib import Path
@ -17,6 +17,7 @@ def get_git_root(child_path: str) -> str:
""" """
Taken from https://stackoverflow.com/questions/22081209/find-the-root-of-the-git-repository-where-the-file-lives, Taken from https://stackoverflow.com/questions/22081209/find-the-root-of-the-git-repository-where-the-file-lives,
this obtains the root path of the git repo in which this file exists. this obtains the root path of the git repo in which this file exists.
Args: Args:
child_path: The path of a child within the repo. child_path: The path of a child within the repo.
@ -146,6 +147,21 @@ def list_paths(root_tree, path: Path = Path('.')) -> Iterator[Tuple[str, Blob]]:
yield from list_paths(tree, path / tree.name) yield from list_paths(tree, path / tree.name)
def get_tree(repo_path: str, commit_id: str) -> Tree:
    """
    Looks up a commit in a git repository and hands back that commit's tree, which can then be walked for files
    and file content.

    Args:
        repo_path: The path to the repo or a child directory of the repo.
        commit_id: The commit id. Leading and trailing whitespace is stripped before the lookup.

    Returns: The tree of the specified commit.
    """
    # search_parent_directories allows repo_path to be any directory beneath the repo root.
    resolved_commit = Repo(repo_path, search_parent_directories=True).commit(commit_id.strip())
    # NOTE(review): callers that need (path, blob) pairs presumably have to pass this tree through list_paths
    # rather than iterating it directly -- confirm against GitPython's Tree iteration behavior.
    return resolved_commit.tree
def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD', def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD',
property_file_extension: str = DEFAULT_PROPS_EXTENSION) -> Iterator[PropEntry]: property_file_extension: str = DEFAULT_PROPS_EXTENSION) -> Iterator[PropEntry]:
""" """
@ -157,11 +173,8 @@ def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD',
property_file_extension: The extension to use for scanning for property files. property_file_extension: The extension to use for scanning for property files.
Returns: An iterator of PropEntry objects. Returns: An iterator of PropEntry objects.
""" """
repo = Repo(repo_path, search_parent_directories=True) for item in get_tree(repo_path, at_commit):
commit = repo.commit(at_commit.strip())
for item in list_paths(commit.tree):
path, blob = item path, blob = item
if path.endswith(property_file_extension): if path.endswith(property_file_extension):
for key, val in get_entry_dict(get_text(blob)).items(): for key, val in get_entry_dict(get_text(blob)).items():

View File

@ -58,6 +58,7 @@ class ItemChange:
def get_row(self, show_translated_col: bool) -> List[str]: def get_row(self, show_translated_col: bool) -> List[str]:
"""Returns the list of values to be entered as a row in csv serialization. """Returns the list of values to be entered as a row in csv serialization.
Args: Args:
show_translated_col (bool): Whether or not the translated columns are showing; otherwise use default. show_translated_col (bool): Whether or not the translated columns are showing; otherwise use default.
@ -92,6 +93,7 @@ def convert_to_output(items: Iterator[ItemChange],
separate_deleted: bool = True) -> OutputResult: separate_deleted: bool = True) -> OutputResult:
""" """
Converts PropEntry objects to an output result to be written to a tabular datasource. Converts PropEntry objects to an output result to be written to a tabular datasource.
Args: Args:
items: The PropEntry items. items: The PropEntry items.
commit1_id: The first commit id to be shown in the header or None. commit1_id: The first commit id to be shown in the header or None.

View File

@ -23,6 +23,7 @@ def _get_props_path(language_updates_file: Union[str, None]):
def get_commit_for_language(language: str, language_updates_file: Union[str, None] = None) -> Union[str, None]: def get_commit_for_language(language: str, language_updates_file: Union[str, None] = None) -> Union[str, None]:
""" """
Retrieves the latest commit for a particular language. Retrieves the latest commit for a particular language.
Args: Args:
language: The language key. language: The language key.
language_updates_file: The file containing the most recent updates. If not provided, the default file located language_updates_file: The file containing the most recent updates. If not provided, the default file located
@ -45,6 +46,7 @@ def get_commit_for_language(language: str, language_updates_file: Union[str, Non
def set_commit_for_language(language: str, latest_commit: str, language_updates_file: Union[str, None] = None): def set_commit_for_language(language: str, latest_commit: str, language_updates_file: Union[str, None] = None):
""" """
Sets the most recent update for a language within the language updates file. Sets the most recent update for a language within the language updates file.
Args: Args:
language: The language key. language: The language key.
latest_commit: The commit for how recent the language is. latest_commit: The commit for how recent the language is.

View File

@ -19,6 +19,7 @@ class FoundValue:
def __init__(self, common_path, original_file, translated_file, key, orig_val, translated_val): def __init__(self, common_path, original_file, translated_file, key, orig_val, translated_val):
""" """
Constructor. Constructor.
Args: Args:
common_path: The folder common to both files. common_path: The folder common to both files.
original_file: The original file path. original_file: The original file path.
@ -38,15 +39,22 @@ class FoundValue:
def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: str, translated_filename: str) \ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: str, translated_filename: str) \
-> Dict[str, FoundValue]: -> Dict[str, FoundValue]:
""" """
Creates a translations dictionary based on comparing the values of keys in an original bundles file and a translated
bundles file in the same directory. For instance, if /path/to/original.properties and
/path/to/translated.properties both exist and in both files, a key-value pairing for keyA exists, the dictionary
will contain an entry mapping the original value for keyA to the translated value and other metadata for that
key.
Args: Args:
file_iter: file_iter: An iterator of tuples containing the path and the content of the file.
orig_filename: orig_filename: The original file name (i.e. 'bundle.properties-MERGED').
translated_filename: translated_filename: The translated file name (i.e. 'Bundle_ja.properties').
Returns: Returns: A dictionary mapping original values to translated values.
""" """
# Create a dictionary mapping parent path to the file content for both original and translated files
original_files: Dict[str, Tuple[str, Blob]] = dict() original_files: Dict[str, Tuple[str, Blob]] = dict()
translated_files: Dict[str, Tuple[str, Blob]] = dict() translated_files: Dict[str, Tuple[str, Blob]] = dict()
@ -57,8 +65,8 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s
elif file_name.strip().lower() == translated_filename.strip().lower(): elif file_name.strip().lower() == translated_filename.strip().lower():
translated_files[file_name] = (parent_dir, content) translated_files[file_name] = (parent_dir, content)
# determine original and translated files with common parent folders and find common keys
to_ret: Dict[str, FoundValue] = dict() to_ret: Dict[str, FoundValue] = dict()
for common_folder, ((original_path, original_blob), (translated_path, translated_blob))\ for common_folder, ((original_path, original_blob), (translated_path, translated_blob))\
in common_entries(original_files, translated_files): in common_entries(original_files, translated_files):
orig_dict = sanitize_prop_dict_keys(get_entry_dict(original_blob)) orig_dict = sanitize_prop_dict_keys(get_entry_dict(original_blob))
@ -77,6 +85,15 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s
def sanitize_prop_dict_keys(dct: Dict[str, str]) -> Dict[str, str]:
    """
    Sanitizes all the keys in a dictionary (i.e. strips white space and makes lower case). Values are left
    untouched. If two keys become identical once sanitized, the entry appearing later in the dictionary wins.

    Args:
        dct: The dictionary.

    Returns: A new dictionary with sanitized keys; the dictionary passed in is not modified.
    """
    return {key.strip().lower(): value for key, value in dct.items()}
@ -89,6 +106,7 @@ def common_entries(*dcts: Dict[K, V]) -> Iterator[Tuple[K, Tuple[V, ...]]]:
Taken from https://stackoverflow.com/questions/16458340/python-equivalent-of-zip-for-dictionaries, Taken from https://stackoverflow.com/questions/16458340/python-equivalent-of-zip-for-dictionaries,
creates an iterator of tuples where the left value is the common key value and the right hand value is creates an iterator of tuples where the left value is the common key value and the right hand value is
a tuple of all the matching values in order that the dictionaries were ordered in parameters. a tuple of all the matching values in order that the dictionaries were ordered in parameters.
Args: Args:
*dcts: The dictionaries in order to provide common key/values. *dcts: The dictionaries in order to provide common key/values.

View File

@ -25,6 +25,7 @@ class OutputResult:
style: Union[List[ColumnStyle], None] = None, freeze_first_row: bool = True): style: Union[List[ColumnStyle], None] = None, freeze_first_row: bool = True):
""" """
Constructs a ProcessingResult. Constructs a ProcessingResult.
Args: Args:
results: Items to be written as results. Data will be written such that the item at row,cell will be results: Items to be written as results. Data will be written such that the item at row,cell will be
located within result at results[row][col]. located within result at results[row][col].

View File

@ -40,6 +40,7 @@ def convert_to_output(items: Iterator[PropEntry], commit_id: Union[str, None] =
show_translated_col: bool = True, value_regex: Union[str, None] = None) -> OutputResult: show_translated_col: bool = True, value_regex: Union[str, None] = None) -> OutputResult:
""" """
Converts PropEntry objects to an output result to be written to a tabular datasource. Converts PropEntry objects to an output result to be written to a tabular datasource.
Args: Args:
items: The PropEntry items. items: The PropEntry items.
commit_id: The commit id to be shown in the header or None. commit_id: The commit id to be shown in the header or None.

View File

@ -5,9 +5,11 @@ from jproperties import Properties
import os import os
# The default extension for property files in autopsy repo # The default extension for property files in autopsy repo
DEFAULT_PROPS_EXTENSION = 'properties-MERGED' DEFAULT_PROPS_EXTENSION = 'properties-MERGED'
# The default filename for property files in autopsy repo
DEFAULT_PROPS_FILENAME = 'Bundle.{ext}'.format(ext=DEFAULT_PROPS_EXTENSION)
def get_lang_bundle_name(language: str) -> str: def get_lang_bundle_name(language: str) -> str:
""" """
@ -43,6 +45,7 @@ def get_entry_dict_from_path(props_path: str) -> Union[Dict[str, str], None]:
""" """
Retrieves a dictionary mapping the properties represented in the string or None if no properties file can be found Retrieves a dictionary mapping the properties represented in the string or None if no properties file can be found
at that path. at that path.
Args: Args:
props_path: The path to the properties file. props_path: The path to the properties file.

View File

@ -32,6 +32,7 @@ def create_output_result(row_header: List[str], results: List[List[str]],
""" """
Creates OutputResult from components. Creates OutputResult from components.
Args: Args:
row_header: The row header. row_header: The row header.
results: The results. results: The results.
@ -47,4 +48,9 @@ def create_output_result(row_header: List[str], results: List[List[str]],
deleted_result = [row_header] + deleted if deleted else None deleted_result = [row_header] + deleted if deleted else None
found_result = [row_header] + found_translation if found_translation else None found_result = [row_header] + found_translation if found_translation else None
return OutputResult([row_header] + results, omitted_result, deleted_result, style) return OutputResult(
results=[row_header] + results,
omitted=omitted_result,
deleted=deleted_result,
found=found_result,
style=style)

View File

@ -195,6 +195,7 @@ class DataRows:
deleted_results: Union[List[List[str]], None] = None): deleted_results: Union[List[List[str]], None] = None):
""" """
Creates a DataRows object. Creates a DataRows object.
Args: Args:
results: The 2d list of strings representing cells. results: The 2d list of strings representing cells.
header: The header row if present. header: The header row if present.
@ -208,6 +209,7 @@ class DataRows:
def get_csv_rows(input_path: str, has_header: bool) -> DataRows: def get_csv_rows(input_path: str, has_header: bool) -> DataRows:
""" """
Gets rows of a csv file in a DataRows format. Gets rows of a csv file in a DataRows format.
Args: Args:
input_path: The input path of the file. input_path: The input path of the file.
has_header: Whether or not it has a header. has_header: Whether or not it has a header.
@ -222,6 +224,7 @@ def get_csv_rows(input_path: str, has_header: bool) -> DataRows:
def get_xlsx_rows(input_path: str, has_header: bool, results_sheet: str, deleted_sheet: str) -> DataRows: def get_xlsx_rows(input_path: str, has_header: bool, results_sheet: str, deleted_sheet: str) -> DataRows:
""" """
Gets worksheets of an excel workbook in a DataRows format. Gets worksheets of an excel workbook in a DataRows format.
Args: Args:
input_path: The input path of the file. input_path: The input path of the file.
has_header: Whether or not is has a header. has_header: Whether or not is has a header.
@ -250,6 +253,7 @@ def get_prop_entries_from_data(datarows: DataRows, path_idx: int, key_idx: int,
path_converter: Callable) -> List[PropEntry]: path_converter: Callable) -> List[PropEntry]:
""" """
Converts a DataRows object into PropEntry objects. Converts a DataRows object into PropEntry objects.
Args: Args:
datarows: The DataRows object. datarows: The DataRows object.
path_idx: The index of the column containing the path. path_idx: The index of the column containing the path.