Unverified Commit 488759d2 authored by LSinev, committed by GitHub

Apply some best practices and guideline recommendations to code (#1363)

* Raise an Exception instance, not a string

Additional info https://peps.python.org/pep-0352/#exception-hierarchy-changes
https://docs.python.org/3.8/tutorial/errors.html#raising-exceptions
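A minimal sketch of the pattern (the function and values are hypothetical, not taken from this diff):

```python
def check_answer_type(answer_type: str) -> None:
    if answer_type not in ("bool", "text"):
        # Raise an Exception instance; raising a plain string is a TypeError
        # in Python 3 (string exceptions were removed, see PEP 352).
        raise ValueError(f"Unsupported answer type: {answer_type}")
```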

* Apply PEP8 recommendation to prefer isinstance

"Object type comparisons should always use isinstance() instead of comparing types directly"
https://peps.python.org/pep-0008/
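An illustrative sketch of the change (the values are made up):

```python
answer = ["yes", "no"]

# Before: `type(answer) == list` compares types directly and misses subclasses.
# After: isinstance() also accepts subclasses and is the form PEP 8 recommends.
if isinstance(answer, list):
    answer = ", ".join(answer)
```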

* Remove dangerous default mutable values in arguments

https://pylint.readthedocs.io/en/stable/user_guide/messages/warning/dangerous-default-value.html
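A small sketch of the problem and the usual fix (function names are hypothetical):

```python
# Dangerous: the default list is created once at definition time and is
# shared (and mutated) across every call that omits the argument.
def collect_tasks(task, tasks=[]):
    tasks.append(task)
    return tasks

# Safer: use None as the sentinel and create a fresh list per call.
def collect_tasks_fixed(task, tasks=None):
    if tasks is None:
        tasks = []
    tasks.append(task)
    return tasks
```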

* Format logging messages with f-strings (not with `str.format()`)

Additional info
https://pylint.readthedocs.io/en/stable/user_guide/messages/warning/logging-format-interpolation.html
There are also discussions about the performance of string formatting in logging calls and about unintended code execution:
https://github.com/pylint-dev/pylint/issues/2395
https://stackoverflow.com/a/54368109
but at least a single formatting style (f-strings) is now used consistently throughout the project.
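A sketch of the chosen style (the logger name matches the diff below; the message is illustrative):

```python
import logging

eval_logger = logging.getLogger(__name__)
task = "xwinograd_en"

# Before: eval_logger.info("Processing {}".format(task))
# After: one consistent style across the project, f-strings.
eval_logger.info(f"Processing {task}")
```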

* Specify utf-8 encoding for `open` explicitly

If not specified explicitly, the default encoding may differ across environments, operating systems, and Python versions. See
https://peps.python.org/pep-0597/
https://docs.python.org/3.11/library/locale.html#locale.getencoding
https://docs.python.org/3.10/library/os.html#utf8-mode
https://pylint.readthedocs.io/en/stable/user_guide/messages/warning/unspecified-encoding.html

This also helps when code from English-language tasks is reused as a starting point for tasks in non-English languages.
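A short sketch of the pattern applied throughout the diff (the file names are illustrative):

```python
import json

# An explicit encoding keeps reads and writes independent of the platform's
# locale default (e.g. cp1252 on some Windows setups).
with open("results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

with open("results_copy.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)
```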

* Use inline ignore comments to pass pre-commit checks instead of identity workarounds

https://flake8.pycqa.org/en/3.0.1/user/ignoring-errors.html#in-line-ignoring-errors
https://www.flake8rules.com/rules/F841.html

flake8 comments are supported by ruff: https://docs.astral.sh/ruff/linter/#error-suppression
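A hypothetical example of an inline ignore for F841 (names and logic are illustrative):

```python
def read_config(path):
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except OSError as err:  # noqa: F841
        # The bound exception name is intentionally unused; the inline comment
        # silences F841 for flake8 (and ruff) instead of a no-op workaround.
        return None
```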
parent 97a67d27
@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
obs_list["abstract"].append(abstract)
obs_list["question"].append(question)
obs_list["answer_type"].append(answer_type)
-if type(answer) == list:
+if isinstance(answer, list):
answer = ", ".join(answer)
obs_list["answer"].append(answer)
@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
@@ -23,7 +23,7 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
-with open(args.benchmark_path) as file:
+with open(args.benchmark_path, encoding="utf-8") as file:
TASK_LIST = yaml.full_load(file)
for task in tqdm(TASK_LIST):
eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
file_save_path = os.path.join(file_path, full_file_name)
eval_logger.info(f"Save to {file_save_path}")
-with open(file_save_path, "w") as yaml_file:
+with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(config_dict, yaml_file)
@@ -119,7 +119,7 @@ class Buckets:
def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-pile_statistics = json.load(open("pile_statistics.json", "r"))
+pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
pile_document_count = pile_statistics["Document Count"]
start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
info_dict = {"title": "dataset ngrams", "ngram_size": 13}
info_dict_path = os.path.join(args.working_directory, "info.json")
-json.dump(info_dict, open(info_dict_path, "w"))
+json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
@@ -79,7 +79,7 @@ if __name__ == "__main__":
stats_file_path = "pile_statistics.json"
if os.path.exists(stats_file_path):
-stats = json.load(open(stats_file_path, "r"))
+stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
else:
document_count, total_document_size_chars, start_offsets = get_stats()
stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
"Total Pile Characters": total_document_size_chars,
"File Start Offsets": start_offsets,
}
-json.dump(stats, open(stats_file_path, "w"), indent=4)
+json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
print(f"document_count: {stats['Document Count']}")
print(f"total_chars: {stats['Total Pile Characters']}")
@@ -61,14 +61,14 @@ if __name__ == "__main__":
if not filenames:
continue
path_readme = os.path.join(dirpath, "README.md")
with open(path_readme, "w") as f:
with open(path_readme, "w", encoding="utf-8") as f:
# get path name, only last folder
path_name = dirpath.split("/")[-1]
f.write(f"# {path_name} \n\n")
for filename in sorted([f for f in filenames if f.endswith(".json")]):
path = os.path.join(dirpath, filename)
with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
result_dict = json.load(f)
with open(path_readme, "a") as f:
with open(path_readme, "a", encoding="utf-8") as f:
f.write(f"## {filename} \n")
f.write(f"{make_table(result_dict)} \n")
@@ -50,5 +50,5 @@ if __name__ == "__main__":
values.append(v)
writer.value_matrix = values
table = writer.dumps()
-with open(args.output, "w") as f:
+with open(args.output, "w", encoding="utf-8") as f:
f.write(table)
@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
ret = os.system(command)
-results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+results[model] = (
+json.load(open(output_path, encoding="utf-8"))
+if ret == 0
+else {"results": {}}
+)
end_time = time.time()
@@ -53,7 +53,7 @@ def main():
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
-if type(task) == tuple:
+if isinstance(task, tuple):
group_name, task = task
rnd = random.Random()
rnd.seed(args.seed)
@@ -69,18 +69,20 @@ def main():
model_args = re.sub(
"/|=",
"__",
-json.load(open(Path(args.data_path, model, "results.json")))["config"][
-"model_args"
-],
+json.load(
+open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+)["config"]["model_args"],
)
with open(
-Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+"r",
+encoding="utf-8",
) as file:
data = json.loads(file.read())
-configs = json.load(open(Path(args.data_path, model, "results.json")))[
-"configs"
-]
+configs = json.load(
+open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+)["configs"]
config = configs[task]
if model_index == 0: # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
-config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+config = (
+json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+)
return list(config[0].keys())