Unverified Commit 5b5db28d authored by Riyasat Ohib's avatar Riyasat Ohib Committed by GitHub
Browse files

weigit status and checking for file modification and tracking (#1021)

* [Fix] Restructure for wgit availability as a package

* Preliminary implementation of wgit status

* [Feat] Addition of wgit status
1. Functionalities to check the status of the repo.
2. Checks whether a file has been modified, whether the changes have been added, and whether the added changes have been committed.

* [test] Addition of tests for weigit status
1. Some minor refactors and docstring changes

* [Fix] Changes in repo status test

* [test] status test fix
1. made the test status printing order independent

* [refactor] Metadata dirs mirroring chkpt paths, changes in wgit status
1. Metadata files are now created within wgit with directory structure mirroring the relative paths of the checkpoint/files they track.
2. Changes in status: 3 statuses now.
3. Changes in tests.
4. Some code refactoring.

* [cleanup] minor changes in comments and cleanup
parent 775a0f06
......@@ -80,7 +80,8 @@ def main(argv: List[str] = None) -> None:
if args.command == "status":
repo = Repo(Path.cwd())
repo.status()
out = repo.status()
print(out)
if args.command == "log":
repo = Repo(Path.cwd())
......
......@@ -6,7 +6,7 @@
from pathlib import Path
import subprocess
import sys
from typing import List, Tuple
from typing import Dict, List, Tuple
import pygit2
......@@ -123,9 +123,18 @@ class PyGit:
"""returns the path of the git repository PyGit is wrapped around"""
return self.repo.path
def status(self) -> None:
"""Show the status of the git repo"""
print(self.repo.status())
def status(self) -> Dict:
    """Collect the status of the git repo that lives within wgit.

    The wrapped pygit2 repository is asked for its status mapping, and every
    entry whose status code equals ``pygit2.GIT_STATUS_IGNORED`` is dropped.
    The remaining keys are the relative paths of the metadata files and the
    values are pygit2's integer status codes, described in
    https://www.pygit2.org/index_file.html#status and
    https://github.com/libgit2/pygit2/blob/320ee5e733039d4a3cc952b287498dbc5737c353/src/pygit2.c#L312-L320

    Returns:
        {"relative path to a file": pygit2 status code}
    """
    return {
        rel_path: status_code
        for rel_path, status_code in self.repo.status().items()
        if status_code != pygit2.GIT_STATUS_IGNORED
    }
def _set_author_config(self, name: str, email: str) -> Tuple[str, str]:
"""Set the name and email for the pygit repo collecting from the gitconfig.
......
......@@ -3,11 +3,12 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from enum import Enum
import json
import pathlib
from pathlib import Path
import sys
from typing import Tuple, Union
from typing import Dict, Tuple, Union
from .pygit import PyGit
from .sha1_store import SHA1_store
......@@ -49,7 +50,10 @@ class Repo:
self._sha1_store = SHA1_store(weigit_dir, init=True)
# # Make the .wgit a git repo
gitignore_files = [self._sha1_store_path.name, self._sha1_store.ref_file_path.name]
gitignore_files = [
self._sha1_store_path.name,
self._sha1_store.ref_file_path.name,
]
self._pygit = PyGit(weigit_dir, gitignore=gitignore_files)
elif exists and init:
......@@ -78,13 +82,14 @@ class Repo:
if self._exists(self.wgit_parent):
# create the corresponding metadata file
file_path = Path(in_file_path)
metadata_file, parent_sha1 = self._process_metadata_file(file_path.name)
rel_file_path = self._rel_file_path(file_path)
metadata_file, parent_sha1 = self._process_metadata_file(rel_file_path)
# add the file to the sha1_store
sha1_hash = self._sha1_store.add(file_path, parent_sha1)
# write metadata to the metadata-file
self._write_metadata(metadata_file, sha1_hash)
self._write_metadata(metadata_file, file_path, sha1_hash)
self._pygit.add() # add to the .wgit/.git repo
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
......@@ -104,10 +109,28 @@ class Repo:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
def status(self) -> None:
"""Show the state of the working tree."""
def status(self) -> Dict:
    """Report the state of every file tracked in the weigit working tree.

    Each metadata file is classified into one of three states:
    1. ``RepoStatus.CHANGES_NOT_ADDED``: the tracked file was modified after
       its metadata was last written, and the change has not been added.
    2. ``RepoStatus.CHANGES_ADDED_NOT_COMMITED``: the metadata is up to date
       but still shows up in the status of the underlying git repo, i.e. the
       change was added but not committed.
    3. ``RepoStatus.CLEAN``: the metadata is up to date and committed.

    Nothing is printed here; presenting the result is left to the caller.

    Returns:
        A dict mapping each metadata file path (as a string) to its
        ``RepoStatus``. When no metadata file exists yet the repo is empty
        and ``{"": RepoStatus.CLEAN}`` is returned.

    Exits:
        Writes an error to stderr and calls ``sys.exit(1)`` when no wgit repo
        exists.
    """
    if not self._exists(self.wgit_parent):
        sys.stderr.write("fatal: no wgit repo exists!\n")
        sys.exit(1)

    pygit_status = self._pygit.status()
    metadata_files = self._get_metdata_files()
    if not metadata_files:
        # Nothing has been added so far: sub-case of case 3, clean with an empty repo.
        return {"": RepoStatus.CLEAN}

    out_status = {}
    for metadata_file, is_modified in metadata_files.items():
        if is_modified:
            out_status[str(metadata_file)] = RepoStatus.CHANGES_NOT_ADDED
        elif metadata_file in pygit_status:
            # A metadata file that appears in the git status dict has not been committed yet.
            out_status[str(metadata_file)] = RepoStatus.CHANGES_ADDED_NOT_COMMITED
        else:
            out_status[str(metadata_file)] = RepoStatus.CLEAN
    return out_status
......@@ -153,10 +176,52 @@ class Repo:
self._exists(self.wgit_parent)
return self._repo_path
def _process_metadata_file(self, metadata_fname: str) -> Tuple[Path, str]:
"""Create a metadata_file corresponding to the file to be tracked by weigit if the first version of the file is encountered.
If a version already exists, open the file and get the sha1_hash of the last version as parent_sha1"""
def _get_metdata_files(self) -> Dict:
"""Walk the directories that contain the metadata files and check the status of those files,
whether they have been modified or not.
"""
metadata_d = dict()
for file in self.path.iterdir(): # iterate over the .wgit directory
# exlude all the .wgit files and directory
if file.name not in {"sha1_store", "sha1_refs.json", ".git", ".gitignore"}:
# perform a directory walk on the metadata_file directories to find the metadata files
for path in file.rglob("*"):
if path.is_file():
rel_path = str(path.relative_to(self.path)) # metadata path relative to .wgit dir
metadata_d[rel_path] = self._is_file_modified(path)
return metadata_d
def _is_metadata_file(self, file: Path) -> bool:
"""Checks whether a file is a valid metadata file by matching keys and checking if it has valid
json data."""
try:
with open(file) as f:
metadata = json.load(f)
is_metadata = set(metadata.keys()) == {
"SHA1",
"file_path",
"last_modified_time_stamp",
} # TODO: Consider storing the keys as a class attribute, instead of hard coding.
except json.JSONDecodeError:
return False # not a json file, so not valid metadata file
return is_metadata
def _is_file_modified(self, file: Path) -> bool:
"""Checks whether a file has been modified since its last recorded modification time recorded in the metadata_file"""
with open(file) as f:
data = json.load(f)
# get the last modified timestamp recorded by weigit and the current modified timestamp. If not the
# same, then file has been modified since last weigit updated metadata
last_mod_timestamp = data["last_modified_time_stamp"]
curr_mod_timestamp = Path(data["file_path"]).stat().st_mtime
return not curr_mod_timestamp == last_mod_timestamp
def _process_metadata_file(self, metadata_fname: Path) -> Tuple[Path, str]:
"""Create a metadata_file corresponding to the file to be tracked by weigit if the first version of the file
is encountered. If a version already exists, open the file and get the sha1_hash of the last version as parent_sha1"""
metadata_file = self.path.joinpath(metadata_fname)
metadata_file.parent.mkdir(parents=True, exist_ok=True) # create parent dirs for metadata file
if not metadata_file.exists() or not metadata_file.stat().st_size:
metadata_file.touch()
parent_sha1 = "ROOT"
......@@ -166,18 +231,29 @@ class Repo:
parent_sha1 = ref_data["SHA1"]["__sha1_full__"]
return metadata_file, parent_sha1
def _write_metadata(self, metadata_file: Path, sha1_hash: str) -> None:
"""Write metadata to the metadata file file"""
change_time = Path(metadata_file).stat().st_ctime
def _write_metadata(self, metadata_file: Path, file_path: Path, sha1_hash: str) -> None:
"""Write metadata to the metadata file"""
change_time = Path(file_path).stat().st_mtime
metadata = {
"SHA1": {
"__sha1_full__": sha1_hash,
},
"file_path": str(file_path),
"last_modified_time_stamp": change_time,
}
with open(metadata_file, "w", encoding="utf-8") as f:
json.dump(metadata, f, ensure_ascii=False, indent=4)
def _rel_file_path(self, filepath: Path) -> Path:
"""Find the relative part to the filepath from the current working directory and return the relative path."""
# get the absolute path
filepath = filepath.resolve()
# using zipped loop we get the path common to the filepath and cwd
for i, (x, y) in enumerate(zip(filepath.parts, Path.cwd().parts)):
pass
# return the relative part (path not common to cwd)
return Path(*filepath.parts[i:])
def _exists(self, check_dir: Path) -> bool:
"""Returns True if a valid wgit exists within the cwd and iteratively checks to the root directory and
sets the self._repo_path attribute to the wgit path.
......@@ -209,3 +285,11 @@ class Repo:
git_exists = check_dir.joinpath(".wgit/.git").exists()
gitignore_exists = check_dir.joinpath(".wgit/.gitignore").exists()
return wgit_exists, sha1_refs, git_exists, gitignore_exists
class RepoStatus(Enum):
    """Collection of the states Repo.status reports for a tracked file."""

    # Tracked file unchanged and its metadata committed; also reported for an empty repo.
    CLEAN = 1
    # Tracked file was modified since its metadata was last written; changes not added yet.
    CHANGES_NOT_ADDED = 2
    # Changes were added to weigit but have not been committed yet.
    CHANGES_ADDED_NOT_COMMITED = 3
......@@ -3,7 +3,6 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
from pathlib import Path
......@@ -13,6 +12,7 @@ import shutil
import pytest
from fairscale.experimental.wgit import repo as api
from fairscale.experimental.wgit.repo import RepoStatus
@pytest.fixture
......@@ -30,7 +30,7 @@ def create_test_dir():
os.chdir(test_dir)
# create random checkpoints
size_list = [30e5, 35e5, 40e5]
size_list = [30e5, 35e5, 40e5, 40e5]
for i, size in enumerate(size_list):
with open(f"checkpoint_{i}.pt", "wb") as f:
f.write(os.urandom(int(size)))
......@@ -58,10 +58,11 @@ def test_api_init(capsys, repo):
def test_api_add(capsys, repo):
fnum = random.randint(0, 2)
chkpt0 = f"checkpoint_{fnum}.pt"
repo.add(f"checkpoint_{fnum}.pt")
repo.add(chkpt0)
sha1_hash = repo._sha1_store._get_sha1_hash(chkpt0)
with open(os.path.join(".wgit", f"checkpoint_{fnum}.pt"), "r") as f:
metadata_path = repo._rel_file_path(Path(chkpt0))
with open(os.path.join(".wgit", metadata_path), "r") as f:
json_data = json.load(f)
sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
......@@ -77,10 +78,51 @@ def test_api_commit(capsys, repo):
def test_api_status(capsys, repo):
    """Walk a fresh repo through every state reported by ``repo.status()``.

    ``status()`` returns a dict instead of printing, so the returned value is
    asserted at each step; nothing is expected on stdout.
    """
    # delete the repo and initialize a new one
    shutil.rmtree(".wgit")
    repo = api.Repo(Path.cwd(), init=True)

    # check status before any file is added
    out = repo.status()
    assert out == {"": RepoStatus.CLEAN}

    # check status after a file is added but not committed
    chkpt0 = f"checkpoint_{random.randint(0, 1)}.pt"
    repo.add(chkpt0)
    out = repo.status()
    key_list = list(repo._get_metdata_files().keys())
    assert out == {key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED}

    # check status after commit
    repo.commit("e1")
    out = repo.status()
    assert out == {key_list[0]: RepoStatus.CLEAN}

    # check status after a new change has been made to the file
    with open(chkpt0, "wb") as f:
        f.write(os.urandom(int(15e5)))
    out = repo.status()
    assert out == {key_list[0]: RepoStatus.CHANGES_NOT_ADDED}

    # add the new changes made to weigit
    repo.add(chkpt0)
    out = repo.status()
    assert out == {key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED}

    # check status after a new, different file is added to be tracked by weigit
    chkpt3 = "checkpoint_3.pt"
    repo.add(chkpt3)
    key_list = list(repo._get_metdata_files().keys())
    out = repo.status()
    assert out == {
        key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED,
        key_list[1]: RepoStatus.CHANGES_ADDED_NOT_COMMITED,
    }

    # check status after the new file is committed to be tracked by weigit
    repo.commit("e2")
    out = repo.status()
    assert out == {key_list[0]: RepoStatus.CLEAN, key_list[1]: RepoStatus.CLEAN}
def test_api_log(capsys, repo):
......
......@@ -27,7 +27,6 @@ def create_test_dir():
shutil.rmtree(test_dir)
os.makedirs(test_dir)
os.chdir(test_dir)
# create random checkpoints
size_list = [30e5, 35e5, 40e5]
for i, size in enumerate(size_list):
......@@ -56,10 +55,8 @@ def test_cli_add(capsys):
Path.cwd().joinpath(".wgit"),
init=False,
)
sha1_hash = sha1_store._get_sha1_hash(chkpt0)
with open(os.path.join(".wgit", "checkpoint_0.pt"), "r") as f:
with open(os.path.join(".wgit", "wgit_testing/checkpoint_0.pt"), "r") as f:
json_data = json.load(f)
sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
......@@ -77,7 +74,7 @@ def test_cli_commit(capsys):
def test_cli_status(capsys):
    """``wgit status`` prints the status dict returned by the repo and nothing on stderr."""
    cli.main(["status"])
    captured = capsys.readouterr()
    # The CLI prints the returned dict; the earlier "wgit status" banner is no longer expected.
    assert captured.out == "{'wgit_testing/checkpoint_0.pt': <RepoStatus.CLEAN: 1>}\n"
    assert captured.err == ""
......
......@@ -81,7 +81,7 @@ def test_sha1_add(sha1_configs, sha1_store):
metadata_file, parent_sha1 = repo._process_metadata_file(chkpt1.name)
sha1_hash = sha1_store.add(sha1_configs.checkpoint_1a, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
repo._write_metadata(metadata_file, chkpt1, sha1_hash)
# for checkpoint 1
metadata_file = sha1_configs.test_path.joinpath(sha1_configs.checkpoint_1a.name)
......@@ -97,7 +97,7 @@ def test_sha1_refs(sha1_configs, sha1_store):
def add_checkpoint(checkpoint):
metadata_file, parent_sha1 = repo._process_metadata_file(checkpoint.name)
sha1_hash = sha1_store.add(checkpoint, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
repo._write_metadata(metadata_file, checkpoint, sha1_hash)
return sha1_hash
with open(sha1_configs.sha1_ref, "r") as file:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment