Unverified Commit a1cc3874 authored by Riyasat Ohib's avatar Riyasat Ohib Committed by GitHub
Browse files

weigit: Fixed file tracking with metadata. Changes in sha1_store for better...


weigit: Fixed file tracking with metadata. Changes in sha1_store for better encapsulation. Docstrings. (#1013)

* [Feat] Fixed file tracking with metadata. Change in sha1_store for better encapsulation. Tests.

1. Adds metadata creation per added file and independently tracks version of each separate file added. That is, now creates separate metadata files
   for each file to be tracked.
2. Changes in reference tracking to accommodate the change in 1.
3. Some changes in SHA1_store for better encapsulation.
4. Modified the tests to reflect above.

* [Feat]
1. Added docstrings to the classes.
2. Added a recursive search for the weigit repo, walking up to the root directory.
3. Some refactoring of the code.

* [Feat][Refactor] repo and sha1_store add modification and separation. Modification in reference tracking

1. Separation of add functionalities of repo.add and sha1_store.add.
2. Updated the reference tracking.
3. New tests and code refactor

* [Fix] Sha1_store fix overlap in first two characters of sha1 hash.
1. Accept multiple sha1 hashes with the same two starting characters and create directories accordingly.

* [Fix] Minor refactoring and test fix

* [Fix] Fix for pygit class initialization in cases when no .gitconfig file is available
Co-authored-by: Riyasat Ohib <riohib@devfair0756.h2.fair>
parent 9c195fe2
...@@ -4,12 +4,28 @@ ...@@ -4,12 +4,28 @@
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
from pathlib import Path from pathlib import Path
from typing import List import subprocess
import sys
from typing import List, Tuple
import pygit2 import pygit2
class PyGit: class PyGit:
"""
PyGit class represents a git repo within a weigit repo.
Args:
parent_path (pathlib.Path)
Has to be the full path of the parent!
gitignore (List)
a list of files to be added to the .gitignore
name (str)
Name of the author of the git repo. Optionally used if it can't be determined from user's .gitconfig.
email (str)
email address of the author of the git repo
"""
def __init__( def __init__(
self, self,
parent_path: Path, parent_path: Path,
...@@ -17,20 +33,15 @@ class PyGit: ...@@ -17,20 +33,15 @@ class PyGit:
name: str = "user", name: str = "user",
email: str = "user@email.com", email: str = "user@email.com",
) -> None: ) -> None:
"""
PyGit class to wrap the wgit/.git repo and interact with the git repo.
Args:
parent_path: Has to be the full path of the parent!
"""
# Find if a git repo exists within .wgit repo: # Find if a git repo exists within .wgit repo:
# If exists: then discover it and set the self.gitrepo path to its path # If exists: then discover it and set the self.gitrepo path to its path
self._parent_path = parent_path self._parent_path = parent_path
self.name = name
self.email = email
git_repo_found = pygit2.discover_repository(self._parent_path) git_repo_found = pygit2.discover_repository(self._parent_path)
# If gitconfig file exists use the name and email from the file
self.name, self.email = self._set_author_config(name, email)
if git_repo_found: if git_repo_found:
# grab the parent dir of this git repo # grab the parent dir of this git repo
git_repo = Path(pygit2.discover_repository(self._parent_path)) git_repo = Path(pygit2.discover_repository(self._parent_path))
...@@ -50,7 +61,13 @@ class PyGit: ...@@ -50,7 +61,13 @@ class PyGit:
self._init_wgit_git(gitignore) self._init_wgit_git(gitignore)
def _init_wgit_git(self, gitignore: List) -> None: def _init_wgit_git(self, gitignore: List) -> None:
"""Initializes a .git within .wgit directory, making it a git repo.""" """
Initializes a .git within .wgit directory, making it a git repo.
Args:
gitignore (List)
a list of file paths to be ignored by the wgit git repo.
"""
self.repo = pygit2.init_repository(str(self._parent_path), False) self.repo = pygit2.init_repository(str(self._parent_path), False)
self.path = self._parent_path.joinpath(".git") self.path = self._parent_path.joinpath(".git")
...@@ -62,14 +79,25 @@ class PyGit: ...@@ -62,14 +79,25 @@ class PyGit:
file.write(f"{item}\n") file.write(f"{item}\n")
def add(self) -> None: def add(self) -> None:
"""git add all the untracked files not in gitignore, to the .wgit/.git repo.""" """
git add all the untracked files not in gitignore, to the .wgit/.git repo.
"""
# If .wgit is git repo, add all the files in .wgit not being ignored to git # If .wgit is git repo, add all the files in .wgit not being ignored to git
# TODO: Add functionalities for add specific files and add all files.
if self._exists: if self._exists:
self.repo.index.add_all() self.repo.index.add_all()
self.repo.index.write() self.repo.index.write()
else:
sys.stderr.write("fatal: git repo does not exist")
def commit(self, message: str) -> None: def commit(self, message: str) -> None:
"""git commit the staged changes to the .wgit/.git repo.""" """
git commit the staged changes to the .wgit/.git repo.
Args:
message (str)
Commit message
"""
# If .wgit is git repo, commit the staged files to git # If .wgit is git repo, commit the staged files to git
if self._exists: if self._exists:
# if no commit exists, set ref to HEAD and parents to empty # if no commit exists, set ref to HEAD and parents to empty
...@@ -96,5 +124,21 @@ class PyGit: ...@@ -96,5 +124,21 @@ class PyGit:
return self.repo.path return self.repo.path
def status(self) -> None: def status(self) -> None:
"""Print the status of the git repo""" """Show the status of the git repo"""
print(self.repo.status()) print(self.repo.status())
def _set_author_config(self, name: str, email: str) -> Tuple[str, str]:
"""Set the name and email for the pygit repo collecting from the gitconfig.
If not available in gitconfig, set the values from the passed arguments."""
gitconfig = Path("~/.gitconfig").expanduser()
# parse the .gitconfig file for name and email
try:
set_name = subprocess.run(["git", "config", "user.name"], capture_output=True, text=True).stdout.rstrip()
set_email = subprocess.run(["git", "config", "user.email"], capture_output=True, text=True).stdout.rstrip()
if not set_name or not set_email:
set_name = name
set_email = email
except BaseException:
set_name = name
set_email = email
return set_name, set_email
...@@ -3,118 +3,198 @@ ...@@ -3,118 +3,198 @@
# This source code is licensed under the BSD license found in the # This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
import json
import pathlib import pathlib
from pathlib import Path from pathlib import Path
import sys import sys
from typing import Union from typing import Tuple, Union
from .pygit import PyGit from .pygit import PyGit
from .sha1_store import SHA1_store from .sha1_store import SHA1_store
class Repo: class Repo:
def __init__(self, parent_dir: Path, init: bool = False) -> None:
"""Features:
1. Create the wgit directory if it does not exist.
2. SHA1Store.init()
3. Create SHA1 .wgit/sha1_refs.json
3. Initialize a .git directory within the .wgit using `git init`.
4. add a .gitignore within the .wgit directory, so that the git repo within will ignore `sha1_refs.json`
""" """
Represents the WeiGit repo for tracking neural network weights and their versions.
Args:
parent_dir (pathlib.Path, str)
The path to the parent directory where a weigit repo will be created. In the case a repo already exists, it will be wrapped with this class.
init (bool, optional)
- If ``True``, initializes a new WeiGit repo in the parent_dir. Initialization creates a `.wgit` directory within the <parent_dir>, triggers an initialization
of a sha1_store in the ./<parent_dir>/.wgit directory, and makes the ./<parent_dir>/.wgit a git repository through git initialization.
- If ``False``, a new WeiGit repo is not initialized and the existing repo is simply wrapped, populating the `wgit_parent` and other internal attributes.
- Default: False
"""
def __init__(self, parent_dir: Union[Path, str] = Path.cwd(), init: bool = False) -> None:
"""initialize a weigit repo: Subsequently, also initialize a sha1_store and a pygit git repo within as
part of the weigit initialization process"""
# If repo does not exist, creates a new wgit repo object with self.repo.path pointing to the path of repo # If repo does not exist, creates a new wgit repo object with self.repo.path pointing to the path of repo
# and notes all the internal files. # and notes all the internal files.
# else, if repo already exists: create a pygit object from the .wgit/.git. # else, if repo already exists: create a pygit object from the .wgit/.git.
self.wgit_parent = parent_dir self.wgit_parent = Path(parent_dir)
self._repo_path: Union[None, Path] = None self._repo_path: Union[None, Path] = None
self._wgit_dir = Path(".wgit")
self._metadata_file = Path(".wgit/checkpoint.pt")
self._sha1_ref = Path(".wgit/sha1_refs.json")
self._wgit_git_path = Path(".wgit/.git") self._wgit_git_path = Path(".wgit/.git")
self._sha1_store_path = Path(".wgit/sha1_store") self._sha1_store_path = Path(".wgit/sha1_store")
if not self._exists() and init: exists = self._exists(self.wgit_parent)
if not exists and init:
# No weigit repo exists and is being initialized with init=True # No weigit repo exists and is being initialized with init=True
# Make .wgit directory, create sha1_refs and metadata file # Make .wgit directory, create sha1_refs
self._wgit_dir.mkdir(exist_ok=True) weigit_dir = self.wgit_parent.joinpath(".wgit")
self._metadata_file.touch(exist_ok=False) weigit_dir.mkdir(parents=False, exist_ok=True)
self._sha1_ref.touch(exist_ok=False)
# Initializing sha1_store only after wgit has been initialized!
self._sha1_store = SHA1_store(weigit_dir, init=True)
# # Make the .wgit a git repo # # Make the .wgit a git repo
gitignore_files = [self._sha1_store_path.name, self._sha1_ref.name] gitignore_files = [self._sha1_store_path.name, self._sha1_store.ref_file_path.name]
self._pygit = PyGit(self.wgit_parent.joinpath(self._wgit_dir), gitignore=gitignore_files) self._pygit = PyGit(weigit_dir, gitignore=gitignore_files)
# Initializing sha1_store only after wgit has been initialized! elif exists and init:
self._sha1_store = SHA1_store(self._wgit_dir, self._metadata_file, self._sha1_ref, init=True)
elif self._exists() and init:
# if weigit repo already exists and init is being called, wrap the existing .wgit/.git repo with PyGit # if weigit repo already exists and init is being called, wrap the existing .wgit/.git repo with PyGit
self._sha1_store = SHA1_store( self._sha1_store = SHA1_store(self.path)
self._wgit_dir, self._pygit = PyGit(self.path)
self._metadata_file,
self._sha1_ref, elif exists and not init:
)
self._pygit = PyGit(self.wgit_parent.joinpath(self._wgit_dir))
elif self._exists() and not init:
# weigit exists and non-init commands are triggered # weigit exists and non-init commands are triggered
self._sha1_store = SHA1_store( self._sha1_store = SHA1_store(self.path)
self._wgit_dir, self._pygit = PyGit(self.path)
self._metadata_file,
self._sha1_ref,
)
self._pygit = PyGit(self.wgit_parent.joinpath(self._wgit_dir))
else: else:
# weigit doesn't exist and is not trying to be initialized (triggers during non-init commands) # weigit doesn't exist and is not trying to be initialized (triggers during non-init commands)
sys.stderr.write("fatal: not a wgit repository!\n") sys.stderr.write("fatal: not a wgit repository!\n")
sys.exit(1)
def add(self, in_file_path: str) -> None:
"""
Adds a file to the wgit repo.
Args:
file_path (str)
Path to the file to be added to the weigit repo
"""
if self._exists(self.wgit_parent):
# create the corresponding metadata file
file_path = Path(in_file_path)
metadata_file, parent_sha1 = self._process_metadata_file(file_path.name)
# add the file to the sha1_store
sha1_hash = self._sha1_store.add(file_path, parent_sha1)
def add(self, file_path: str) -> None: # write metadata to the metadata-file
"""Adds a file to the wgit repo""" self._write_metadata(metadata_file, sha1_hash)
if self._exists():
self._sha1_store.add(file_path) # add the filefile to the sha1_store
self._pygit.add() # add to the .wgit/.git repo self._pygit.add() # add to the .wgit/.git repo
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
def commit(self, message: str) -> None: def commit(self, message: str) -> None:
"""Commits staged changes to the repo""" """
if self._exists(): Commits staged changes to the repo.
Args:
message (str)
The commit message
"""
if self._exists(self.wgit_parent):
self._pygit.commit(message) self._pygit.commit(message)
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
def status(self) -> None: def status(self) -> None:
"""Skeleton""" """Show the state of the working tree."""
if self._exists(): if self._exists(self.wgit_parent):
print("wgit status") print("wgit status")
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
def log(self, file: str) -> None: def log(self, file: str) -> None:
"""Returns the WeiGit log of commit history.""" """
if self._exists(): Returns the WeiGit log of commit history.
Args:
file (str, optional)
Show the log of the commit history of the repo. Optionally, show the log history of a specific file.
"""
if self._exists(self.wgit_parent):
if file: if file:
print(f"wgit log of the file: {file}") print(f"wgit log of the file: {file}")
else: else:
print("wgit log") print("wgit log")
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
def checkout(self, sha1: str) -> None: def checkout(self, sha1: str) -> None:
"""Checkout a previously commited version of the checkpoint""" """
if self._exists(): Checkout a previously commited version of the checkpoint.
print("wgit checkout: sha1")
Args:
sha1 (str) The sha1 hash of the file version to checkout.
"""
raise NotImplementedError
def compression(self) -> None: def compression(self) -> None:
"""Not Implemented: Compression functionalities""" """Not Implemented: Compression functionalities"""
print("Not Implemented!") raise NotImplementedError
def checkout_by_steps(self) -> None: def checkout_by_steps(self) -> None:
"""Not Implemented: Checkout by steps""" """Not Implemented: Checkout by steps"""
print("Not Implemented!") raise NotImplementedError
@property @property
def path(self) -> str: def path(self) -> Path:
"""Get the path to the WeiGit repo""" """Get the path to the WeiGit repo"""
if self._repo_path is None: if self._repo_path is None:
self._exists() self._exists(self.wgit_parent)
return str(self._repo_path) return self._repo_path
def _exists(self) -> bool: def _process_metadata_file(self, metadata_fname: str) -> Tuple[Path, str]:
"""Returns True if a valid wgit exists within the cwd, and sets the self._repo_path to the wgit path.""" """Create a metadata_file corresponding to the file to be tracked by weigit if the first version of the file is encountered.
if self._weigit_repo_exists(self.wgit_parent): If a version already exists, open the file and get the sha1_hash of the last version as parent_sha1"""
self._repo_path = self.wgit_parent.joinpath(".wgit") metadata_file = self.path.joinpath(metadata_fname)
if not metadata_file.exists() or not metadata_file.stat().st_size:
metadata_file.touch()
parent_sha1 = "ROOT"
else:
with open(metadata_file, "r") as f:
ref_data = json.load(f)
parent_sha1 = ref_data["SHA1"]["__sha1_full__"]
return metadata_file, parent_sha1
def _write_metadata(self, metadata_file: Path, sha1_hash: str) -> None:
"""Write metadata to the metadata file file"""
change_time = Path(metadata_file).stat().st_ctime
metadata = {
"SHA1": {
"__sha1_full__": sha1_hash,
},
"last_modified_time_stamp": change_time,
}
with open(metadata_file, "w", encoding="utf-8") as f:
json.dump(metadata, f, ensure_ascii=False, indent=4)
def _exists(self, check_dir: Path) -> bool:
"""Returns True if a valid wgit exists within the cwd and iteratively checks to the root directory and
sets the self._repo_path attribute to the wgit path.
Args:
check_dir (Path)
path to the directory from where search is started.
"""
if self._weigit_repo_exists(check_dir):
self._repo_path = check_dir.joinpath(".wgit")
else:
root = Path(check_dir.parts[0])
while check_dir != root:
check_dir = check_dir.parent
if self._weigit_repo_exists(check_dir):
self._repo_path = check_dir.joinpath(".wgit")
break
return True if self._repo_path is not None else False return True if self._repo_path is not None else False
def _weigit_repo_exists(self, check_dir: pathlib.Path) -> bool: def _weigit_repo_exists(self, check_dir: pathlib.Path) -> bool:
......
...@@ -6,51 +6,61 @@ ...@@ -6,51 +6,61 @@
import hashlib import hashlib
import json import json
import os
from pathlib import Path from pathlib import Path
import shutil import shutil
import sys import sys
from typing import Union, cast from typing import Union
from .utils import ExitCode from .utils import ExitCode
class SHA1_store: class SHA1_store:
def __init__(self, weigit_path: Path, metadata_file: Path, sha1_refs: Path, init: bool = False) -> None:
""" """
Planned Features: Represent the sha1_store within the WeiGit repo for handling added file to the store and managing references.
1. def init
2. def add <file or data> -> SHA1 Args:
3. def remove (SHA1) weigit_path (pathlib.Path)
4. def add_ref(children_SHA1, parent_SHA1) The path to the weigit repo where a sha1_store will be created, or if already exists will be wrapped.
5. def read(SHA1): -> init (bool, optional)
6. def lookup(SHA1): -> file path to the data. NotFound Exception if not found. - If ``True``, initializes a new sha1_store in the weigit_path. Initialization creates a `sha1_store` directory within WeiGit repo in ./<weigit_path>/,
and a `sha1_refs.json` withiin ./<weigit_path>/.
- If ``False``, a new sha1_store is not initialized and the existing sha1_store is simply wrapped, populating the `name`, `path` and the `ref_file_path` attributes.
- Default: False
""" """
def __init__(self, weigit_path: Path, init: bool = False) -> None:
"""Create or wrap (if already exists) a sha1_store within the WeiGit repo."""
# should use the sha1_refs.json to track the parent references. # should use the sha1_refs.json to track the parent references.
self.name = "sha1_store" self.name = "sha1_store"
self.path = weigit_path.joinpath(self.name) self.path = weigit_path.joinpath(self.name)
self.ref_file_path = weigit_path.joinpath("sha1_refs.json")
self._ref_file_name = Path.cwd().joinpath(sha1_refs) self._weigit_path = weigit_path
self._metadata_file = Path.cwd().joinpath(metadata_file)
# initialize the sha1_store # initialize the sha1_store
if init: if init:
try: try:
if not self.path.exists(): if not self.path.exists():
Path.mkdir(self.path, parents=False, exist_ok=False) Path.mkdir(self.path, parents=False, exist_ok=False)
self.ref_file_path.touch(exist_ok=False)
except FileExistsError as error: except FileExistsError as error:
sys.stderr.write(f"An exception occured while creating Sha1_store: {repr(error)}\n") sys.stderr.write(f"An exception occured while creating Sha1_store: {repr(error)}\n")
sys.exit(ExitCode.FILE_EXISTS_ERROR) sys.exit(ExitCode.FILE_EXISTS_ERROR)
def add(self, file_path: str) -> None: def add(self, file_path: Path, parent_sha1: str) -> str:
"""Adds a file/checkpoint to the internal sha1_store and update the metadata and the """
sha1 references accordingly. Adds a file/checkpoint to the internal sha1_store and the sha1 references accordingly.
First, a sha1 hash is calculated. Utilizing the sha1 hash string, the actual file in <in_file_path> is moved
within the sha1_store and the sha1 reference file is updated accordingly with the information of their parents
node (if exists) and whether the new version is a leaf node or not.
Args:
in_file_path (str): path to the file to be added to the sha1_store.
""" """
sha1_hash = self.get_sha1_hash(file_path) sha1_hash = self._get_sha1_hash(file_path)
# use the sha1_hash to create a directory with first2 sha naming convention # use the sha1_hash to create a directory with first2 sha naming convention
try: try:
repo_fdir = self.path.joinpath(sha1_hash[:2]) repo_fdir = self.path.joinpath(sha1_hash[:2])
repo_fdir.mkdir(exist_ok=False) repo_fdir.mkdir(exist_ok=True)
except FileExistsError as error: except FileExistsError as error:
sys.stderr.write(f"An exception occured: {repr(error)}\n") sys.stderr.write(f"An exception occured: {repr(error)}\n")
sys.exit(ExitCode.FILE_EXISTS_ERROR) sys.exit(ExitCode.FILE_EXISTS_ERROR)
...@@ -58,61 +68,22 @@ class SHA1_store: ...@@ -58,61 +68,22 @@ class SHA1_store:
# First transfer the file to the internal sha1_store # First transfer the file to the internal sha1_store
repo_fpath = Path.cwd().joinpath(repo_fdir, sha1_hash[2:]) repo_fpath = Path.cwd().joinpath(repo_fdir, sha1_hash[2:])
shutil.copy2(file_path, repo_fpath) shutil.copy2(file_path, repo_fpath)
change_time = Path(repo_fpath).stat().st_ctime
# Create the dependency Graph and track reference # Create the dependency Graph and track reference
self._add_ref(current_sha1_hash=sha1_hash) self._add_ref(sha1_hash, parent_sha1)
metadata = {
"SHA1": {
"__sha1_full__": sha1_hash,
},
"file_path": str(repo_fpath),
"time_stamp": change_time,
}
# Populate the meta_data file with the meta_data and git add
self._add_metadata_to_json(metadata)
except BaseException as error: except BaseException as error:
# in case of failure: Cleans up the sub-directories created to store sha1-named checkpoints # in case of failure: Cleans up the sub-directories created to store sha1-named checkpoints
sys.stderr.write(f"An exception occured: {repr(error)}\n") sys.stderr.write(f"An exception occured: {repr(error)}\n")
shutil.rmtree(repo_fdir) shutil.rmtree(repo_fdir)
return sha1_hash
def _add_ref(self, current_sha1_hash: str) -> None: def _get_sha1_hash(self, file_path: Union[str, Path]) -> str:
"""Populates the sha1_refs.json file when file is added and keeps track of reference to earlier commits""" """return the sha1 hash of a file
if not os.path.getsize(self._ref_file_name): # If no entry yet Args:
with open(self._ref_file_name) as f: file_path (str, Path): Path to the file whose sha1 hash is to be calculalated and returned.
ref_data = { """
current_sha1_hash: {"parent": "ROOT", "child": "HEAD", "ref_count": 0},
}
with open(self._ref_file_name, "w", encoding="utf-8") as f:
json.dump(ref_data, f, ensure_ascii=False, indent=4)
else:
with open(self._ref_file_name, "r") as f:
ref_data = json.load(f)
# get the last head and replace it's child from HEAD -> this sha1
for key, vals in ref_data.items():
if vals["child"] == "HEAD":
parent = key
ref_data[parent]["child"] = current_sha1_hash
# increase the ref counter of that (now parent sha1)
ref_count = cast(int, ref_data[parent]["ref_count"])
ref_count += 1
ref_data[parent]["ref_count"] = ref_count
# Add this new sha1 as a new entry, make the earlier sha1 a parent
# make "HEAD" as a child, and json dump
ref_data[current_sha1_hash] = {"parent": parent, "child": "HEAD", "ref_count": 0}
# Try
with open(self._ref_file_name, "w", encoding="utf-8") as f:
json.dump(ref_data, f, ensure_ascii=False, indent=4)
def get_sha1_hash(self, file_path: Union[str, Path]) -> str:
""" " return the sha1 hash of a file"""
SHA1_BUF_SIZE = 104857600 # Reading file in 100MB chunks SHA1_BUF_SIZE = 104857600 # Reading file in 100MB chunks
sha1 = hashlib.sha1() sha1 = hashlib.sha1()
...@@ -124,8 +95,69 @@ class SHA1_store: ...@@ -124,8 +95,69 @@ class SHA1_store:
sha1.update(data) sha1.update(data)
return sha1.hexdigest() return sha1.hexdigest()
def _add_metadata_to_json(self, metadata: dict) -> None: def _add_ref(self, current_sha1_hash: str, parent_hash: str) -> None:
"""Populates the meta_data_file: checkpoint.pt with the meta_data""" """
file_pt_json = self._metadata_file Populates the sha1_refs.json file when file is added and keeps track of reference to earlier file additions.
with open(file_pt_json, "w", encoding="utf-8") as f: If the sha1_refs.json file is empty, then a new tracking entry of the added file is logged in the sha1_refs file.
json.dump(metadata, f, ensure_ascii=False, indent=4) If the file already has an entry, first it checks if the incoming new added file is a new version of any of the
existing entries. If it is, then logs the tracking info as a new version of that existing entry.
Otherwise a new entry for the new added file is created for tracking.
Args:
file_path (pathlib.Path)
Path to the incoming added file.
current_sha1_hash (str)
The sha1 hash of the incoming added file.
"""
# Check the current state of the reference file and check if the added file already has an entry.
sha1_refs_empty = self._sha1_refs_file_state()
# if the file is empty: add the first entry
if sha1_refs_empty:
with open(self.ref_file_path) as f:
ref_data = {current_sha1_hash: {"parent": "ROOT", "ref_count": 1, "is_leaf": True}}
self._write_to_json(self.ref_file_path, ref_data)
else:
# Open sha1 reference file and check if there is a parent_hash not equal to Root?
# if Yes, find parent and add the child. Else, just add a new entry
with open(self.ref_file_path, "r") as f:
ref_data = json.load(f)
if parent_hash != "ROOT":
# get the last head and replace it's child from HEAD -> this sha1
ref_data[parent_hash]["is_leaf"] = False
ref_data[current_sha1_hash] = {"parent": parent_hash, "ref_count": 1, "is_leaf": True}
else:
ref_data[current_sha1_hash] = {"parent": "ROOT", "ref_count": 1, "is_leaf": True}
self._write_to_json(self.ref_file_path, ref_data)
def _sha1_refs_file_state(self) -> bool:
"""
Checks the state of the sha1 reference file, whether the file is empty or not.
If not empty, it checks whether the input file in <file_path> has an older entry (version)
in the reference file.
Args:
file_path (pathlib.Path)
input File whose entry will be checked if it exists in the reference file.
"""
try:
with open(self.ref_file_path, "r") as f:
ref_data = json.load(f)
sha1_refs_empty: bool = False
except json.JSONDecodeError as error:
if not self.ref_file_path.stat().st_size:
sha1_refs_empty = True
return sha1_refs_empty
def _write_to_json(self, file: Path, data: dict) -> None:
"""
Populates a json file with data.
Args:
file (pathlib.Path)
path to the file to be written in.
data (pathlib.Path)
Data to be written in the file.
"""
with open(file, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4)
...@@ -7,11 +7,11 @@ ...@@ -7,11 +7,11 @@
import json import json
import os import os
from pathlib import Path from pathlib import Path
import random
import shutil import shutil
import pytest import pytest
from fairscale.experimental.wgit import cli
from fairscale.experimental.wgit import repo as api from fairscale.experimental.wgit import repo as api
...@@ -39,7 +39,7 @@ def create_test_dir(): ...@@ -39,7 +39,7 @@ def create_test_dir():
@pytest.fixture @pytest.fixture
def repo(): def repo():
repo = api.Repo(Path.cwd()) repo = api.Repo(Path.cwd(), init=True)
return repo return repo
...@@ -56,16 +56,16 @@ def test_api_init(capsys, repo): ...@@ -56,16 +56,16 @@ def test_api_init(capsys, repo):
def test_api_add(capsys, repo): def test_api_add(capsys, repo):
chkpt0 = "checkpoint_0.pt" fnum = random.randint(0, 2)
repo.add("checkpoint_0.pt") chkpt0 = f"checkpoint_{fnum}.pt"
repo.add(f"checkpoint_{fnum}.pt")
sha1_hash = repo._sha1_store.get_sha1_hash(chkpt0) sha1_hash = repo._sha1_store._get_sha1_hash(chkpt0)
with open(os.path.join(".wgit", "checkpoint.pt"), "r") as f: with open(os.path.join(".wgit", f"checkpoint_{fnum}.pt"), "r") as f:
json_data = json.load(f) json_data = json.load(f)
sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}" sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
assert json_data["SHA1"] == {"__sha1_full__": sha1_hash} assert json_data["SHA1"] == {"__sha1_full__": sha1_hash}
assert json_data["file_path"] == os.path.join(os.getcwd(), ".wgit/sha1_store/", sha1_dir_0)
def test_api_commit(capsys, repo): def test_api_commit(capsys, repo):
...@@ -78,7 +78,6 @@ def test_api_commit(capsys, repo): ...@@ -78,7 +78,6 @@ def test_api_commit(capsys, repo):
def test_api_status(capsys, repo): def test_api_status(capsys, repo):
repo.status() repo.status()
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == "wgit status\n" assert captured.out == "wgit status\n"
assert captured.err == "" assert captured.err == ""
...@@ -91,11 +90,11 @@ def test_api_log(capsys, repo): ...@@ -91,11 +90,11 @@ def test_api_log(capsys, repo):
assert captured.err == "" assert captured.err == ""
def test_cli_checkout(capsys): def test_api_checkout(repo):
cli.main(["checkout", "sha1"]) try:
captured = capsys.readouterr() repo.checkout("sha1")
assert captured.out == "wgit checkout: sha1\n" except NotImplementedError:
assert captured.err == "" assert True
def teardown_module(module): def teardown_module(module):
......
...@@ -54,19 +54,16 @@ def test_cli_add(capsys): ...@@ -54,19 +54,16 @@ def test_cli_add(capsys):
sha1_store = SHA1_store( sha1_store = SHA1_store(
Path.cwd().joinpath(".wgit"), Path.cwd().joinpath(".wgit"),
Path.cwd().joinpath(".wgit", "checkpoint.pt"),
Path.cwd().joinpath(".wgit", "sha1_refs.json"),
init=False, init=False,
) )
sha1_hash = sha1_store.get_sha1_hash(chkpt0) sha1_hash = sha1_store._get_sha1_hash(chkpt0)
with open(os.path.join(".wgit", "checkpoint.pt"), "r") as f: with open(os.path.join(".wgit", "checkpoint_0.pt"), "r") as f:
json_data = json.load(f) json_data = json.load(f)
sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}" sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
assert json_data["SHA1"] == {"__sha1_full__": sha1_hash} assert json_data["SHA1"] == {"__sha1_full__": sha1_hash}
assert json_data["file_path"] == os.path.join(os.getcwd(), ".wgit/sha1_store/", sha1_dir_0)
def test_cli_commit(capsys): def test_cli_commit(capsys):
...@@ -92,10 +89,10 @@ def test_cli_log(capsys): ...@@ -92,10 +89,10 @@ def test_cli_log(capsys):
def test_cli_checkout(capsys): def test_cli_checkout(capsys):
try:
cli.main(["checkout", "sha1"]) cli.main(["checkout", "sha1"])
captured = capsys.readouterr() except NotImplementedError:
assert captured.out == "wgit checkout: sha1\n" assert True
assert captured.err == ""
def teardown_module(module): def teardown_module(module):
......
...@@ -11,6 +11,7 @@ import shutil ...@@ -11,6 +11,7 @@ import shutil
import pytest import pytest
from fairscale.experimental.wgit.repo import Repo
from fairscale.experimental.wgit.sha1_store import SHA1_store from fairscale.experimental.wgit.sha1_store import SHA1_store
...@@ -20,84 +21,123 @@ def sha1_configs(): ...@@ -20,84 +21,123 @@ def sha1_configs():
class Sha1StorePaths: class Sha1StorePaths:
test_dirs = Path("temp_wgit_testing/.wgit") test_dirs = Path("temp_wgit_testing/.wgit")
test_path = Path.cwd().joinpath(test_dirs) test_path = Path.cwd().joinpath(test_dirs)
metadata_file = test_path.joinpath("checkpoint.pt")
sha1_ref = test_path.joinpath("sha1_refs.json") sha1_ref = test_path.joinpath("sha1_refs.json")
chkpt_dir = test_path.joinpath("checkpoint") chkpt1a_dir = test_path.joinpath("checkpoint_1a")
checkpoint_1 = test_path.joinpath("checkpoint", "checkpoint_1.pt") chkpt1b_dir = test_path.joinpath("checkpoint_1b")
checkpoint_2 = test_path.joinpath("checkpoint", "checkpoint_2.pt") chkpt1c_dir = test_path.joinpath("checkpoint_1c")
checkpoint_3 = test_path.joinpath("checkpoint", "checkpoint_3.pt") checkpoint_1a = test_path.joinpath("checkpoint_1a", "checkpoint_1.pt")
checkpoint_1b = test_path.joinpath("checkpoint_1b", "checkpoint_1.pt")
checkpoint_1c = test_path.joinpath("checkpoint_1c", "checkpoint_1.pt")
checkpoint_2 = test_path.joinpath("checkpoint_1a", "checkpoint_2.pt")
checkpoint_3 = test_path.joinpath("checkpoint_1a", "checkpoint_3.pt")
metadata_1 = test_path.joinpath("checkpoint_1.pt")
metadata_2 = test_path.joinpath("checkpoint_2.pt")
metadata_3 = test_path.joinpath("checkpoint_3.pt")
return Sha1StorePaths return Sha1StorePaths
@pytest.fixture @pytest.fixture
def sha1_store(sha1_configs): def sha1_store(sha1_configs):
sha1_store = SHA1_store(sha1_configs.test_dirs, sha1_configs.metadata_file, sha1_configs.sha1_ref, init=False) repo = Repo(sha1_configs.test_path.parent, init=False)
return sha1_store sha1_store = SHA1_store(sha1_configs.test_dirs, init=False)
return repo, sha1_store
def test_setup(sha1_configs): def test_setup(sha1_configs):
# Set up the testing directory # Set up the testing directory
sha1_configs.test_dirs.mkdir(parents=True, exist_ok=True) # create test .wgit dir sha1_configs.test_dirs.mkdir(parents=True, exist_ok=True) # create test .wgit dir
sha1_configs.metadata_file.touch()
sha1_configs.sha1_ref.touch()
# Create the test checkpoint files # Create the test checkpoint files
sha1_configs.chkpt_dir.mkdir(exist_ok=False) sha1_configs.chkpt1a_dir.mkdir(exist_ok=False)
sha1_configs.checkpoint_1.touch() sha1_configs.chkpt1b_dir.mkdir(exist_ok=False)
sha1_configs.checkpoint_2.touch() sha1_configs.chkpt1c_dir.mkdir(exist_ok=False)
# Create random checkpoints # Create random checkpoints
size_list = [30e5, 35e5, 40e5] size_list = [25e5, 27e5, 30e5, 35e5, 40e5]
chkpts = [sha1_configs.checkpoint_1, sha1_configs.checkpoint_2, sha1_configs.checkpoint_3] chkpts = [
sha1_configs.checkpoint_1a,
sha1_configs.checkpoint_1b,
sha1_configs.checkpoint_1c,
sha1_configs.checkpoint_2,
sha1_configs.checkpoint_3,
]
for file, size in zip(chkpts, size_list): for file, size in zip(chkpts, size_list):
with open(file, "wb") as f: with open(file, "wb") as f:
f.write(os.urandom(int(size))) f.write(os.urandom(int(size)))
sha1_store = SHA1_store(sha1_configs.test_dirs, sha1_configs.metadata_file, sha1_configs.sha1_ref, init=True) repo = Repo(sha1_configs.test_path.parent, init=True)
sha1_store = SHA1_store(sha1_configs.test_dirs, init=True)
return sha1_store return sha1_store
def test_sha1_add(sha1_configs, sha1_store): def test_sha1_add(sha1_configs, sha1_store):
# add the file to sha1_store repo, sha1_store = sha1_store
sha1_store.add(sha1_configs.checkpoint_1)
with open(sha1_configs.metadata_file, "r") as file: # Add checkpoint_1: Create the meta_data
metadata = json.load(file) chkpt1 = sha1_configs.checkpoint_1a
metadata_file, parent_sha1 = repo._process_metadata_file(chkpt1.name)
file_sha1 = metadata["SHA1"]["__sha1_full__"] sha1_hash = sha1_store.add(sha1_configs.checkpoint_1a, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
# Check metadata file creation # for checkpoint 1
assert file_sha1 == sha1_store.get_sha1_hash(sha1_configs.checkpoint_1) metadata_file = sha1_configs.test_path.joinpath(sha1_configs.checkpoint_1a.name)
assert metadata["file_path"] == str(sha1_configs.test_path.joinpath(sha1_store.name, file_sha1[:2], file_sha1[2:]))
with open(metadata_file, "r") as file:
metadata = json.load(file)
assert metadata["SHA1"]["__sha1_full__"] == sha1_hash
def test_sha1_refs(sha1_configs, sha1_store): def test_sha1_refs(sha1_configs, sha1_store):
# Check reference creation repo, sha1_store = sha1_store
def add_checkpoint(checkpoint):
metadata_file, parent_sha1 = repo._process_metadata_file(checkpoint.name)
sha1_hash = sha1_store.add(checkpoint, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
return sha1_hash
with open(sha1_configs.sha1_ref, "r") as file: with open(sha1_configs.sha1_ref, "r") as file:
refs_data = json.load(file) refs_data = json.load(file)
# get checkpoint1 sha1 # get checkpoint1 sha1
sha1_chkpt1 = sha1_store.get_sha1_hash(sha1_configs.checkpoint_1) sha1_chkpt1a_hash = sha1_store._get_sha1_hash(sha1_configs.checkpoint_1a)
assert refs_data[sha1_chkpt1]["parent"] == "ROOT" assert refs_data[sha1_chkpt1a_hash]["parent"] == "ROOT"
assert refs_data[sha1_chkpt1]["child"] == "HEAD" assert refs_data[sha1_chkpt1a_hash]["ref_count"] == 1
assert refs_data[sha1_chkpt1]["ref_count"] == 0
ck1a_sha1_hash = sha1_store._get_sha1_hash(sha1_configs.checkpoint_1a)
# add checkpoint new version of checkpoint-1
ck1b_sha1_hash = add_checkpoint(sha1_configs.checkpoint_1b)
# add checkpoint 2 and checkpoint 3 # Add new checkpoints 2 and 3
sha1_store.add(sha1_configs.checkpoint_2) ck2_sha1_hash = add_checkpoint(sha1_configs.checkpoint_2)
sha1_store.add(sha1_configs.checkpoint_3) ck3_sha1_hash = add_checkpoint(sha1_configs.checkpoint_3)
# add another version of checkpoint 1
ck1c_sha1_hash = add_checkpoint(sha1_configs.checkpoint_1c)
# load ref file after Sha1 add # load ref file after Sha1 add
with open(sha1_configs.sha1_ref, "r") as file: with open(sha1_configs.sha1_ref, "r") as file:
refs_data = json.load(file) refs_data = json.load(file)
# get checkpoint1 sha1 # Tests for same file versions
sha1_chkpt2 = sha1_store.get_sha1_hash(sha1_configs.checkpoint_2) assert refs_data[ck1b_sha1_hash]["parent"] == ck1a_sha1_hash
sha1_chkpt3 = sha1_store.get_sha1_hash(sha1_configs.checkpoint_3) assert refs_data[ck1c_sha1_hash]["parent"] == ck1b_sha1_hash
assert refs_data[ck1b_sha1_hash]["ref_count"] == 1
assert refs_data[sha1_chkpt2]["parent"] == sha1_chkpt1 assert refs_data[ck1a_sha1_hash]["is_leaf"] is False
assert refs_data[sha1_chkpt2]["child"] == sha1_chkpt3 assert refs_data[ck1a_sha1_hash]["is_leaf"] is False
assert refs_data[sha1_chkpt2]["ref_count"] == 1 assert refs_data[ck1b_sha1_hash]["is_leaf"] is False
assert refs_data[ck1c_sha1_hash]["is_leaf"] is True
# Tests for new files
assert refs_data[ck2_sha1_hash]["parent"] == "ROOT"
assert refs_data[ck2_sha1_hash]["is_leaf"] is True
assert refs_data[ck3_sha1_hash]["parent"] == "ROOT"
assert refs_data[ck3_sha1_hash]["is_leaf"] is True
def test_tear_down(sha1_configs): def test_tear_down(sha1_configs):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment