Commit 9e768b59 authored by zhuwenwen
parents 7bc5a8e3 8aed02b9
......@@ -4,11 +4,10 @@ on:
pull_request:
types: [synchronize, opened, reopened]
paths:
- 'applications/Chat/coati/**'
- 'applications/Chat/requirements.txt'
- 'applications/Chat/setup.py'
- 'applications/Chat/examples/**'
- "applications/Chat/coati/**"
- "applications/Chat/requirements.txt"
- "applications/Chat/setup.py"
- "applications/Chat/examples/**"
jobs:
tests:
......@@ -20,7 +19,7 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
defaults:
run:
......@@ -29,28 +28,26 @@ jobs:
- name: Checkout ColossalAI
uses: actions/checkout@v2
- name: Install ColossalAI and ChatGPT
- name: Install ChatGPT
run: |
pip install -e .
cd applications/Chat
pip install -v .
pip install -r examples/requirements.txt
- name: Install Transformers
run: |
cd applications/Chat
git clone https://github.com/hpcaitech/transformers
cd transformers
pip install -v .
pip install transformers==4.30.2
- name: Execute Examples
run: |
cd applications/Chat
rm -rf ~/.cache/colossalai
./examples/test_ci.sh
./tests/test_inference.sh
./tests/test_benchmarks.sh
./tests/test_train.sh
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
SFT_DATASET: /data/scratch/github_actions/chat/data.json
PROMPT_PATH: /data/scratch/github_actions/chat/prompts_en.jsonl
PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl
PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json
......@@ -30,9 +30,8 @@ jobs:
- name: Checkout ColossalAI
uses: actions/checkout@v2
- name: Install ColossalAI and ChatGPT
- name: Install ChatGPT
run: |
pip install -e .
cd applications/Chat
pip install -v .
pip install -r requirements-test.txt
......
......@@ -22,13 +22,13 @@ def compare_dirs(dir1, dir2):
# If the corresponding item doesn't exist in the second directory, the directories are different
if not os.path.exists(item_path2):
print(f'Found mismatch: {item_path1}, {item_path2}')
print(f"Found mismatch: {item_path1}, {item_path2}")
return False
# If the corresponding item is a directory, we compare the two directories recursively
if os.path.isdir(item_path1) and os.path.isdir(item_path2):
if not compare_dirs(item_path1, item_path2):
print(f'Found mismatch: {item_path1}, {item_path2}')
print(f"Found mismatch: {item_path1}, {item_path2}")
return False
# both are files
......@@ -37,16 +37,16 @@ def compare_dirs(dir1, dir2):
# If the corresponding item is not a file or a directory, the directories are different
else:
print(f'Found mismatch: {item_path1}, {item_path2}')
print(f"Found mismatch: {item_path1}, {item_path2}")
return False
# If all items are the same, the directories are the same
return True
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--directory', help="The directory where the multi-language source files are kept.")
parser.add_argument("-d", "--directory", help="The directory where the multi-language source files are kept.")
args = parser.parse_args()
i18n_folders = os.listdir(args.directory)
......@@ -56,7 +56,7 @@ if __name__ == '__main__':
for i in range(1, len(i18n_folders)):
dir1 = i18n_folders[0]
dir2 = i18n_folders[i]
print(f'comparing {dir1} vs {dir2}')
print(f"comparing {dir1} vs {dir2}")
match = compare_dirs(i18n_folders[0], i18n_folders[i])
if not match:
......
......@@ -4,7 +4,7 @@ import os
def check_inputs(input_list):
for path in input_list:
real_path = os.path.join('examples', path)
real_path = os.path.join("examples", path)
if not os.path.exists(real_path):
return False
return True
......@@ -12,16 +12,16 @@ def check_inputs(input_list):
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fileNameList', type=str, help="List of file names")
parser.add_argument("-f", "--fileNameList", type=str, help="List of file names")
args = parser.parse_args()
name_list = args.fileNameList.split(",")
is_correct = check_inputs(name_list)
if is_correct:
print('success')
print("success")
else:
print('failure')
print("failure")
if __name__ == '__main__':
if __name__ == "__main__":
main()
......@@ -17,21 +17,21 @@ def show_files(path, all_files):
def join(input_list, sep=None):
return (sep or ' ').join(input_list)
return (sep or " ").join(input_list)
def main():
contents = show_files('examples/', [])
contents = show_files("examples/", [])
all_loc = []
for file_loc in contents:
split_loc = file_loc.split('/')
split_loc = file_loc.split("/")
# must have two sub-folder levels under the examples folder; e.g. examples/images/vit is acceptable, while examples/images/README.md and examples/requirements.txt are not.
if len(split_loc) >= 4:
re_loc = '/'.join(split_loc[1:3])
re_loc = "/".join(split_loc[1:3])
if re_loc not in all_loc:
all_loc.append(re_loc)
print(all_loc)
if __name__ == '__main__':
if __name__ == "__main__":
main()
......@@ -3,7 +3,7 @@ import argparse
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
parser.add_argument("-f", "--fileNameList", type=str, help="The list of changed files")
args = parser.parse_args()
name_list = args.fileNameList.split(":")
folder_need_check = set()
......@@ -15,10 +15,10 @@ def main():
# - application
# - file
if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
folder_need_check.add('/'.join(loc.split("/")[1:3]))
folder_need_check.add("/".join(loc.split("/")[1:3]))
# Output the result using print. Then the shell can get the values.
print(list(folder_need_check))
if __name__ == '__main__':
if __name__ == "__main__":
main()
import os
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List
......@@ -10,8 +9,7 @@ import seaborn
from requests_toolbelt import MultipartEncoder
@dataclass
class Contributor:
class Counter(dict):
"""
Dataclass for a github contributor.
......@@ -19,8 +17,40 @@ class Contributor:
name (str): name of the contributor
num_commits_this_week (int): number of commits made within one week
"""
name: str
num_commits_this_week: int
def record(self, item: str):
if item in self:
self[item] += 1
else:
self[item] = 1
def to_sorted_list(self):
data = [(key, value) for key, value in self.items()]
data.sort(key=lambda x: x[1], reverse=True)
return data
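# Illustrative usage of the Counter helper above (names are hypothetical):
#   c = Counter(); c.record("alice"); c.record("alice"); c.record("bob")
#   c.to_sorted_list()  # -> [("alice", 2), ("bob", 1)]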
def get_utc_time_one_week_ago():
"""
Get the UTC time one week ago.
"""
now = datetime.utcnow()
start_datetime = now - timedelta(days=7)
return start_datetime
def datetime2str(dt):
"""
Convert datetime to string in the format of YYYY-MM-DDTHH:MM:SSZ
"""
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
def str2datetime(string):
"""
Convert string in the format of YYYY-MM-DDTHH:MM:SSZ to datetime
"""
return datetime.strptime(string, "%Y-%m-%dT%H:%M:%SZ")
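# Round-trip example for the two helpers above (illustrative):
#   datetime2str(str2datetime("2023-10-01T12:00:00Z")) == "2023-10-01T12:00:00Z"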
def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title: str, output_path: str) -> None:
......@@ -36,9 +66,30 @@ def plot_bar_chart(x: List[Any], y: List[Any], xlabel: str, ylabel: str, title:
plt.savefig(output_path, dpi=1200)
def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str, int]:
def get_organization_repositories(github_token, organization_name) -> List[str]:
"""
Retrieve the public repositories under the organization.
"""
url = f"https://api.github.com/orgs/{organization_name}/repos?type=public"
# prepare header
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
res = requests.get(url, headers=headers).json()
repo_list = []
for item in res:
repo_list.append(item["name"])
return repo_list
def get_issue_pull_request_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrive the issue/PR comments made by our members in the last 7 days.
Retrieve the issue/PR comments made by our members in the last 7 days.
Args:
github_token (str): GitHub access token for API calls
......@@ -46,9 +97,9 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
"""
# prepare header
headers = {
'Authorization': f'Bearer {github_token}',
'Accept': 'application/vnd.github+json',
'X-GitHub-Api-Version': '2022-11-28'
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
user_engagement_count = {}
......@@ -56,28 +107,28 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
# do pagination to the API
page = 1
while True:
comment_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/comments?since={since}&page={page}'
comment_api = f"https://api.github.com/repos/{org_name}/{repo_name}/issues/comments?since={since}&page={page}"
comment_response = requests.get(comment_api, headers=headers).json()
if len(comment_response) == 0:
break
else:
for item in comment_response:
comment_author_relationship = item['author_association']
if comment_author_relationship != 'MEMBER':
comment_author_relationship = item["author_association"]
if comment_author_relationship != "MEMBER":
# if the comment is not made by our member
# we don't count this comment towards user engagement
continue
issue_id = item['issue_url'].split('/')[-1]
issue_api = f'https://api.github.com/repos/hpcaitech/ColossalAI/issues/{issue_id}'
issue_id = item["issue_url"].split("/")[-1]
issue_api = f"https://api.github.com/repos/{org_name}/{repo_name}/issues/{issue_id}"
issue_response = requests.get(issue_api, headers=headers).json()
issue_author_relationship = issue_response['author_association']
issue_author_relationship = issue_response["author_association"]
if issue_author_relationship != 'MEMBER':
if issue_author_relationship != "MEMBER":
# this means that the issue/PR is not created by our own people
# any comments in this issue/PR by our member will be counted towards the leaderboard
member_name = item['user']['login']
member_name = item["user"]["login"]
if member_name in user_engagement_count:
user_engagement_count[member_name] += 1
......@@ -87,9 +138,9 @@ def get_issue_pull_request_comments(github_token: str, since: str) -> Dict[str,
return user_engagement_count
def get_discussion_comments(github_token, since) -> Dict[str, int]:
def get_discussion_comments(github_token: str, org_name: str, repo_name: str, since: str) -> Dict[str, int]:
"""
Retrive the discussion comments made by our members in the last 7 days.
Retrieve the discussion comments made by our members in the last 7 days.
This is only available via the GitHub GraphQL API.
Args:
......@@ -102,10 +153,10 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
if cursor is None:
offset_str = ""
else:
offset_str = f", after: \"{cursor}\""
offset_str = f', after: "{cursor}"'
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussions(first: {num} {offset_str}){{
edges {{
cursor
......@@ -131,10 +182,10 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
if cursor is None:
offset_str = ""
else:
offset_str = f", before: \"{cursor}\""
offset_str = f', before: "{cursor}"'
query = f"""
{{
repository(owner: "hpcaitech", name: "ColossalAI"){{
repository(owner: "{org_name}", name: "{repo_name}"){{
discussion(number: {discussion_number}){{
title
comments(last: {num} {offset_str}){{
......@@ -169,8 +220,8 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
# a utility function to make call to Github GraphQL API
def _call_graphql_api(query):
headers = {"Authorization": f"Bearer {github_token}"}
json_data = {'query': query}
response = requests.post('https://api.github.com/graphql', json=json_data, headers=headers)
json_data = {"query": query}
response = requests.post("https://api.github.com/graphql", json=json_data, headers=headers)
data = response.json()
return data
......@@ -183,21 +234,21 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
data = _call_graphql_api(query)
found_discussion_out_of_time_range = False
edges = data['data']['repository']['discussions']['edges']
edges = data["data"]["repository"]["discussions"]["edges"]
if len(edges) == 0:
break
else:
# keep the discussion whose author is not a member
for edge in edges:
# print the discussion title
discussion = edge['node']
discussion = edge["node"]
discussion_updated_at = str2datetime(discussion["updatedAt"])
discussion_updated_at = datetime.strptime(discussion['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
# check if the updatedAt is within the last 7 days
# if yes, add it to dicussion_numbers
# if yes, add it to discussion_numbers
if discussion_updated_at > since:
if discussion['authorAssociation'] != 'MEMBER':
discussion_numbers.append(discussion['number'])
if discussion["authorAssociation"] != "MEMBER":
discussion_numbers.append(discussion["number"])
else:
found_discussion_out_of_time_range = True
......@@ -205,54 +256,55 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
break
else:
# update cursor
cursor = edges[-1]['cursor']
cursor = edges[-1]["cursor"]
# get the dicussion comments and replies made by our member
# get the discussion comments and replies made by our member
user_engagement_count = {}
for dicussion_number in discussion_numbers:
for discussion_number in discussion_numbers:
cursor = None
num_per_request = 10
while True:
query = _generate_comment_reply_count_for_discussion(dicussion_number, num_per_request, cursor)
query = _generate_comment_reply_count_for_discussion(discussion_number, num_per_request, cursor)
data = _call_graphql_api(query)
# get the comments
edges = data['data']['repository']['discussion']['comments']['edges']
edges = data["data"]["repository"]["discussion"]["comments"]["edges"]
# update the cursor
if len(edges) == 0:
break
else:
# update cursor for pagination
cursor = edges[-1]['cursor']
cursor = edges[-1]["cursor"]
for edge in edges:
comment = edge['node']
if comment['authorAssociation'] == 'MEMBER':
comment = edge["node"]
if comment["authorAssociation"] == "MEMBER":
# check if the updatedAt is within the last 7 days
# if yes, add it to user_engagement_count
comment_updated_at = datetime.strptime(comment['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
comment_updated_at = datetime.strptime(comment["updatedAt"], "%Y-%m-%dT%H:%M:%SZ")
if comment_updated_at > since:
member_name = comment['author']['login']
member_name = comment["author"]["login"]
if member_name in user_engagement_count:
user_engagement_count[member_name] += 1
else:
user_engagement_count[member_name] = 1
# get the replies
reply_edges = comment['replies']['edges']
reply_edges = comment["replies"]["edges"]
if len(reply_edges) == 0:
continue
else:
for reply_edge in reply_edges:
reply = reply_edge['node']
if reply['authorAssociation'] == 'MEMBER':
reply = reply_edge["node"]
if reply["authorAssociation"] == "MEMBER":
# check if the updatedAt is within the last 7 days
# if yes, add it to dicussion_numbers
reply_updated_at = datetime.strptime(reply['updatedAt'], "%Y-%m-%dT%H:%M:%SZ")
# if yes, add it to discussion_numbers
reply_updated_at = datetime.strptime(reply["updatedAt"], "%Y-%m-%dT%H:%M:%SZ")
if reply_updated_at > since:
member_name = reply['author']['login']
member_name = reply["author"]["login"]
if member_name in user_engagement_count:
user_engagement_count[member_name] += 1
else:
......@@ -260,7 +312,9 @@ def get_discussion_comments(github_token, since) -> Dict[str, int]:
return user_engagement_count
def generate_user_engagement_leaderboard_image(github_token: str, output_path: str) -> bool:
def generate_user_engagement_leaderboard_image(
github_token: str, org_name: str, repo_list: List[str], output_path: str
) -> bool:
"""
Generate the user engagement leaderboard image for stats within the last 7 days
......@@ -270,22 +324,31 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
"""
# request to the Github API to get the users who have replied the most in the last 7 days
now = datetime.utcnow()
start_datetime = now - timedelta(days=7)
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
start_datetime = get_utc_time_one_week_ago()
start_datetime_str = datetime2str(start_datetime)
# get the issue/PR comments and discussion comment count
issue_pr_engagement_count = get_issue_pull_request_comments(github_token=github_token, since=start_datetime_str)
discussion_engagement_count = get_discussion_comments(github_token=github_token, since=start_datetime)
total_engagement_count = {}
# update the total engagement count
total_engagement_count.update(issue_pr_engagement_count)
for name, count in discussion_engagement_count.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count
def _update_count(counter):
for name, count in counter.items():
if name in total_engagement_count:
total_engagement_count[name] += count
else:
total_engagement_count[name] = count
for repo_name in repo_list:
print(f"Fetching user engagement count for {repo_name}/{repo_name}")
issue_pr_engagement_count = get_issue_pull_request_comments(
github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime_str
)
discussion_engagement_count = get_discussion_comments(
github_token=github_token, org_name=org_name, repo_name=repo_name, since=start_datetime
)
# update the total engagement count
_update_count(issue_pr_engagement_count)
_update_count(discussion_engagement_count)
# prepare the data for plotting
x = []
......@@ -302,20 +365,17 @@ def generate_user_engagement_leaderboard_image(github_token: str, output_path: s
x.append(count)
y.append(name)
# use Shanghai time to display on the image
start_datetime_str = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%dT%H:%M:%SZ")
# plot the leaderboard
xlabel = f"Number of Comments made (since {start_datetime_str})"
ylabel = "Member"
title = 'Active User Engagement Leaderboard'
title = "Active User Engagement Leaderboard"
plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
return True
else:
return False
def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
def generate_contributor_leaderboard_image(github_token, org_name, repo_list, output_path) -> bool:
"""
Generate the contributor leaderboard image for stats within the last 7 days
......@@ -324,54 +384,81 @@ def generate_contributor_leaderboard_image(github_token, output_path) -> bool:
output_path (str): the path to save the image
"""
# request to the Github API to get the users who have contributed in the last 7 days
URL = 'https://api.github.com/repos/hpcaitech/ColossalAI/stats/contributors'
headers = {
'Authorization': f'Bearer {github_token}',
'Accept': 'application/vnd.github+json',
'X-GitHub-Api-Version': '2022-11-28'
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
while True:
response = requests.get(URL, headers=headers).json()
counter = Counter()
start_datetime = get_utc_time_one_week_ago()
if len(response) != 0:
# sometimes the Github API returns empty response for unknown reason
# request again if the response is empty
break
def _get_url(org_name, repo_name, page):
return f"https://api.github.com/repos/{org_name}/{repo_name}/pulls?per_page=50&page={page}&state=closed"
contributor_list = []
def _iterate_by_page(org_name, repo_name):
page = 1
stop = False
# get number of commits for each contributor
start_timestamp = None
for item in response:
num_commits_this_week = item['weeks'][-1]['c']
name = item['author']['login']
contributor = Contributor(name=name, num_commits_this_week=num_commits_this_week)
contributor_list.append(contributor)
while not stop:
print(f"Fetching pull request data for {org_name}/{repo_name} - page{page}")
url = _get_url(org_name, repo_name, page)
# update start_timestamp
start_timestamp = item['weeks'][-1]['w']
while True:
response = requests.get(url, headers=headers).json()
if isinstance(response, list):
# sometimes the Github API returns nothing
# request again if the response is not a list
break
print("Empty response, request again...")
if len(response) == 0:
# if the response is empty, stop
stop = True
break
# count the pull request and author from response
for pr_data in response:
merged_at = pr_data["merged_at"]
author = pr_data["user"]["login"]
if merged_at is None:
continue
merge_datetime = str2datetime(merged_at)
if merge_datetime < start_datetime:
# if we found a pull request that is merged before the start_datetime
# we stop
stop = True
break
else:
# record the author
counter.record(author)
# next page
page += 1
for repo_name in repo_list:
_iterate_by_page(org_name, repo_name)
# convert unix timestamp to Beijing datetime
start_datetime = datetime.fromtimestamp(start_timestamp, tz=pytz.timezone('Asia/Shanghai'))
start_datetime_str = start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
bj_start_datetime = datetime.fromtimestamp(start_datetime.timestamp(), tz=pytz.timezone("Asia/Shanghai"))
bj_start_datetime_str = datetime2str(bj_start_datetime)
# sort by number of commits
contributor_list.sort(key=lambda x: x.num_commits_this_week, reverse=True)
contribution_list = counter.to_sorted_list()
# remove contributors who has zero commits
contributor_list = [x for x in contributor_list if x.num_commits_this_week > 0]
# prepare the data for plotting
x = [x.num_commits_this_week for x in contributor_list]
y = [x.name for x in contributor_list]
author_list = [x[0] for x in contribution_list]
num_commit_list = [x[1] for x in contribution_list]
# plot
if len(x) > 0:
xlabel = f"Number of Commits (since {start_datetime_str})"
if len(author_list) > 0:
xlabel = f"Number of Pull Requests (since {bj_start_datetime_str})"
ylabel = "Contributor"
title = 'Active Contributor Leaderboard'
plot_bar_chart(x, y, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
title = "Active Contributor Leaderboard"
plot_bar_chart(num_commit_list, author_list, xlabel=xlabel, ylabel=ylabel, title=title, output_path=output_path)
return True
else:
return False
......@@ -386,14 +473,14 @@ def upload_image_to_lark(lark_tenant_token: str, image_path: str) -> str:
image_path (str): the path to the image to be uploaded
"""
url = "https://open.feishu.cn/open-apis/im/v1/images"
form = {'image_type': 'message', 'image': (open(image_path, 'rb'))} # 需要替换具体的path
form = {"image_type": "message", "image": (open(image_path, "rb"))} # 需要替换具体的path
multi_form = MultipartEncoder(form)
headers = {
'Authorization': f'Bearer {lark_tenant_token}', ## 获取tenant_access_token, 需要替换为实际的token
"Authorization": f"Bearer {lark_tenant_token}", ## 获取tenant_access_token, 需要替换为实际的token
}
headers['Content-Type'] = multi_form.content_type
headers["Content-Type"] = multi_form.content_type
response = requests.request("POST", url, headers=headers, data=multi_form).json()
return response['data']['image_key']
return response["data"]["image_key"]
def generate_lark_tenant_access_token(app_id: str, app_secret: str) -> str:
......@@ -404,10 +491,10 @@ def generate_lark_tenant_access_token(app_id: str, app_secret: str) -> str:
app_id (str): Lark app id
app_secret (str): Lark app secret
"""
url = 'https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal'
data = {'app_id': app_id, 'app_secret': app_secret}
url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
data = {"app_id": app_id, "app_secret": app_secret}
response = requests.post(url, json=data).json()
return response['tenant_access_token']
return response["tenant_access_token"]
def send_image_to_lark(image_key: str, webhook_url: str) -> None:
......@@ -434,31 +521,37 @@ def send_message_to_lark(message: str, webhook_url: str):
requests.post(webhook_url, json=data)
if __name__ == '__main__':
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
CONTRIBUTOR_IMAGE_PATH = 'contributor_leaderboard.png'
USER_ENGAGEMENT_IMAGE_PATH = 'engagement_leaderboard.png'
if __name__ == "__main__":
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
CONTRIBUTOR_IMAGE_PATH = "contributor_leaderboard.png"
USER_ENGAGEMENT_IMAGE_PATH = "engagement_leaderboard.png"
ORG_NAME = "hpcaitech"
# get all open source repositories
REPO_LIST = get_organization_repositories(GITHUB_TOKEN, ORG_NAME)
# generate images
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(GITHUB_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
contrib_success = generate_contributor_leaderboard_image(GITHUB_TOKEN, ORG_NAME, REPO_LIST, CONTRIBUTOR_IMAGE_PATH)
engagement_success = generate_user_engagement_leaderboard_image(
GITHUB_TOKEN, ORG_NAME, REPO_LIST, USER_ENGAGEMENT_IMAGE_PATH
)
# upload images
APP_ID = os.environ['LARK_APP_ID']
APP_SECRET = os.environ['LARK_APP_SECRET']
APP_ID = os.environ["LARK_APP_ID"]
APP_SECRET = os.environ["LARK_APP_SECRET"]
LARK_TENANT_TOKEN = generate_lark_tenant_access_token(app_id=APP_ID, app_secret=APP_SECRET)
contributor_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, CONTRIBUTOR_IMAGE_PATH)
user_engagement_image_key = upload_image_to_lark(LARK_TENANT_TOKEN, USER_ENGAGEMENT_IMAGE_PATH)
# send message to lark
LARK_WEBHOOK_URL = os.environ['LARK_WEBHOOK_URL']
LARK_WEBHOOK_URL = os.environ["LARK_WEBHOOK_URL"]
message = """本周的社区榜单出炉啦!
1. 开发贡献者榜单
2. 用户互动榜单
注:
- 开发贡献者测评标准为:本周由公司成员提交的commit次数
- 用户互动榜单测评标准为:本周由公司成员在非成员创建的issue/PR/discussion中回复的次数
- 开发贡献者测评标准为:本周由公司成员与社区在所有开源仓库提交的Pull Request次数
- 用户互动榜单测评标准为:本周由公司成员在非成员在所有开源仓库创建的issue/PR/discussion中回复的次数
"""
send_message_to_lark(message, LARK_WEBHOOK_URL)
......@@ -467,7 +560,7 @@ if __name__ == '__main__':
if contrib_success:
send_image_to_lark(contributor_image_key, LARK_WEBHOOK_URL)
else:
send_message_to_lark("本周没有成员贡献commit,无榜单图片生成。", LARK_WEBHOOK_URL)
send_message_to_lark("本周没有成员贡献PR,无榜单图片生成。", LARK_WEBHOOK_URL)
# send user engagement image to lark
if engagement_success:
......
......@@ -7,27 +7,27 @@ import re
import requests
COMMIT_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/commits'
TAGS_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/tags'
COMMIT_API = "https://api.github.com/repos/hpcaitech/ColossalAI/commits"
TAGS_API = "https://api.github.com/repos/hpcaitech/ColossalAI/tags"
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='output path for the release draft', required=True)
parser.add_argument('--version', type=str, help='current version to release', required=True)
parser.add_argument("--out", type=str, help="output path for the release draft", required=True)
parser.add_argument("--version", type=str, help="current version to release", required=True)
return parser.parse_args()
def get_latest_tag_commit(headers=None):
res = requests.get(url=TAGS_API, headers=headers)
data = res.json()
commit_hash = data[0]['commit']['sha']
version = data[0]['name']
commit_hash = data[0]["commit"]["sha"]
version = data[0]["name"]
return commit_hash, version
def get_commit_info(commit_hash, headers=None):
api = f'{COMMIT_API}/{commit_hash}'
api = f"{COMMIT_API}/{commit_hash}"
res = requests.get(url=api, headers=headers)
return res.json()
......@@ -37,7 +37,7 @@ def get_all_commit_info(since, headers=None):
results = []
while True:
api = f'{COMMIT_API}?since={since}&per_page=100&page={page}'
api = f"{COMMIT_API}?since={since}&per_page=100&page={page}"
resp = requests.get(url=api, headers=headers)
data = resp.json()
......@@ -53,21 +53,21 @@ def get_all_commit_info(since, headers=None):
def collate_release_info(commit_info_list):
results = dict()
pattern = pattern = r'\[.*\]'
pattern = r"\[.*\]"
for commit_info in commit_info_list:
author = commit_info['commit']['author']['name']
author = commit_info["commit"]["author"]["name"]
try:
author_url = commit_info['author']['url']
author_url = commit_info["author"]["url"]
except:
# author can be None
author_url = None
msg = commit_info['commit']['message']
msg = commit_info["commit"]["message"]
match = re.search(pattern, msg)
if match:
tag = match.group().lstrip('[').rstrip(']').capitalize()
tag = match.group().lstrip("[").rstrip("]").capitalize()
if tag not in results:
results[tag] = []
results[tag].append((msg, author, author_url))
......@@ -89,42 +89,43 @@ def generate_release_post_markdown(current_version, last_version, release_info):
for msg, author, author_url in v:
# only keep the first line
msg = msg.split('\n')[0]
msg = msg.split("\n")[0]
if author_url:
item = f'{msg} by [{author}]({author_url})\n'
item = f"{msg} by [{author}]({author_url})\n"
else:
item = f'{msg} by {author}\n'
text.append(f'- {item}')
item = f"{msg} by {author}\n"
text.append(f"- {item}")
text.append('\n')
text.append("\n")
# add full change log
text.append(
f'**Full Changelog**: https://github.com/hpcaitech/ColossalAI/compare/{current_version}...{last_version}')
f"**Full Changelog**: https://github.com/hpcaitech/ColossalAI/compare/{current_version}...{last_version}"
)
return text
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_args()
token = os.environ['GITHUB_API_TOKEN']
headers = {'Authorization': token}
token = os.environ["GITHUB_API_TOKEN"]
headers = {"Authorization": token}
# get previous release tag
last_release_commit, last_version = get_latest_tag_commit(headers)
last_release_commit_info = get_commit_info(last_release_commit, headers=headers)
last_release_date = last_release_commit_info['commit']['author']['date']
last_release_date = last_release_commit_info["commit"]["author"]["date"]
# get the commits since last release
commit_info = get_all_commit_info(since=last_release_date, headers=headers)
commit_info = commit_info[:-1] # remove the release commit
commit_info = commit_info[:-1] # remove the release commit
# collate into markdown
release_info = collate_release_info(commit_info)
markdown_text = generate_release_post_markdown(args.version, last_version, release_info)
# write into a file
with open(args.out, 'w') as f:
with open(args.out, "w") as f:
for line in markdown_text:
f.write(line)
......@@ -5,8 +5,8 @@ import requests
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--message', type=str)
parser.add_argument('-u', '--url', type=str)
parser.add_argument("-m", "--message", type=str)
parser.add_argument("-u", "--url", type=str)
return parser.parse_args()
......@@ -15,6 +15,6 @@ def send_message_to_lark(message, webhook_url):
requests.post(webhook_url, json=data)
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_args()
send_message_to_lark(args.message, args.url)
......@@ -155,3 +155,7 @@ colossalai/version.py
# ignore coverage test file
coverage.lcov
coverage.xml
# ignore testmon and coverage files
.coverage
.testmondata*
......@@ -3,3 +3,5 @@ line_length = 120
multi_line_output=3
include_trailing_comma = true
ignore_comments = true
profile = black
honor_noqa = true
repos:
- repo: https://github.com/PyCQA/autoflake
rev: v2.2.1
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
name: sort all imports (python)
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
hooks:
- id: yapf
name: yapf formatter
args: ['--style=.style.yapf', '--parallel', '--in-place']
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v13.0.1
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
......
[style]
based_on_style = google
spaces_before_comment = 4
split_before_logical_operator = true
column_limit = 120
......@@ -30,6 +30,12 @@ pip install <options> -e .
### Unit Tests
We use [PyTest](https://docs.pytest.org/en/latest/) to execute tests. You can install pytest with `pip install pytest`. As some of the tests require initialization of the distributed backend, GPUs are needed to execute them.
To set up the environment for unit testing, first change your current directory to the root directory of your local ColossalAI repository, then run
```bash
pip install -r requirements/requirements-test.txt
```
If you encounter an error saying "Could not find a version that satisfies the requirement fbgemm-gpu==0.2.0", please downgrade your Python version to 3.8 or 3.9 and try again.
If you only want to run CPU tests, you can run
```bash
......@@ -138,4 +144,4 @@ You can now create a pull request on the GitHub webpage of your repository. The
Write a clear description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved.
In case of code conflict, you should rebase your branch and resolve the conflicts manually.
\ No newline at end of file
In case of code conflict, you should rebase your branch and resolve the conflicts manually.
......@@ -396,3 +396,84 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
---------------- LICENSE FOR VLLM TEAM ----------------
from VLLM TEAM:
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/vllm-project/vllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---------------- LICENSE FOR LIGHTLLM TEAM ----------------
from LIGHTLLM TEAM:
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/ModelTC/lightllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---------------- LICENSE FOR AutoGPTQ ----------------
From AutoGPTQ:
MIT License
Copyright (c) 2023 潘其威(William)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---------------- LICENSE FOR exllama ----------------
From exllama:
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
......@@ -16,7 +16,7 @@
[![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
[![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
[![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech)
[![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w)
[![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack)
[![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&amp)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png)
......@@ -25,14 +25,15 @@
</div>
## Latest News
* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
## Table of Contents
<ul>
......@@ -41,6 +42,7 @@
<li>
<a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
<ul>
<li><a href="#Colossal-LLaMA-2">Colossal-LLaMA-2: One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution</a></li>
<li><a href="#ColossalChat">ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline</a></li>
<li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
<li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
......@@ -49,6 +51,7 @@
<li>
<a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul>
<li><a href="#LLaMA2">LLaMA 1/2</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
......@@ -124,15 +127,55 @@ distributed training and inference in a few lines.
## Colossal-AI in the Real World
### Colossal-LLaMA-2
- One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
[[model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
| | | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| | | | | | | | | |
| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
| | | | | | | | | |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
### ColossalChat
<div align="center">
<a href="https://chat.colossalai.org/">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" />
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) [[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://chat.colossalai.org)
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>
- Up to 10 times faster for RLHF PPO Stage3 Training
<p id="ColossalChat_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
......@@ -205,6 +248,23 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
<p align="right">(<a href="#top">back to top</a>)</p>
## Parallel Training Demo
### LLaMA2
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
</p>
- 70 billion parameter LLaMA2 model training accelerated by 195%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
### LLaMA1
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
</p>
- 65-billion-parameter large model pretraining accelerated by 38%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)
### GPT-3
<p align="center">
......@@ -352,6 +412,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh
CUDA_EXT=1 pip install .
```
For users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
```bash
# clone the repository
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# download the cub library
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
# install
CUDA_EXT=1 pip install .
```
<p align="right">(<a href="#top">back to top</a>)</p>
## Use Docker
......@@ -426,6 +502,7 @@ To cite this project, you can use the following BibTeX citation.
}
```
Colossal-AI has been accepted as official tutorial by top conferences [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
Colossal-AI has been accepted as an official tutorial by top conferences [NeurIPS](https://nips.cc/), [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/),
[PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), [NVIDIA GTC](https://www.nvidia.com/en-us/on-demand/session/gtcspring23-S51482/), etc.
<p align="right">(<a href="#top">back to top</a>)</p>
......@@ -145,4 +145,4 @@ docs/.build
# wandb log
example/wandb/
examples/awesome-chatgpt-prompts/
\ No newline at end of file
examples/awesome-chatgpt-prompts/
......@@ -4,7 +4,6 @@
<span>ColossalChat</span>
</h1>
## Table of Contents
- [Table of Contents](#table-of-contents)
......@@ -34,7 +33,9 @@
- [Authors](#authors)
- [Citations](#citations)
- [Licenses](#licenses)
---
## What is ColossalChat and Coati?
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) is a project to implement LLMs with RLHF, powered by the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) project.
......@@ -42,6 +43,7 @@
Coati stands for `ColossalAI Talking Intelligence`. It is the name for the module implemented in this project and is also the name of the large language model developed by the ColossalChat project.
The Coati package provides a unified large language model framework that implements the following functions:
- Supports comprehensive large-model training acceleration capabilities for ColossalAI, without requiring knowledge of complex distributed training algorithms
- Supervised datasets collection
- Supervised instructions fine-tuning
......@@ -56,29 +58,42 @@ The Coati package provides a unified large language model framework that has imp
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/>
</p>
Image source: https://openai.com/blog/chatgpt
Image source: https://openai.com/blog/chatgpt
</div>
**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
More details can be found in the latest news.
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
- [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
- [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
## Online demo
You can experience the performance of Coati7B on this page.
[chat.colossalai.org](https://chat.colossalai.org/)
<div align="center">
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
Due to resource constraints, we will only provide this service from 29th Mar 2023 to 5 April 2023. However, we have provided the inference code in the [inference](./inference/) folder. The WebUI will be open-sourced soon as well.
<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>
> DeepSpeedChat performance figures come from its blog post of April 12, 2023. ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs using the following command: `torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32`
> Warning: Due to model and dataset size limitations, Coati is just a baby model; Coati7B may output incorrect information and lacks the ability for multi-turn dialogue. There is still significant room for improvement.
## Install
### Install the environment
```shell
```bash
conda create -n coati
conda activate coati
git clone https://github.com/hpcaitech/ColossalAI.git
......@@ -87,22 +102,20 @@ pip install .
```
### Install Transformers
Given Hugging Face hasn't officially supported the LLaMA models, We fork a branch of Transformers that can be compatible with our code
```shell
git clone https://github.com/hpcaitech/transformers
cd transformers
pip install .
```bash
pip install transformers==4.30.2
```
## How to use?
### Supervised datasets collection
we collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo
[InstructionWild](https://github.com/XueFuzhao/InstructionWild)
We collected a 104K bilingual (Chinese and English) dataset; you can find it in the
[InstructionWild](https://github.com/XueFuzhao/InstructionWild) repo and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
Here is how we collected the data:
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
</p>
......@@ -112,12 +125,28 @@ Here is how we collected the data
Stage1 is supervised instruction fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
You can run `examples/train_sft.sh` to start supervised instruction fine-tuning.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
**Note**: the supervised dataset uses the following format (a short validation sketch follows the example):
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
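As a quick sanity check, the minimal sketch below verifies that a dataset file matches this format; the `data.json` path and the required keys are assumptions taken from the example above rather than a helper shipped with the repo.

```python
import json

# Load a candidate SFT dataset and check the fields used in the example above.
with open("data.json") as f:  # path is a placeholder
    records = json.load(f)

required = {"instruction", "input", "output", "id"}
for i, record in enumerate(records):
    missing = required - record.keys()
    assert not missing, f"record {i} is missing fields: {missing}"
print(f"{len(records)} records match the expected SFT format")
```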
### RLHF Training Stage2 - Training reward model
Stage2 trains a reward model: human annotators rank different outputs for the same prompt, and the resulting scores supervise the training of the reward model.
You can run `examples/train_rm.sh` to start reward model training.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
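Conceptually, the ranking supervision above amounts to a pairwise objective: the reward assigned to the preferred response should exceed that of the rejected one. The sketch below is illustrative only and may differ from the exact loss used by `train_rm.sh`.

```python
import torch
import torch.nn.functional as F

def pairwise_ranking_loss(chosen_reward: torch.Tensor, rejected_reward: torch.Tensor) -> torch.Tensor:
    # Push the reward of the human-preferred response above the rejected one.
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

# dummy scalar rewards for a batch of two preference pairs
loss = pairwise_ranking_loss(torch.tensor([1.2, 0.3]), torch.tensor([0.4, 0.5]))
```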
### RLHF Training Stage3 - Training model with reinforcement learning by human feedback
......@@ -128,6 +157,39 @@ Stage3 uses reinforcement learning algorithm, which is the most complex part of
</p>
You can run `examples/train_prompts.sh` to start PPO training with human feedback.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
**Note**: the required datasets use the following formats:
- `pretrain dataset`
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
- `prompt dataset`
```json
[
{
"instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id": 0
},
{
"instruction": "Write a descriptive paragraph about a memorable vacation you went on",
"id": 1
},
...
]
```
For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
......@@ -135,9 +197,9 @@ For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree
We provide an online inference server and a benchmark. We aim to run inference on a single GPU, so quantization is essential when using large models.
We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inference. You can
Online inference server scripts can help you deploy your own services.
We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inference.
Online inference server scripts can help you deploy your own services.
For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
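As an illustration, 8-bit weight loading with Hugging Face Transformers looks roughly like the sketch below; the checkpoint path is a placeholder, `load_in_8bit` additionally requires the `bitsandbytes` package, and the scripts in [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference) remain the authoritative reference.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/path/to/Coati-7B"  # placeholder checkpoint path
tokenizer = AutoTokenizer.from_pretrained(model_path)
# 8-bit (RTN-style) weight quantization at load time; needs a CUDA GPU
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", load_in_8bit=True)

inputs = tokenizer("What is Colossal-AI?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```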
## Coati7B examples
......@@ -147,6 +209,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
<details><summary><b>E-mail</b></summary>
![phd](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/Phd.png)
</details>
<details><summary><b>coding</b></summary>
......@@ -180,6 +243,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
</details>
### Open QA
<details><summary><b>Game</b></summary>
![Game](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/game.png)
......@@ -213,6 +277,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
You can find more examples in this [repo](https://github.com/XueFuzhao/InstructionWild/blob/main/comparison.md).
### Limitations
<details><summary><b>Limitations of LLaMA-finetuned models</b></summary>
- Both Alpaca and ColossalChat are based on LLaMA. It is hard to compensate for the missing knowledge in the pre-training stage.
- Lack of counting ability: Cannot count the number of items in a list.
......@@ -236,7 +301,7 @@ You can find more examples in this [repo](https://github.com/XueFuzhao/Instructi
We have integrated the Transformers save and load pipeline, allowing users to freely call Hugging Face's language models and save them in the HF format.
```python
from coati.models.llama import LlamaLM
from coati.trainer import SFTTrainer
......@@ -245,20 +310,20 @@ tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
(model, optim) = strategy.prepare((model, optim))
trainer = SFTTrainer(model=model,
strategy=strategy,
optim=optim,
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
batch_size=args.batch_size,
max_epochs=args.max_epochs,
accumulation_steps=args.accumulation_steps
)
trainer.fit()
# this saves in pytorch format
strategy.save_model(model, args.save_path, only_rank0=True)
# this saves in HF format
strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer)
```
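Since `save_pretrained` writes standard Hugging Face files (including the tokenizer, when one is passed as above), the resulting checkpoint should be loadable with vanilla `transformers`; the path below is a placeholder:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("/path/to/Coati-7B")
tokenizer = AutoTokenizer.from_pretrained("/path/to/Coati-7B")
```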
......@@ -269,12 +334,13 @@ strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=token
Here are some examples that allow you to train a 7B model on one or more consumer-grade GPUs.
If you only have a single 24 GB GPU, you can use the following script; `batch_size`, `lora_rank`, and `grad_checkpoint` are the most important parameters for training the model successfully (see the LoRA sketch after the script).
```bash
# [INFO]: MAX GPU MEMORY ALLOCATED: 19148.9345703125 MB
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy ddp \
--log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
......@@ -287,12 +353,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
```
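`lora_rank` matters because LoRA freezes the pretrained weights and learns only a low-rank update, which is what makes a 7B model fit on a 24 GB card. A generic sketch of the idea (not `coati`'s internal implementation):

```python
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Frozen base linear layer plus a trainable rank-r update:
    y = base(x) + x @ A^T @ B^T * (alpha / r). Only A and B get gradients,
    so a 4096x4096 layer trains ~33K parameters instead of ~16.8M at r=4."""

    def __init__(self, base: nn.Linear, r: int = 4, alpha: int = 32):
        super().__init__()
        self.base = base
        self.base.weight.requires_grad_(False)  # freeze the pretrained weight
        self.lora_a = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + (x @ self.lora_a.T @ self.lora_b.T) * self.scaling
```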
The `colossalai_gemini` strategy enables a single 24 GB GPU to train the whole model without LoRA if you have sufficient CPU memory. You can use the following script.
```bash
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_gemini \
--log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
......@@ -304,12 +370,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
```
If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows.
```bash
torchrun --standalone --nproc_per_node=4 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2_cpu \
--log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
......@@ -319,8 +385,8 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--max_epochs 1 \
--grad_checkpoint
```
</details>
## The Plan
......@@ -335,31 +401,33 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
- [ ] support chain-of-thought via [langchain](https://github.com/hwchase17/langchain)
### Real-time progress
You will find our progress on the GitHub [project board](https://github.com/orgs/hpcaitech/projects/17/views/1).
## Invitation to open-source contribution
Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), all developers and partners with computing power, datasets, or models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
You may contact us or participate in the following ways:
1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your support. Thanks!
2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub following the guidelines in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
3. Join the Colossal-AI community on
[Slack](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack),
and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
4. Send your official proposal to contact@hpcaitech.com
Thanks so much to all of our amazing contributors!
## Quick Preview
<div align="center">
<a href="https://chat.colossalai.org/">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" />
</a>
</div>
- An open-source low-cost solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[demo]](https://chat.colossalai.org)
<p id="ChatGPT_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
......@@ -386,18 +454,21 @@ Thanks so much to all of our amazing contributors!
| Better Cases | 38 ⚔ **41** | **45** ⚔ 33 |
| Win Rate | 48% ⚔ **52%** | **58%** ⚔ 42% |
| Average Score | 7.06 ⚔ **7.13** | **7.31** ⚔ 6.82 |
- Our Coati-7B model performs better than Alpaca-7B when using GPT-4 to evaluate model performance. The Coati-7B model we evaluated is an older version trained a few weeks ago; a new version is around the corner.
## Authors
Coati is developed by the ColossalAI Team:
- [Fazzie](https://fazzie-key.cool/about/index.html)
- [FrankLeeeee](https://github.com/FrankLeeeee)
- [BlueRum](https://github.com/ht-zhou)
- [ver217](https://github.com/ver217)
- [ofey404](https://github.com/ofey404)
- [Wenhao Chen](https://github.com/CWHer)
The PhD students from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
- [Zangwei Zheng](https://github.com/zhengzangw)
- [Xue Fuzhao](https://github.com/XueFuzhao)
......
......@@ -27,9 +27,12 @@ We also provide various training strategies:
Currently we only support launching with `torchrun`. E.g.
```bash
# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py \
--model 125m --critic_model 125m --strategy ddp \
--experience_batch_size 1 --train_batch_size 1 --lora_rank 0
# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py \
--model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
```
......@@ -8,7 +8,7 @@ from coati.models.base import RewardModel
from coati.models.opt import OPTActor, OPTCritic
from coati.trainer import PPOTrainer
from coati.trainer.callbacks import PerformanceEvaluator
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
......@@ -19,7 +19,7 @@ from colossalai.nn.optimizer import HybridAdam
def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
numel = sum(p.numel() for p in model.parameters())
if isinstance(strategy, GeminiStrategy) and strategy.shard_init:
numel *= dist.get_world_size()
return numel
......@@ -27,7 +27,7 @@ def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
def preprocess_batch(samples) -> dict:
input_ids = torch.stack(samples)
attention_mask = torch.ones_like(input_ids, dtype=torch.long)
return {"input_ids": input_ids, "attention_mask": attention_mask}
def print_rank_0(*args, **kwargs) -> None:
......@@ -39,32 +39,32 @@ def print_model_numel(model_dict: dict) -> None:
    B = 1024**3
    M = 1024**2
    K = 1024
    outputs = ""
    for name, numel in model_dict.items():
        outputs += f"{name}: "
        if numel >= B:
            outputs += f"{numel / B:.2f} B\n"
        elif numel >= M:
            outputs += f"{numel / M:.2f} M\n"
        elif numel >= K:
            outputs += f"{numel / K:.2f} K\n"
        else:
            outputs += f"{numel}\n"
    print_rank_0(outputs)
def get_gpt_config(model_name: str) -> OPTConfig:
model_map = {
"125m": OPTConfig.from_pretrained("facebook/opt-125m"),
"350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
"700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
"1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
"2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
"3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
"5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
"6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
"10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
"13b": OPTConfig.from_pretrained("facebook/opt-13b"),
}
try:
return model_map[model_name]
......@@ -73,20 +73,20 @@ def get_gpt_config(model_name: str) -> OPTConfig:
def main(args):
    if args.strategy == "ddp":
        strategy = DDPStrategy()
    elif args.strategy == "colossalai_gemini":
        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
    elif args.strategy == "colossalai_gemini_cpu":
        strategy = GeminiStrategy(
            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
        )
    elif args.strategy == "colossalai_zero2":
        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
    elif args.strategy == "colossalai_zero2_cpu":
        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
    elif args.strategy == "colossalai_zero1":
        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
    elif args.strategy == "colossalai_zero1_cpu":
        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
    else:
        raise ValueError(f'Unsupported strategy "{args.strategy}"')
......@@ -103,92 +103,106 @@ def main(args):
if args.use_kernels:
from coati.kernels import convert_to_xformer_model
actor, critic, initial_model, reward_model = map(
convert_to_xformer_model, (actor, critic, initial_model, reward_model)
)
actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy)
reward_model_numel = get_model_numel(reward_model, strategy)
print_model_numel(
{
"Actor": actor_numel,
"Critic": critic_numel,
"Initial model": initial_model_numel,
"Reward model": reward_model_numel,
}
)
performance_evaluator = PerformanceEvaluator(
actor_numel,
critic_numel,
initial_model_numel,
reward_model_numel,
enable_grad_checkpoint=False,
ignore_episodes=1,
)
if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
else:
actor_optim = Adam(actor.parameters(), lr=5e-6)
critic_optim = Adam(critic.parameters(), lr=5e-6)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
    random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
dataloader = DataLoader(
random_prompts, batch_size=args.experience_batch_size, shuffle=True, collate_fn=preprocess_batch
)
trainer = PPOTrainer(
strategy,
actor,
critic,
reward_model,
initial_model,
actor_optim,
critic_optim,
tokenizer=tokenizer,
ptx_coef=0,
train_batch_size=args.train_batch_size,
offload_inference_models=args.offload_inference_models,
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
use_cache=True,
callbacks=[performance_evaluator],
)
trainer.fit(
prompt_dataloader=dataloader,
pretrain_dataloader=None,
num_episodes=args.num_episodes,
num_update_steps=args.num_update_steps,
num_collect_steps=args.num_collect_steps,
)
print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="125m")
parser.add_argument("--critic_model", default="125m")
parser.add_argument(
"--strategy",
choices=[
"ddp",
"colossalai_gemini",
"colossalai_gemini_cpu",
"colossalai_zero2",
"colossalai_zero2_cpu",
"colossalai_zero1",
"colossalai_zero1_cpu",
],
default="ddp",
)
parser.add_argument("--num_episodes", type=int, default=3)
parser.add_argument("--num_collect_steps", type=int, default=8)
parser.add_argument("--num_update_steps", type=int, default=1)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0)
parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
parser.add_argument("--offload_inference_models", action="store_true", default=False)
parser.add_argument("--use_kernels", action="store_true", default=False)
args = parser.parse_args()
main(args)