Commit 4e867b3c authored by jerrrrry

Initial commit
.. Lumache documentation master file, created by
   sphinx-quickstart on Tue Aug 15 13:44:10 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Megatron Core User Guide
========================

**Megatron Core** is a Python library that provides the core components required to build your own language models.
A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo/tree/main>`_. It offers a *simple* and
*intuitive* API.
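
As an illustration only, the sketch below constructs a toy GPT model with the Megatron Core API. The import paths, ``TransformerConfig`` fields, and ``GPTModel`` arguments follow recent Megatron-LM releases and are assumptions that may differ in your installed version.

.. code-block:: python

   import os

   import torch

   from megatron.core import parallel_state
   from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
   from megatron.core.models.gpt.gpt_model import GPTModel
   from megatron.core.transformer.transformer_config import TransformerConfig

   # Megatron Core expects torch.distributed and its model-parallel state to be
   # initialized before a model is built; a single-process gloo group suffices
   # for this toy, CPU-only example.
   os.environ.setdefault("MASTER_ADDR", "localhost")
   os.environ.setdefault("MASTER_PORT", "29500")
   torch.distributed.init_process_group(backend="gloo", rank=0, world_size=1)
   parallel_state.initialize_model_parallel(
       tensor_model_parallel_size=1, pipeline_model_parallel_size=1
   )

   # Deliberately tiny hyperparameters so the example builds instantly on CPU.
   config = TransformerConfig(
       num_layers=2,
       hidden_size=64,
       num_attention_heads=4,
       use_cpu_initialization=True,
       pipeline_dtype=torch.float32,
   )

   model = GPTModel(
       config=config,
       transformer_layer_spec=get_gpt_layer_local_spec(),
       vocab_size=128,
       max_sequence_length=64,
   )
   print(model)

The QuickStart guide pulled into the User Guide below walks through the complete, maintained workflow.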

.. toctree::
   :maxdepth: 2
   :caption: User Guide

   user-guide/index

.. toctree::
   :maxdepth: 3
   :caption: API Guide

   api-guide/index

User Guide
==========

.. mdinclude:: ../../../megatron/core/QuickStart.md

import argparse
import json

import numpy as np
from sklearn.utils import shuffle
from tqdm import tqdm

parser = argparse.ArgumentParser(
    description='Filter a toxicity-scored jsonl corpus, keeping the least toxic half.')
parser.add_argument('--data-path', type=str, default='',
                    help='path to the scored jsonl to load')
parser.add_argument('--out-path', type=str, default='',
                    help='path to write the filtered jsonl')
parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl',
                    help='path to the prompt jsonl')
parser.add_argument('--workers', type=int, default=10,
                    help='Number of worker processes to launch')


def get_corpus_scores(lines):
    """Parse jsonl lines and collect the toxicity score of every scored sample."""
    scores = []
    corpus = []
    for line in tqdm(lines):
        data = json.loads(line)
        score = data['score']
        if score is not None:
            scores.append(score['toxicity'])
            corpus.append(data)
    return scores, corpus


def main():
    args = parser.parse_args()
    with open(args.data_path) as f:
        lines = f.readlines()
    print(f"total line number: {len(lines)}")

    scores, corpus = get_corpus_scores(lines)
    scores = np.array(scores)
    indices = np.argsort(scores)

    # Report toxicity statistics before filtering, splitting at the 0.5 threshold.
    print(f"total valid samples: {len(scores)}")
    print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}")
    print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}")
    print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}")
    print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}")
    print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}")

    # Keep the least toxic half of the original corpus.
    samples_left = len(lines) // 2
    print(f"After filtering: {samples_left} samples are left")
    nontoxic_indices = indices[:samples_left]
    print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}")
    print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}")
    nontoxic_data = [corpus[ind] for ind in nontoxic_indices]
    print(f"Total samples after filtering: {len(nontoxic_data)}")
    print(f"Examples: {nontoxic_data[:3]}")

    # Shuffle so the output is not ordered by toxicity score.
    nontoxic_data = shuffle(nontoxic_data)
    with open(args.out_path, 'w') as f:
        for x in nontoxic_data:
            f.write(json.dumps(x) + '\n')


if __name__ == '__main__':
    main()
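
For reference, here is a sketch of the input the filter above expects. Each line of the `--data-path` file is a JSON object whose `score` field is either `None` or a dict containing a `toxicity` entry; all other fields (the `text` field below is purely illustrative) are passed through unchanged. File names and field values here are hypothetical.

```python
import json

# Hypothetical toy input for the toxicity filter: only score["toxicity"] is
# read by the script, and unscored samples (score=None) are dropped.
samples = [
    {"text": "an innocuous continuation", "score": {"toxicity": 0.02}},
    {"text": "a borderline continuation", "score": {"toxicity": 0.48}},
    {"text": "an unscored continuation", "score": None},
    {"text": "a toxic continuation", "score": {"toxicity": 0.91}},
]

with open("scored.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")
```

The filter would then be run with `--data-path scored.jsonl --out-path filtered.jsonl` (placeholder names), which keeps the least toxic half of the corpus and shuffles it before writing.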
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, covering data preparation as well as knowledge and response generation. More details are available in the [`knowledgeable task directory`](../../tasks/msdp).