Commit 9fdb7dab authored by yuguo960516

bloom
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.inference.basic import BasePipeline
from libai.utils import distributed as dist
class TextGenerationPipeline(BasePipeline):
def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
"""load pretrained model.
Args:
libai_cfg_model (libai.models): Lazy config Model in Libai, you can import it
by `from libai.config.configs.common.models.bert
import pretrain_model as libai_cfg_model`
model_path (str): The directory path of pretrained model,
"""
if mode == "huggingface":
from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace
model_loader = T5LoaderHuggerFace(
libai_cfg_model,
libai_cfg_model.cfg,
model_path,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
embedding_dropout_prob=0.0,
)
return model_loader.load()
elif mode == "libai":
from projects.MT5.utils.mt5_loader import T5LoaderLibai
model_loader = T5LoaderLibai(
libai_cfg_model,
libai_cfg_model.cfg,
model_path,
)
return model_loader.load()
elif mode == "random":
from libai.engine import DefaultTrainer
return DefaultTrainer.build_model(self.cfg)
else:
raise NotImplementedError
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {**pipeline_parameters}
postprocess_params = {}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
pad: bool = False,
**kwargs,
) -> dict:
# tokenizer encoder
encoder_ids = self.tokenizer.encode(inputs, return_tensors="of", is_global=True)
encoder_input_dict = {
"encoder_ids": encoder_ids,
}
return encoder_input_dict
def forward(self, encoder_input_dict, **kwargs) -> dict:
outputs = self.model.generate(encoder_input_dict["encoder_ids"], **kwargs)
return {"return_ids": outputs}
def postprocess(self, model_output_dict, **kwargs) -> dict:
return_ids = model_output_dict["return_ids"]
records = [
{"generated_text": self.tokenizer.decode(return_ids[i], skip_special_tokens=True)}
for i in range(return_ids.size(0))
]
return records
if __name__ == "__main__":
pipeline = TextGenerationPipeline(
"/path/to/libai/projects/MT5/configs/t5_inference.py",
data_parallel=1,
tensor_parallel=2,
pipeline_parallel=2,
pipeline_stage_id=[0] * 12 + [1] * 12,
pipeline_num_layers=12 * 2,
model_path="/path/to/t5-base",
mode="huggingface",
)
text = ["summarize: She is a student, She is tall, She loves study"]
dict1 = pipeline(text)
if dist.is_main_process():
print(dict1)
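# Note: `pipeline(text)` returns the list produced by `postprocess`, one dict per input,
# e.g. [{"generated_text": "..."}] (the exact text depends on the loaded weights).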
IMAGENET_LABELS = [
"tench, Tinca tinca",
"goldfish, Carassius auratus",
"great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", # noqa: E501
"tiger shark, Galeocerdo cuvieri",
"hammerhead, hammerhead shark",
"electric ray, crampfish, numbfish, torpedo",
"stingray",
"cock",
"hen",
"ostrich, Struthio camelus",
"brambling, Fringilla montifringilla",
"goldfinch, Carduelis carduelis",
"house finch, linnet, Carpodacus mexicanus",
"junco, snowbird",
"indigo bunting, indigo finch, indigo bird, Passerina cyanea",
"robin, American robin, Turdus migratorius",
"bulbul",
"jay",
"magpie",
"chickadee",
"water ouzel, dipper",
"kite",
"bald eagle, American eagle, Haliaeetus leucocephalus",
"vulture",
"great grey owl, great gray owl, Strix nebulosa",
"European fire salamander, Salamandra salamandra",
"common newt, Triturus vulgaris",
"eft",
"spotted salamander, Ambystoma maculatum",
"axolotl, mud puppy, Ambystoma mexicanum",
"bullfrog, Rana catesbeiana",
"tree frog, tree-frog",
"tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
"loggerhead, loggerhead turtle, Caretta caretta",
"leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", # noqa: E501
"mud turtle",
"terrapin",
"box turtle, box tortoise",
"banded gecko",
"common iguana, iguana, Iguana iguana",
"American chameleon, anole, Anolis carolinensis",
"whiptail, whiptail lizard",
"agama",
"frilled lizard, Chlamydosaurus kingi",
"alligator lizard",
"Gila monster, Heloderma suspectum",
"green lizard, Lacerta viridis",
"African chameleon, Chamaeleo chamaeleon",
"Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", # noqa: E501
"African crocodile, Nile crocodile, Crocodylus niloticus",
"American alligator, Alligator mississipiensis",
"triceratops",
"thunder snake, worm snake, Carphophis amoenus",
"ringneck snake, ring-necked snake, ring snake",
"hognose snake, puff adder, sand viper",
"green snake, grass snake",
"king snake, kingsnake",
"garter snake, grass snake",
"water snake",
"vine snake",
"night snake, Hypsiglena torquata",
"boa constrictor, Constrictor constrictor",
"rock python, rock snake, Python sebae",
"Indian cobra, Naja naja",
"green mamba",
"sea snake",
"horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
"diamondback, diamondback rattlesnake, Crotalus adamanteus",
"sidewinder, horned rattlesnake, Crotalus cerastes",
"trilobite",
"harvestman, daddy longlegs, Phalangium opilio",
"scorpion",
"black and gold garden spider, Argiope aurantia",
"barn spider, Araneus cavaticus",
"garden spider, Aranea diademata",
"black widow, Latrodectus mactans",
"tarantula",
"wolf spider, hunting spider",
"tick",
"centipede",
"black grouse",
"ptarmigan",
"ruffed grouse, partridge, Bonasa umbellus",
"prairie chicken, prairie grouse, prairie fowl",
"peacock",
"quail",
"partridge",
"African grey, African gray, Psittacus erithacus",
"macaw",
"sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
"lorikeet",
"coucal",
"bee eater",
"hornbill",
"hummingbird",
"jacamar",
"toucan",
"drake",
"red-breasted merganser, Mergus serrator",
"goose",
"black swan, Cygnus atratus",
"tusker",
"echidna, spiny anteater, anteater",
"platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", # noqa: E501
"wallaby, brush kangaroo",
"koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", # noqa: E501
"wombat",
"jellyfish",
"sea anemone, anemone",
"brain coral",
"flatworm, platyhelminth",
"nematode, nematode worm, roundworm",
"conch",
"snail",
"slug",
"sea slug, nudibranch",
"chiton, coat-of-mail shell, sea cradle, polyplacophore",
"chambered nautilus, pearly nautilus, nautilus",
"Dungeness crab, Cancer magister",
"rock crab, Cancer irroratus",
"fiddler crab",
"king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", # noqa: E501
"American lobster, Northern lobster, Maine lobster, Homarus americanus", # noqa: E501
"spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", # noqa: E501
"crayfish, crawfish, crawdad, crawdaddy",
"hermit crab",
"isopod",
"white stork, Ciconia ciconia",
"black stork, Ciconia nigra",
"spoonbill",
"flamingo",
"little blue heron, Egretta caerulea",
"American egret, great white heron, Egretta albus",
"bittern",
"crane",
"limpkin, Aramus pictus",
"European gallinule, Porphyrio porphyrio",
"American coot, marsh hen, mud hen, water hen, Fulica americana",
"bustard",
"ruddy turnstone, Arenaria interpres",
"red-backed sandpiper, dunlin, Erolia alpina",
"redshank, Tringa totanus",
"dowitcher",
"oystercatcher, oyster catcher",
"pelican",
"king penguin, Aptenodytes patagonica",
"albatross, mollymawk",
"grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", # noqa: E501
"killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
"dugong, Dugong dugon",
"sea lion",
"Chihuahua",
"Japanese spaniel",
"Maltese dog, Maltese terrier, Maltese",
"Pekinese, Pekingese, Peke",
"Shih-Tzu",
"Blenheim spaniel",
"papillon",
"toy terrier",
"Rhodesian ridgeback",
"Afghan hound, Afghan",
"basset, basset hound",
"beagle",
"bloodhound, sleuthhound",
"bluetick",
"black-and-tan coonhound",
"Walker hound, Walker foxhound",
"English foxhound",
"redbone",
"borzoi, Russian wolfhound",
"Irish wolfhound",
"Italian greyhound",
"whippet",
"Ibizan hound, Ibizan Podenco",
"Norwegian elkhound, elkhound",
"otterhound, otter hound",
"Saluki, gazelle hound",
"Scottish deerhound, deerhound",
"Weimaraner",
"Staffordshire bullterrier, Staffordshire bull terrier",
"American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", # noqa: E501
"Bedlington terrier",
"Border terrier",
"Kerry blue terrier",
"Irish terrier",
"Norfolk terrier",
"Norwich terrier",
"Yorkshire terrier",
"wire-haired fox terrier",
"Lakeland terrier",
"Sealyham terrier, Sealyham",
"Airedale, Airedale terrier",
"cairn, cairn terrier",
"Australian terrier",
"Dandie Dinmont, Dandie Dinmont terrier",
"Boston bull, Boston terrier",
"miniature schnauzer",
"giant schnauzer",
"standard schnauzer",
"Scotch terrier, Scottish terrier, Scottie",
"Tibetan terrier, chrysanthemum dog",
"silky terrier, Sydney silky",
"soft-coated wheaten terrier",
"West Highland white terrier",
"Lhasa, Lhasa apso",
"flat-coated retriever",
"curly-coated retriever",
"golden retriever",
"Labrador retriever",
"Chesapeake Bay retriever",
"German short-haired pointer",
"vizsla, Hungarian pointer",
"English setter",
"Irish setter, red setter",
"Gordon setter",
"Brittany spaniel",
"clumber, clumber spaniel",
"English springer, English springer spaniel",
"Welsh springer spaniel",
"cocker spaniel, English cocker spaniel, cocker",
"Sussex spaniel",
"Irish water spaniel",
"kuvasz",
"schipperke",
"groenendael",
"malinois",
"briard",
"kelpie",
"komondor",
"Old English sheepdog, bobtail",
"Shetland sheepdog, Shetland sheep dog, Shetland",
"collie",
"Border collie",
"Bouvier des Flandres, Bouviers des Flandres",
"Rottweiler",
"German shepherd, German shepherd dog, German police dog, alsatian",
"Doberman, Doberman pinscher",
"miniature pinscher",
"Greater Swiss Mountain dog",
"Bernese mountain dog",
"Appenzeller",
"EntleBucher",
"boxer",
"bull mastiff",
"Tibetan mastiff",
"French bulldog",
"Great Dane",
"Saint Bernard, St Bernard",
"Eskimo dog, husky",
"malamute, malemute, Alaskan malamute",
"Siberian husky",
"dalmatian, coach dog, carriage dog",
"affenpinscher, monkey pinscher, monkey dog",
"basenji",
"pug, pug-dog",
"Leonberg",
"Newfoundland, Newfoundland dog",
"Great Pyrenees",
"Samoyed, Samoyede",
"Pomeranian",
"chow, chow chow",
"keeshond",
"Brabancon griffon",
"Pembroke, Pembroke Welsh corgi",
"Cardigan, Cardigan Welsh corgi",
"toy poodle",
"miniature poodle",
"standard poodle",
"Mexican hairless",
"timber wolf, grey wolf, gray wolf, Canis lupus",
"white wolf, Arctic wolf, Canis lupus tundrarum",
"red wolf, maned wolf, Canis rufus, Canis niger",
"coyote, prairie wolf, brush wolf, Canis latrans",
"dingo, warrigal, warragal, Canis dingo",
"dhole, Cuon alpinus",
"African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
"hyena, hyaena",
"red fox, Vulpes vulpes",
"kit fox, Vulpes macrotis",
"Arctic fox, white fox, Alopex lagopus",
"grey fox, gray fox, Urocyon cinereoargenteus",
"tabby, tabby cat",
"tiger cat",
"Persian cat",
"Siamese cat, Siamese",
"Egyptian cat",
"cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", # noqa: E501
"lynx, catamount",
"leopard, Panthera pardus",
"snow leopard, ounce, Panthera uncia",
"jaguar, panther, Panthera onca, Felis onca",
"lion, king of beasts, Panthera leo",
"tiger, Panthera tigris",
"cheetah, chetah, Acinonyx jubatus",
"brown bear, bruin, Ursus arctos",
"American black bear, black bear, Ursus americanus, Euarctos americanus", # noqa: E501
"ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
"sloth bear, Melursus ursinus, Ursus ursinus",
"mongoose",
"meerkat, mierkat",
"tiger beetle",
"ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
"ground beetle, carabid beetle",
"long-horned beetle, longicorn, longicorn beetle",
"leaf beetle, chrysomelid",
"dung beetle",
"rhinoceros beetle",
"weevil",
"fly",
"bee",
"ant, emmet, pismire",
"grasshopper, hopper",
"cricket",
"walking stick, walkingstick, stick insect",
"cockroach, roach",
"mantis, mantid",
"cicada, cicala",
"leafhopper",
"lacewing, lacewing fly",
"dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", # noqa: E501
"damselfly",
"admiral",
"ringlet, ringlet butterfly",
"monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
"cabbage butterfly",
"sulphur butterfly, sulfur butterfly",
"lycaenid, lycaenid butterfly",
"starfish, sea star",
"sea urchin",
"sea cucumber, holothurian",
"wood rabbit, cottontail, cottontail rabbit",
"hare",
"Angora, Angora rabbit",
"hamster",
"porcupine, hedgehog",
"fox squirrel, eastern fox squirrel, Sciurus niger",
"marmot",
"beaver",
"guinea pig, Cavia cobaya",
"sorrel",
"zebra",
"hog, pig, grunter, squealer, Sus scrofa",
"wild boar, boar, Sus scrofa",
"warthog",
"hippopotamus, hippo, river horse, Hippopotamus amphibius",
"ox",
"water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
"bison",
"ram, tup",
"bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", # noqa: E501
"ibex, Capra ibex",
"hartebeest",
"impala, Aepyceros melampus",
"gazelle",
"Arabian camel, dromedary, Camelus dromedarius",
"llama",
"weasel",
"mink",
"polecat, fitch, foulmart, foumart, Mustela putorius",
"black-footed ferret, ferret, Mustela nigripes",
"otter",
"skunk, polecat, wood pussy",
"badger",
"armadillo",
"three-toed sloth, ai, Bradypus tridactylus",
"orangutan, orang, orangutang, Pongo pygmaeus",
"gorilla, Gorilla gorilla",
"chimpanzee, chimp, Pan troglodytes",
"gibbon, Hylobates lar",
"siamang, Hylobates syndactylus, Symphalangus syndactylus",
"guenon, guenon monkey",
"patas, hussar monkey, Erythrocebus patas",
"baboon",
"macaque",
"langur",
"colobus, colobus monkey",
"proboscis monkey, Nasalis larvatus",
"marmoset",
"capuchin, ringtail, Cebus capucinus",
"howler monkey, howler",
"titi, titi monkey",
"spider monkey, Ateles geoffroyi",
"squirrel monkey, Saimiri sciureus",
"Madagascar cat, ring-tailed lemur, Lemur catta",
"indri, indris, Indri indri, Indri brevicaudatus",
"Indian elephant, Elephas maximus",
"African elephant, Loxodonta africana",
"lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
"giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
"barracouta, snoek",
"eel",
"coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", # noqa: E501
"rock beauty, Holocanthus tricolor",
"anemone fish",
"sturgeon",
"gar, garfish, garpike, billfish, Lepisosteus osseus",
"lionfish",
"puffer, pufferfish, blowfish, globefish",
"abacus",
"abaya",
"academic gown, academic robe, judge's robe",
"accordion, piano accordion, squeeze box",
"acoustic guitar",
"aircraft carrier, carrier, flattop, attack aircraft carrier",
"airliner",
"airship, dirigible",
"altar",
"ambulance",
"amphibian, amphibious vehicle",
"analog clock",
"apiary, bee house",
"apron",
"ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", # noqa: E501
"assault rifle, assault gun",
"backpack, back pack, knapsack, packsack, rucksack, haversack",
"bakery, bakeshop, bakehouse",
"balance beam, beam",
"balloon",
"ballpoint, ballpoint pen, ballpen, Biro",
"Band Aid",
"banjo",
"bannister, banister, balustrade, balusters, handrail",
"barbell",
"barber chair",
"barbershop",
"barn",
"barometer",
"barrel, cask",
"barrow, garden cart, lawn cart, wheelbarrow",
"baseball",
"basketball",
"bassinet",
"bassoon",
"bathing cap, swimming cap",
"bath towel",
"bathtub, bathing tub, bath, tub",
"beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", # noqa: E501
"beacon, lighthouse, beacon light, pharos",
"beaker",
"bearskin, busby, shako",
"beer bottle",
"beer glass",
"bell cote, bell cot",
"bib",
"bicycle-built-for-two, tandem bicycle, tandem",
"bikini, two-piece",
"binder, ring-binder",
"binoculars, field glasses, opera glasses",
"birdhouse",
"boathouse",
"bobsled, bobsleigh, bob",
"bolo tie, bolo, bola tie, bola",
"bonnet, poke bonnet",
"bookcase",
"bookshop, bookstore, bookstall",
"bottlecap",
"bow",
"bow tie, bow-tie, bowtie",
"brass, memorial tablet, plaque",
"brassiere, bra, bandeau",
"breakwater, groin, groyne, mole, bulwark, seawall, jetty",
"breastplate, aegis, egis",
"broom",
"bucket, pail",
"buckle",
"bulletproof vest",
"bullet train, bullet",
"butcher shop, meat market",
"cab, hack, taxi, taxicab",
"caldron, cauldron",
"candle, taper, wax light",
"cannon",
"canoe",
"can opener, tin opener",
"cardigan",
"car mirror",
"carousel, carrousel, merry-go-round, roundabout, whirligig",
"carpenter's kit, tool kit",
"carton",
"car wheel",
"cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", # noqa: E501
"cassette",
"cassette player",
"castle",
"catamaran",
"CD player",
"cello, violoncello",
"cellular telephone, cellular phone, cellphone, cell, mobile phone",
"chain",
"chainlink fence",
"chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", # noqa: E501
"chain saw, chainsaw",
"chest",
"chiffonier, commode",
"chime, bell, gong",
"china cabinet, china closet",
"Christmas stocking",
"church, church building",
"cinema, movie theater, movie theatre, movie house, picture palace",
"cleaver, meat cleaver, chopper",
"cliff dwelling",
"cloak",
"clog, geta, patten, sabot",
"cocktail shaker",
"coffee mug",
"coffeepot",
"coil, spiral, volute, whorl, helix",
"combination lock",
"computer keyboard, keypad",
"confectionery, confectionary, candy store",
"container ship, containership, container vessel",
"convertible",
"corkscrew, bottle screw",
"cornet, horn, trumpet, trump",
"cowboy boot",
"cowboy hat, ten-gallon hat",
"cradle",
"crane",
"crash helmet",
"crate",
"crib, cot",
"Crock Pot",
"croquet ball",
"crutch",
"cuirass",
"dam, dike, dyke",
"desk",
"desktop computer",
"dial telephone, dial phone",
"diaper, nappy, napkin",
"digital clock",
"digital watch",
"dining table, board",
"dishrag, dishcloth",
"dishwasher, dish washer, dishwashing machine",
"disk brake, disc brake",
"dock, dockage, docking facility",
"dogsled, dog sled, dog sleigh",
"dome",
"doormat, welcome mat",
"drilling platform, offshore rig",
"drum, membranophone, tympan",
"drumstick",
"dumbbell",
"Dutch oven",
"electric fan, blower",
"electric guitar",
"electric locomotive",
"entertainment center",
"envelope",
"espresso maker",
"face powder",
"feather boa, boa",
"file, file cabinet, filing cabinet",
"fireboat",
"fire engine, fire truck",
"fire screen, fireguard",
"flagpole, flagstaff",
"flute, transverse flute",
"folding chair",
"football helmet",
"forklift",
"fountain",
"fountain pen",
"four-poster",
"freight car",
"French horn, horn",
"frying pan, frypan, skillet",
"fur coat",
"garbage truck, dustcart",
"gasmask, respirator, gas helmet",
"gas pump, gasoline pump, petrol pump, island dispenser",
"goblet",
"go-kart",
"golf ball",
"golfcart, golf cart",
"gondola",
"gong, tam-tam",
"gown",
"grand piano, grand",
"greenhouse, nursery, glasshouse",
"grille, radiator grille",
"grocery store, grocery, food market, market",
"guillotine",
"hair slide",
"hair spray",
"half track",
"hammer",
"hamper",
"hand blower, blow dryer, blow drier, hair dryer, hair drier",
"hand-held computer, hand-held microcomputer",
"handkerchief, hankie, hanky, hankey",
"hard disc, hard disk, fixed disk",
"harmonica, mouth organ, harp, mouth harp",
"harp",
"harvester, reaper",
"hatchet",
"holster",
"home theater, home theatre",
"honeycomb",
"hook, claw",
"hoopskirt, crinoline",
"horizontal bar, high bar",
"horse cart, horse-cart",
"hourglass",
"iPod",
"iron, smoothing iron",
"jack-o'-lantern",
"jean, blue jean, denim",
"jeep, landrover",
"jersey, T-shirt, tee shirt",
"jigsaw puzzle",
"jinrikisha, ricksha, rickshaw",
"joystick",
"kimono",
"knee pad",
"knot",
"lab coat, laboratory coat",
"ladle",
"lampshade, lamp shade",
"laptop, laptop computer",
"lawn mower, mower",
"lens cap, lens cover",
"letter opener, paper knife, paperknife",
"library",
"lifeboat",
"lighter, light, igniter, ignitor",
"limousine, limo",
"liner, ocean liner",
"lipstick, lip rouge",
"Loafer",
"lotion",
"loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", # noqa: E501
"loupe, jeweler's loupe",
"lumbermill, sawmill",
"magnetic compass",
"mailbag, postbag",
"mailbox, letter box",
"maillot",
"maillot, tank suit",
"manhole cover",
"maraca",
"marimba, xylophone",
"mask",
"matchstick",
"maypole",
"maze, labyrinth",
"measuring cup",
"medicine chest, medicine cabinet",
"megalith, megalithic structure",
"microphone, mike",
"microwave, microwave oven",
"military uniform",
"milk can",
"minibus",
"miniskirt, mini",
"minivan",
"missile",
"mitten",
"mixing bowl",
"mobile home, manufactured home",
"Model T",
"modem",
"monastery",
"monitor",
"moped",
"mortar",
"mortarboard",
"mosque",
"mosquito net",
"motor scooter, scooter",
"mountain bike, all-terrain bike, off-roader",
"mountain tent",
"mouse, computer mouse",
"mousetrap",
"moving van",
"muzzle",
"nail",
"neck brace",
"necklace",
"nipple",
"notebook, notebook computer",
"obelisk",
"oboe, hautboy, hautbois",
"ocarina, sweet potato",
"odometer, hodometer, mileometer, milometer",
"oil filter",
"organ, pipe organ",
"oscilloscope, scope, cathode-ray oscilloscope, CRO",
"overskirt",
"oxcart",
"oxygen mask",
"packet",
"paddle, boat paddle",
"paddlewheel, paddle wheel",
"padlock",
"paintbrush",
"pajama, pyjama, pj's, jammies",
"palace",
"panpipe, pandean pipe, syrinx",
"paper towel",
"parachute, chute",
"parallel bars, bars",
"park bench",
"parking meter",
"passenger car, coach, carriage",
"patio, terrace",
"pay-phone, pay-station",
"pedestal, plinth, footstall",
"pencil box, pencil case",
"pencil sharpener",
"perfume, essence",
"Petri dish",
"photocopier",
"pick, plectrum, plectron",
"pickelhaube",
"picket fence, paling",
"pickup, pickup truck",
"pier",
"piggy bank, penny bank",
"pill bottle",
"pillow",
"ping-pong ball",
"pinwheel",
"pirate, pirate ship",
"pitcher, ewer",
"plane, carpenter's plane, woodworking plane",
"planetarium",
"plastic bag",
"plate rack",
"plow, plough",
"plunger, plumber's helper",
"Polaroid camera, Polaroid Land camera",
"pole",
"police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", # noqa: E501
"poncho",
"pool table, billiard table, snooker table",
"pop bottle, soda bottle",
"pot, flowerpot",
"potter's wheel",
"power drill",
"prayer rug, prayer mat",
"printer",
"prison, prison house",
"projectile, missile",
"projector",
"puck, hockey puck",
"punching bag, punch bag, punching ball, punchball",
"purse",
"quill, quill pen",
"quilt, comforter, comfort, puff",
"racer, race car, racing car",
"racket, racquet",
"radiator",
"radio, wireless",
"radio telescope, radio reflector",
"rain barrel",
"recreational vehicle, RV, R.V.",
"reel",
"reflex camera",
"refrigerator, icebox",
"remote control, remote",
"restaurant, eating house, eating place, eatery",
"revolver, six-gun, six-shooter",
"rifle",
"rocking chair, rocker",
"rotisserie",
"rubber eraser, rubber, pencil eraser",
"rugby ball",
"rule, ruler",
"running shoe",
"safe",
"safety pin",
"saltshaker, salt shaker",
"sandal",
"sarong",
"sax, saxophone",
"scabbard",
"scale, weighing machine",
"school bus",
"schooner",
"scoreboard",
"screen, CRT screen",
"screw",
"screwdriver",
"seat belt, seatbelt",
"sewing machine",
"shield, buckler",
"shoe shop, shoe-shop, shoe store",
"shoji",
"shopping basket",
"shopping cart",
"shovel",
"shower cap",
"shower curtain",
"ski",
"ski mask",
"sleeping bag",
"slide rule, slipstick",
"sliding door",
"slot, one-armed bandit",
"snorkel",
"snowmobile",
"snowplow, snowplough",
"soap dispenser",
"soccer ball",
"sock",
"solar dish, solar collector, solar furnace",
"sombrero",
"soup bowl",
"space bar",
"space heater",
"space shuttle",
"spatula",
"speedboat",
"spider web, spider's web",
"spindle",
"sports car, sport car",
"spotlight, spot",
"stage",
"steam locomotive",
"steel arch bridge",
"steel drum",
"stethoscope",
"stole",
"stone wall",
"stopwatch, stop watch",
"stove",
"strainer",
"streetcar, tram, tramcar, trolley, trolley car",
"stretcher",
"studio couch, day bed",
"stupa, tope",
"submarine, pigboat, sub, U-boat",
"suit, suit of clothes",
"sundial",
"sunglass",
"sunglasses, dark glasses, shades",
"sunscreen, sunblock, sun blocker",
"suspension bridge",
"swab, swob, mop",
"sweatshirt",
"swimming trunks, bathing trunks",
"swing",
"switch, electric switch, electrical switch",
"syringe",
"table lamp",
"tank, army tank, armored combat vehicle, armoured combat vehicle",
"tape player",
"teapot",
"teddy, teddy bear",
"television, television system",
"tennis ball",
"thatch, thatched roof",
"theater curtain, theatre curtain",
"thimble",
"thresher, thrasher, threshing machine",
"throne",
"tile roof",
"toaster",
"tobacco shop, tobacconist shop, tobacconist",
"toilet seat",
"torch",
"totem pole",
"tow truck, tow car, wrecker",
"toyshop",
"tractor",
"trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", # noqa: E501
"tray",
"trench coat",
"tricycle, trike, velocipede",
"trimaran",
"tripod",
"triumphal arch",
"trolleybus, trolley coach, trackless trolley",
"trombone",
"tub, vat",
"turnstile",
"typewriter keyboard",
"umbrella",
"unicycle, monocycle",
"upright, upright piano",
"vacuum, vacuum cleaner",
"vase",
"vault",
"velvet",
"vending machine",
"vestment",
"viaduct",
"violin, fiddle",
"volleyball",
"waffle iron",
"wall clock",
"wallet, billfold, notecase, pocketbook",
"wardrobe, closet, press",
"warplane, military plane",
"washbasin, handbasin, washbowl, lavabo, wash-hand basin",
"washer, automatic washer, washing machine",
"water bottle",
"water jug",
"water tower",
"whiskey jug",
"whistle",
"wig",
"window screen",
"window shade",
"Windsor tie",
"wine bottle",
"wing",
"wok",
"wooden spoon",
"wool, woolen, woollen",
"worm fence, snake fence, snake-rail fence, Virginia fence",
"wreck",
"yawl",
"yurt",
"web site, website, internet site, site",
"comic book",
"crossword puzzle, crossword",
"street sign",
"traffic light, traffic signal, stoplight",
"book jacket, dust cover, dust jacket, dust wrapper",
"menu",
"plate",
"guacamole",
"consomme",
"hot pot, hotpot",
"trifle",
"ice cream, icecream",
"ice lolly, lolly, lollipop, popsicle",
"French loaf",
"bagel, beigel",
"pretzel",
"cheeseburger",
"hotdog, hot dog, red hot",
"mashed potato",
"head cabbage",
"broccoli",
"cauliflower",
"zucchini, courgette",
"spaghetti squash",
"acorn squash",
"butternut squash",
"cucumber, cuke",
"artichoke, globe artichoke",
"bell pepper",
"cardoon",
"mushroom",
"Granny Smith",
"strawberry",
"orange",
"lemon",
"fig",
"pineapple, ananas",
"banana",
"jackfruit, jak, jack",
"custard apple",
"pomegranate",
"hay",
"carbonara",
"chocolate sauce, chocolate syrup",
"dough",
"meat loaf, meatloaf",
"pizza, pizza pie",
"potpie",
"burrito",
"red wine",
"espresso",
"cup",
"eggnog",
"alp",
"bubble",
"cliff, drop, drop-off",
"coral reef",
"geyser",
"lakeside, lakeshore",
"promontory, headland, head, foreland",
"sandbar, sand bar",
"seashore, coast, seacoast, sea-coast",
"valley, vale",
"volcano",
"ballplayer, baseball player",
"groom, bridegroom",
"scuba diver",
"rapeseed",
"daisy",
"yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", # noqa: E501
"corn",
"acorn",
"hip, rose hip, rosehip",
"buckeye, horse chestnut, conker",
"coral fungus",
"agaric",
"gyromitra",
"stinkhorn, carrion fungus",
"earthstar",
"hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", # noqa: E501
"bolete",
"ear, spike, capitulum",
"toilet tissue, toilet paper, bathroom tissue",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .activation import build_activation
from .cross_entropy import ParallelCrossEntropyLoss
from .embedding import Embedding, SinePositionalEmbedding, VocabEmbedding, PatchEmbedding
from .layer_norm import LayerNorm, RMSLayerNorm
from .linear import Linear, Linear1D
from .conv import Conv1D
from .lm_logits import LMLogits
from .mlp import MLP
from .transformer_layer import TransformerLayer
from .attention import MultiheadAttention
from .droppath import DropPath, drop_path
__all__ = [
"Embedding",
"VocabEmbedding",
"SinePositionalEmbedding",
"PatchEmbedding",
"build_activation",
"Linear",
"Linear1D",
"Conv1D",
"MLP",
"LayerNorm",
"RMSLayerNorm",
"TransformerLayer",
"MultiheadAttention",
"ParallelCrossEntropyLoss",
"LMLogits",
"drop_path",
"DropPath",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from typing import Optional
import oneflow as flow
from oneflow import nn
class Activation(str, Enum):
SquaredReLU = "squared_relu"
GeLU = "gelu"
GeLUTanh = "gelu_tanh"
LeakyReLU = "leaky_relu"
ReLU = "relu"
Tanh = "tanh"
QuickGELU = "quick_gelu"
# For unit testing / parity comparisons, probably not the fastest way
class SquaredReLU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
x_ = flow._C.relu(x)
return x_ * x_
class Passthrough(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x
class GeLUTanh(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
"""When the approximate argument is 'tanh', Gelu is estimated with:
0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0))))
"""
return flow.nn.functional.gelu(x, approximate="tanh")
class QuickGELU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x * flow.sigmoid(1.702 * x)
def build_activation(activation: Optional[Activation]):
"""
Fetch an activation layer by name, e.g.,
``build_activation("gelu")`` returns an ``nn.GELU()`` module.
"""
if not activation:
return Passthrough()
return {
Activation.ReLU: nn.ReLU,
Activation.GeLU: nn.GELU,
Activation.GeLUTanh: GeLUTanh,
Activation.LeakyReLU: nn.LeakyReLU,
Activation.SquaredReLU: SquaredReLU,
Activation.Tanh: nn.Tanh,
Activation.QuickGELU: QuickGELU,
}[activation]()
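# A minimal usage sketch (illustrative, not part of the original file). Because
# ``Activation`` mixes in ``str``, the mapping above can be indexed with either an
# enum member or its string value, and a falsy argument falls back to ``Passthrough``:
#
#   act = build_activation("quick_gelu")   # -> QuickGELU()
#   y = act(flow.randn(2, 4))              # y = x * sigmoid(1.702 * x)
#   identity = build_activation(None)      # -> Passthrough()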
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import math
from typing import Tuple
import oneflow as flow
from oneflow import nn
from .linear import Linear
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
class MultiheadAttention(nn.Module):
"""Multi-head attention layer, support self attention and cross attention.
Args:
hidden_size: size of hidden state.
num_attention_heads: number of attention heads.
is_cross_attention: used to specify whether it is self attention or cross attention.
Defaults to False.
attention_dropout_prob: dropout probability of attention weights.
Defaults to 0.0.
output_dropout_prob: dropout probability of output. Defaults to 0.0.
init_method: method to initialize the input layer weights.
Defaults to ``init.xavier_normal_``.
output_layer_init_method: method to initialize the output layer weights.
If None, use ``init_method``.
bias_dropout_fusion: whether to fuse add bias and dropout.
Defaults to False.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax.
Defaults to False.
apply_query_key_layer_scaling: if `True`, scale the attention scores by the layer index.
Defaults to False.
layer_idx: a layer_idx sign which determines the placements.
It will be used in pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
num_attention_heads,
is_cross_attention=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
if output_layer_init_method is None:
output_layer_init_method = init_method
assert (
hidden_size % num_attention_heads == 0
), "hidden_size must be divisible by num_attention_heads."
self.num_heads = num_attention_heads
self.head_size = hidden_size // num_attention_heads
self.attn_mask_type = attn_mask_type
self.attention_dropout_prob = attention_dropout_prob
self.dropout = nn.Dropout(p=attention_dropout_prob)
self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
self.coeff = None
if apply_query_key_layer_scaling:
self.coeff = layer_idx + 1
self.norm_factor /= self.coeff
self.is_cross_attention = is_cross_attention
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if self.bias_dropout_fusion:
self.output_dropout_prob = output_dropout_prob
else:
self.output_dropout = nn.Dropout(p=output_dropout_prob)
if self.is_cross_attention:
self.query = Linear(
self.hidden_size,
self.hidden_size,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.key_value = Linear(
self.hidden_size,
self.hidden_size * 2,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
else:
self.query_key_value = Linear(
self.hidden_size,
self.hidden_size * 3,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.dense = Linear(
self.hidden_size,
self.hidden_size,
parallel="row",
init_method=output_layer_init_method,
skip_bias_add=self.bias_dropout_fusion,
layer_idx=layer_idx,
)
def forward(
self,
hidden_states: flow.Tensor,
encoder_states: flow.Tensor = None,
attention_mask: flow.Tensor = None,
past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
use_cache: bool = False,
):
"""
Args:
hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
Defaults to None.
attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
It should be the combination of padding mask and causal mask.
For self-attention in the encoder it is the padding mask of the source input;
for self-attention in the decoder it is the combination of the target padding mask
and the causal mask; for cross-attention in the decoder it is the padding mask of
the source input.
Defaults to None.
past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
use_cache (bool, optional): set to True when the model is in the inference phase and
used for incremental decoding. Defaults to False.
"""
# hidden_states, encoder_states: [S(0), B]
# attention_mask: [S(0), B]
if encoder_states is not None:
encoder_states = encoder_states.to_global(placement=hidden_states.placement)
if attention_mask is not None:
attention_mask = attention_mask.to_global(placement=hidden_states.placement)
bsz, tgt_len = hidden_states.size()[:2]
if self.is_cross_attention:
# if it is cross attention, key and value should be calculated only once, and the
# result can be reused.
query = self.query(hidden_states)
query = query.view(bsz, -1, self.num_heads, self.head_size)
query = query.permute(0, 2, 1, 3)
if past_key_value is not None:
key, value = past_key_value
elif encoder_states is not None:
key_value = self.key_value(encoder_states)
key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size)
key_value = key_value.permute(0, 2, 1, 3)
key, value = flow.chunk(key_value, chunks=2, dim=-1)
else:
raise ValueError(
"past_key_value and encoder_states cannot be None at the same time."
)
else:
# if it is self attention, query, key, and value are all obtained from hidden_states.
# when in the inference phase of an incremental decoder,
# hidden_states is the last-added state,
# the full key and value could be obtained by concatenating with past_key_value.
query_key_value = self.query_key_value(hidden_states)
query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
query_key_value = query_key_value.permute(
0, 2, 1, 3
) # [bsz, num_heads, src_len, 3 * head_size]
query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
if past_key_value is not None:
past_key, past_value = past_key_value
key = flow.cat((past_key.type_as(key), key), dim=2)
value = flow.cat((past_value.type_as(value), value), dim=2)
# query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size]
if use_cache:
past_key_value = (key, value)
# [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)]
attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor)
# [S(0), S(1)] x [S(0), B] = [S(0), S(1)]
if attention_mask is not None:
if self.scale_mask_softmax_fusion:
if self.attn_mask_type == AttnMaskType.padding:
attention_mask = (
attention_mask.expand_as(attention_scores) if use_cache else attention_mask
)
attention_weights = flow._C.fused_scale_mask_softmax_dropout(
attention_scores,
attention_mask,
fill_value=-10000.0,
scale=self.coeff,
p=self.attention_dropout_prob,
)[0]
else:
if self.coeff is not None:
attention_scores *= self.coeff
attention_scores = flow.mul(attention_scores, attention_mask)
attention_scores = attention_scores - 10000.0 * (1 - attention_mask)
# TODO(xingyu.liao): graph mode raises `where_scalar` errors
# when using `masked_fill`
# attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0)
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
else:
if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal:
attention_weights = flow._C.fused_scale_tril_softmax_mask_scale(
attention_scores,
p=self.attention_dropout_prob,
diagonal=0,
tril_scale_value=self.coeff,
tril_fill_value=-10000.0,
)[0]
else:
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
# Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)]
context = flow.matmul(attention_weights, value)
# Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size]
context = context.transpose(1, 2)
# Concat multi-head results from
# [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size]
# SBP sign: [S(0), S(2)]
# [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B]
output = self.dense(context.flatten(2))
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.output_dropout(output)
if use_cache:
output = (output, past_key_value)
return output
def extra_repr(self) -> str:
return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
self.hidden_size,
self.num_heads,
self.is_cross_attention,
)
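# Shape walkthrough (an illustrative sketch assuming hidden_size=768, num_attention_heads=12,
# self-attention without cache):
#   hidden_states:                         [bsz, tgt_len, 768]
#   query_key_value(hidden_states):        [bsz, tgt_len, 3 * 768]
#   view + permute:                        [bsz, 12, tgt_len, 3 * 64], chunked into q, k, v
#   attention_scores = q @ k^T / sqrt(64): [bsz, 12, tgt_len, tgt_len]
#   context = softmax(scores) @ v:         [bsz, 12, tgt_len, 64]
#   context.transpose(1, 2).flatten(2):    [bsz, tgt_len, 768] -> dense -> output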
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Conv1D(nn.Module):
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
dtype=flow.float32,
*,
layer_idx=0,
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "row":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only support ('data', 'row' and 'col')")
self.weight = flow.nn.Parameter(
flow.empty(
(in_features, out_features),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])):
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
):
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight)
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight)
else:
x = flow.matmul(x, self.weight)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
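# Parallelism note (an informal summary of the SBP cases above, not original code):
#   parallel="col":  the weight is split along out_features, so each rank produces a
#                    slice of the output features; x is made broadcast before the matmul.
#   parallel="row":  the weight is split along in_features, so x is split along its last
#                    dim and the partial matmul results are reduced back to a broadcast output.
#   parallel="data": weight and bias are broadcast; only the batch dimension is sharded.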
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
class ParallelCrossEntropyLoss(nn.Module):
"""This criterion acts like :class:`~flow.nn.CrossEntropyLoss` except it will
execute the distributed cross entropy loss computation across different GPUs.
"""
def forward(self, logits: flow.Tensor, target: flow.Tensor):
"""Function for the distributed cross entropy.
Args:
logits (flow.Tensor): vocab_parallel_logits with shape
(batch_size, seq_length, vocab_size) and sbp signature is [S(0), S(2)].
target (flow.Tensor): target with shape (batch_size, seq_length) and
sbp signature is [S(0), B].
"""
assert logits.ndim == 3
assert target.ndim == 2
assert logits.shape[0:2] == target.shape
target = target.to_global(placement=logits.placement)
# Change -1 in target to 0 because sparse_softmax_cross_entropy does not accept -1
target = target * (target >= 0)
lm_loss = flow._C.sparse_softmax_cross_entropy(
logits.view(-1, logits.shape[-1]),
target.view(-1),
)
return lm_loss
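# Worked example (hypothetical shapes, for illustration only): with logits of shape
# (2, 3, vocab_size) and target of shape (2, 3), the loss is computed per token after
# flattening logits to (6, vocab_size) and target to (6,). Padding positions marked
# with -1 are clamped to 0 by `target * (target >= 0)` before the kernel call, and are
# typically masked out by the caller when reducing the returned per-token loss.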
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
def drop_path(x, drop_prob: float = 0.5, training: bool = False, scale_by_keep: bool = True):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
if drop_prob == 0.0 or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
# similar operation to new_tensor(shape).bernoulli_(keep_prob)
random_tensor = flow.rand(*shape, dtype=x.dtype, sbp=x.sbp, placement=x.placement)
random_tensor = (random_tensor < keep_prob).to(flow.float32)
if keep_prob > 0.0 and scale_by_keep:
random_tensor = random_tensor / keep_prob
return x * random_tensor
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
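# Numeric sketch (illustrative only): with drop_prob=0.25 in training mode, each sample's
# residual branch is kept with probability 0.75 and, when kept, scaled by 1 / 0.75, so the
# output matches evaluation mode in expectation:
#   E[x * random_tensor] = x * (0.75 * (1 / 0.75) + 0.25 * 0) = x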
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.utils import distributed as dist
class Embedding(nn.Module):
"""Construct the trainable embedding module, which does not support parallelization.
This can be used for positional embedding and token type embedding.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
dtype=flow.float32,
layer_idx=0,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
assert num_embeddings > 0
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
self.init_method(self.weight)
# FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# embeddings with sbp sign: [B, B]
# [B, B] x [S(0), B] --> [S(0), B]
# ↑ ↑ ↑
# embed pos_ids pos_embed
input_embeds = flow._C.gather(weight, input_ids, axis=0)
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class VocabEmbedding(nn.Module):
"""Construct the word embeddings, which may be split along vocabulary dimension.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
# Word token embedding shape with (vocab_size, hidden_size)
# sbp: [B, S(0)]
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
# Initialize the word embedding
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
self.init_method(self.weight)
# FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# input_ids with shape (batch_size, seq_len), and sbp sign: [S(0), B]
# Gather forward sbp sign
# [B, S(0)] x [S(0), B] --> [S(0), P]
# ↑ ↑ ↑
# embed input_ids input_embeds
input_embeds = flow._C.gather(weight, input_ids, axis=0)
# Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class SinePositionalEmbedding(nn.Module):
"""Construct the sinusoidal positional embeddings.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
"""
def __init__(self, num_embeddings, embedding_dim):
super().__init__()
self.embedding_dim = embedding_dim
self.num_embeddings = num_embeddings
position_embedding = flow.zeros(
num_embeddings,
embedding_dim,
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
position = flow._C.global_arange(
start=0,
end=num_embeddings,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
).unsqueeze(1)
position_range = flow._C.global_arange(
start=0,
end=embedding_dim,
step=2,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
)
div_term = flow.exp(position_range * (-math.log(10000.0) / embedding_dim))
position_embedding[:, 0::2] = flow.sin(position * div_term)
position_embedding[:, 1::2] = flow.cos(position * div_term)
self.register_buffer("position_embedding", position_embedding)
def forward(self, position_ids):
position_embeds = flow._C.gather(self.position_embedding, position_ids, axis=0)
return position_embeds
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
return s.format(**self.__dict__)
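# For reference, the buffer built above follows the standard sinusoidal encoding
# (restated here, not original code):
#   PE[pos, 2i]   = sin(pos / 10000^(2i / embedding_dim))
#   PE[pos, 2i+1] = cos(pos / 10000^(2i / embedding_dim))
# where `div_term` equals exp(-2i * ln(10000) / embedding_dim) = 10000^(-2i / embedding_dim).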
class PatchEmbedding(nn.Module):
"""2D Image to Patch Embedding
Arguments:
img_size: size of the input image. Defaults to 224.
patch_size: embedded patch size. Defaults to 16.
in_chans: number of input channels. Defaults to 3.
embed_dim: dimension of the embedded patch. Defaults to 768.
norm_layer: normalization layer applied to the patch embedding, or None to skip it.
Defaults to None.
flatten: flatten the patch embedding or keep the 2-D shape. Defaults to True.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
*,
layer_idx=0,
):
super().__init__()
img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
patch_size = patch_size if isinstance(patch_size, tuple) else (patch_size, patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
B, C, H, W = x.shape
assert (
H == self.img_size[0]
), f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
assert (
W == self.img_size[1]
), f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
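# Shape sketch (assuming the defaults img_size=224, patch_size=16, embed_dim=768):
#   grid_size   = (224 // 16, 224 // 16) = (14, 14)
#   num_patches = 14 * 14 = 196
#   forward: [B, 3, 224, 224] -> proj -> [B, 768, 14, 14]
#            -> flatten(2).transpose(1, 2) -> [B, 196, 768]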
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: shape of the trailing dimensions over which normalization is applied,
given as an int or a tuple of ints.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
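# Illustrative usage sketch (assumes the distributed environment is already configured):
# LayerNorm keeps its affine parameters broadcast over both parallel dimensions, so it can be
# placed on any pipeline stage via `layer_idx`.
#
#     norm = LayerNorm(768, eps=1e-5, layer_idx=0)
#     # hidden_states: global tensor of shape (batch_size, seq_length, 768)
#     normed = norm(hidden_states)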
class RMSLayerNorm(nn.Module):
"""T5 uses a layer_norm which only scales and doesn't shift, which is also known as
Root Mean Square Layer Normalization thus varience is calculated w/o mean and
there is no bias. More details see: https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon)
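# Illustrative note: `flow._C.rms_norm` is assumed here to implement RMSNorm as described in
# https://arxiv.org/abs/1910.07467, i.e. scaling only, with no mean subtraction and no bias:
#
#     rms = sqrt(mean(hidden_states ** 2, dim=-1, keepdim=True) + eps)
#     out = weight * hidden_states / rms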
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
    In row parallelism, A^T is parallelized along the first dimension and x along its second
    dimension as:
.. math::
A^T = \begin{bmatrix}
A\_1 \\
. \\
. \\
. \\
A\_p
\end{bmatrix}
x = \begin{bmatrix}
x\_1 & ... & x\_p
\end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
        parallel: parallel mode, one of ``"data"``, ``"col"`` or ``"row"``. Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
dtype: the dtype of weight. Defaults to ``flow.float32``
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
dtype=flow.float32,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
            # weight sbp sign: [B, S(0)]; the weight is transposed when performing matmul,
            # so the effective weight sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
            # weight sbp sign: [B, S(1)]; the weight is transposed when performing matmul,
            # so the effective weight sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
            raise KeyError(f"{parallel} is not supported! Only 'data', 'row' and 'col' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
            # Unsupported weight sbp: let the sbp be deduced and communicated with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Give an alias for Linear1d
Linear = Linear1D
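# Illustrative usage sketch (assumes the distributed environment is already set up): a typical
# 1D tensor-parallel pattern is a column-parallel Linear followed by a row-parallel Linear, so
# the intermediate activation stays sharded and only the final output is reduced. This is the
# same wiring used by the MLP module later in this commit.
#
#     fc1 = Linear(1024, 4096, parallel="col", layer_idx=0)
#     fc2 = Linear(4096, 1024, parallel="row", layer_idx=0)
#     # x: global tensor of shape (batch_size, seq_length, 1024) with sbp (S(0), B)
#     y = fc2(fc1(x))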
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
        # When using input.grad as the head node for the backward pass, its sbp sign
        # needs to be converted from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
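# Illustrative note: LMLogits implements weight tying -- the output projection reuses the
# (vocab_size, hidden_size) word embedding matrix, so
#     logits = hidden_states @ word_embeddings^T   # (batch_size, seq_length, vocab_size)
# A usage sketch, assuming a model that exposes its embedding weight like the GPT model below:
#
#     lm_head = LMLogits(vocab_size, bias=False)
#     logits = lm_head(hidden_states, model.embeddings.token_embeddings.weight)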
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
    The MLP takes an input with hidden size h, projects it to the intermediate
    hidden dimension (ffn_hidden_size), applies a GELU transformation, and projects the
    state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
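# Illustrative usage sketch (assumes the distributed environment is already configured): with
# both fusion flags disabled, the forward pass reduces to
#     dropout(dense_4h_to_h(gelu(dense_h_to_4h(x))))
#
#     mlp = MLP(hidden_size=768, ffn_hidden_size=3072, output_dropout_prob=0.1, layer_idx=0)
#     # hidden_states: global tensor of shape (batch_size, seq_length, 768), sbp (S(0), B)
#     out = mlp(hidden_states)  # same shape as the input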
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
Transformer layer takes input with size [bsz, seq_length, hidden size] and returns an
output of the same size.
    The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
        ffn_hidden_size: size of the feed-forward neural network.
        num_attention_heads: number of attention heads.
        is_decoder: used to specify whether this is a transformer encoder layer or a transformer
            decoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
        apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
            Default: ``False``.
        apply_residual_post_layernorm: if ``True``, use the original BERT residual
            connection ordering. Otherwise, use the Megatron-style BERT residual connection,
            which is more stable when scaling model size, introduced in
            https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
            attention_mask: the combination of the key padding mask and the causal mask of hidden
                states with shape (batch_size, 1, seq_length, seq_length) and the sbp
                signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
            past_key_value: tuple of key and value, each of shape
                (seq_length, bsz, num_heads, head_size). For a decoder layer,
                past_key_value contains the states from both self attention
                and cross attention.
use_cache: it will be set to `True` when the model is in the inference phase and
used for incremental decoding.
"""
        # Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
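# Illustrative usage sketch (assumes the distributed environment is already configured): a
# stack is built by giving each layer its own `layer_idx`, which drives its pipeline-parallel
# placement; the GPT `Transformer` module later in this commit follows the same pattern.
#
#     layers = nn.ModuleList(
#         [TransformerLayer(768, 3072, num_attention_heads=12, layer_idx=i) for i in range(12)]
#     )
#     for layer in layers:
#         hidden_states = layer(hidden_states, attention_mask)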
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bert_model import BertForPreTraining, BertModel, BertForClassification
from .roberta_model import RobertaForPreTraining, RobertaForCausalLM, RobertaModel
from .build import build_graph, build_model
from .t5_model import T5ForPreTraining, T5Model
from .gpt_model import GPTForPreTraining, GPTModel
from .vision_transformer import VisionTransformer
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
from .resmlp import ResMLP
__all__ = [
"build_model",
"build_graph",
"BertModel",
"BertForPreTraining",
"BertForClassification",
"RobertaModel",
"RobertaForCausalLM",
"RobertaForPreTraining",
"T5Model",
"T5ForPreTraining",
"GPTModel",
"GPTForPreTraining",
"VisionTransformer",
"SwinTransformer",
"SwinTransformerV2",
"ResMLP",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class BertExtendedAttnMask(nn.Module):
def forward(self, attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
return extended_attention_mask
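# Illustrative note: for a single sequence with padding mask [1, 1, 0], the outer product
# above yields the 2D mask
#     [[1, 1, 0],
#      [1, 1, 0],
#      [0, 0, 0]]
# which is then unsqueezed to shape [batch_size, 1, seq_length, seq_length] so it broadcasts
# over the attention heads.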
class BertEmbeddings(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.vocab_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_sequence_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
        # NOTE(l1aoxingyu): Set position_ids sbp sign to [B, B] initially, because position_ids is a
        # 1D tensor from 0 to seq_length; if it were set to [S(0), B] at first, position_ids
        # would be split along the first dim of the hierarchy.
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
self.position_ids.size(),
dtype=flow.long,
sbp=self.position_ids.sbp,
placement=self.position_ids.placement,
)
else:
self.tokentype_embeddings = None
self.embedding_dropout = nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
# Change position_ids sbp sign: [B, B] -> [S(0), B]
position_ids = (
self.position_ids[:, :seq_length].expand_as(input_ids).to_global(sbp=input_ids.sbp)
)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def word_embeddings(self):
return self.vocab_embeddings.weight
class BertLMPredictionHead(nn.Module):
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), layer_idx=-1)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
grad_sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(2)])
)
# NOTE(l1aoxingyu): hidden_states shape is [B, S, H] whose sbp sign: [S(0), S(2)]
# Change from [S(0), S(2)] -> [S(0), B] because layernorm cannot get inputs with sbp S(2)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
return hidden_states
class BertPooler(nn.Module):
"""Pooler layer.
Pool hidden states of the first token and
add a linear transformation followed by a tanh.
Args:
hidden_size: hidden state feature dimension
"""
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="col",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("tanh")
def forward(self, hidden_states):
"""Just "pool" the model by simply taking the [CLS] token corresponding
to the first token."""
# hidden_states: [bsz, seq_len, hidden_size]
select_token_tensor = hidden_states[:, 0, :]
pooled_output = self.dense(select_token_tensor)
pooled_output = self.activation_func(pooled_output)
return pooled_output
class BertLoss(nn.Module):
def __init__(self, add_binary_head):
super().__init__()
self.add_binary_head = add_binary_head
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask, binary_logits, ns_labels):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
binary_logits = binary_logits.to_global(placement=lm_output.placement)
ns_labels = ns_labels.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = (
loss_mask.sum().to_global(sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
+ 1e-7
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
# NOTE(l1aoxingyu): Change lm loss sbp sign [P, P] -> [P, B] to add with sop loss
# whose sbp sign: [P, B]
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
if self.add_binary_head:
sop_loss = flow._C.cross_entropy(
binary_logits, ns_labels, ignore_index=-1, reduction="none"
).mean()
loss_dict["sop_loss"] = sop_loss
return loss_dict
class BertModel(nn.Module):
"""The bare Bert Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
hidden_dropout_prob (float, optional):
The dropout ratio for the output for each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
        max_position_embeddings (int):
            Max sequence length of input, which defines the shape of the position embeddings
            in ``BertEmbeddings``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not averaging or pooling the sequence of hidden-states for the
whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
            If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
        apply_residual_post_layernorm (bool, optional):
            If set to ``True``, use the original BERT residual connection ordering; otherwise,
            use the Megatron-style BERT residual connection, which is more stable when scaling
            model size, introduced in https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
        amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weight in the BERT model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
# Embeddings
self.embeddings = BertEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = BertExtendedAttnMask()
# Encoders
self.encoders = nn.ModuleList(
[
TransformerLayer(
hidden_size,
intermediate_size,
num_attention_heads,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
init_method=init_method,
output_layer_init_method=scaled_init_method,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding, # bert mask type
layer_idx=i,
)
for i in range(hidden_layers)
]
)
self.final_layernorm = LayerNorm((hidden_size,), eps=layernorm_eps, layer_idx=-1)
self.pooler = BertPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids, attention_mask, tokentype_ids=None):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention
on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first and
second portions of the inputs. Indices are selected in `[0, 1]`. Defaults to None.
"""
extended_attention_mask = self.extended_attn_mask(attention_mask)
embedding_output = self.embeddings(input_ids, tokentype_ids)
hidden_states = embedding_output
for layer in self.encoders:
hidden_states = layer(hidden_states, extended_attention_mask)
encoder_output = self.final_layernorm(hidden_states)
pooled_output = self.pooler(encoder_output) if self.pooler is not None else None
return encoder_output, pooled_output
def word_embeddings_weight(self):
return self.embeddings.word_embeddings()
class BertPreTrainingHeads(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, add_binary_head=True):
super().__init__()
self.predictions = BertLMPredictionHead(hidden_size, init_method)
self.seq_relationship = Linear(
hidden_size,
2,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.lm_logits = LMLogits(vocab_size, bias=True)
self.loss_func = BertLoss(add_binary_head)
def forward(
self,
sequence_output,
pooled_output,
word_embeddings_weight,
ns_labels,
lm_labels,
loss_mask,
):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
prediction_scores = self.lm_logits(prediction_scores, word_embeddings_weight)
if lm_labels is not None:
return self.loss_func(
prediction_scores, lm_labels, loss_mask, seq_relationship_score, ns_labels
)
return {
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
class BertForPreTraining(nn.Module):
"""Bert Model with two heads on top as done during the pretraining: a
`masked language modeling` head and a `next sentence prediction (classification)` head.
"""
def __init__(self, cfg):
super().__init__()
self.bert = BertModel(cfg)
self.cls_head = BertPreTrainingHeads(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.add_binary_head,
)
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
ns_labels=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
ns_labels (flow.LongTensor, optional): Labels for computing the next sequence prediction
(classification) loss. Input should be a sequence pair (see `input_ids` docstring).
Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
sequence_output, pooled_output = outputs[:2]
return self.cls_head(
sequence_output,
pooled_output,
self.bert.word_embeddings_weight(),
ns_labels,
lm_labels,
loss_mask,
)
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.bert.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, BertEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, BertExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, BertPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, BertPreTrainingHeads):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), BertEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), BertExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), BertPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), BertPreTrainingHeads):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class BertForClassification(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.num_labels = cfg.num_labels
self.bert = BertModel(cfg)
self.classifier = Linear(
cfg.hidden_size,
cfg.num_labels,
bias=True,
parallel="row",
init_method=init_method_normal(cfg.initializer_range),
layer_idx=-1,
)
classifier_dropout = (
cfg.classifier_dropout
if cfg.classifier_dropout is not None
else cfg.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
def forward(self, input_ids, attention_mask, tokentype_ids=None, labels=None, **kwargs):
labels = labels if labels is not None else kwargs.get("ns_labels")
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
loss = loss.to_global(sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]))
return {"cls_loss": loss}
else:
return {"logits": logits}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate, try_get_key
def build_model(cfg):
"""Build the whole model architecture, defined by ``cfg.model``.
Note that it does not load any weights from ``cfg``.
"""
model = instantiate(cfg)
return model
def build_graph(cfg, model, optimizer=None, lr_scheduler=None, is_train=False):
"""Build the `nn.Graph`, defined by ``cfg.graph``."""
auto_parallel_conf = try_get_key(cfg, "graph.auto_parallel", default=None)
if is_train:
# Set train graph
assert optimizer is not None, "optimizer must be set for train graph"
assert lr_scheduler is not None, "lr_scheduler must be set for train graph"
graph = cfg.graph.train_graph
graph.model = model
graph.optimizer = optimizer
graph.lr_scheduler = lr_scheduler
graph.fp16 = try_get_key(cfg, "train.amp.enabled", default=False)
graph.activation_checkpoint = try_get_key(
cfg, "train.activation_checkpoint.enabled", default=False
)
graph.zero_optim = try_get_key(cfg, "train.zero_optimization.enabled", default=False)
graph.zero_stage = try_get_key(cfg, "train.zero_optimization.stage", default=1)
graph.grad_acc_steps = try_get_key(cfg, "train.num_accumulation_steps", default=1)
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
else:
# Set eval graph
graph = cfg.graph.eval_graph
graph.model = model
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
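# Illustrative usage sketch (assumes `cfg` follows the LazyConfig layout referenced above,
# with `cfg.model`, `cfg.graph.train_graph`, `cfg.graph.eval_graph` and the `train.*` keys):
#
#     model = build_model(cfg.model)
#     train_graph = build_graph(cfg, model, optimizer, lr_scheduler, is_train=True)
#     eval_graph = build_graph(cfg, model, is_train=False)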
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class CasualMask(nn.Module):
"""
    Create a causal mask and combine it with the padding mask.
    It is used in the GPT model and the T5 decoder.
    When used in the T5 decoder, the argument `layer_idx` should be set to the first decoder
    layer index.
"""
def __init__(self, max_positions=1024, *, layer_idx=0):
super().__init__()
self.mask = flow.tril(
flow.ones(
(max_positions, max_positions),
dtype=flow.int8,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
def forward(self, input_ids, past_length=0, attention_mask=None):
bsz, tgt_len = input_ids.size()
casual_mask = self.mask[:tgt_len, :tgt_len]
if past_length > 0:
            # in case past_key_values are used, we need to add a prefix ones mask to the causal mask
casual_mask = flow.cat(
[flow.ones(tgt_len, past_length, dtype=flow.int8), casual_mask], dim=-1
)
casual_mask = (
casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
)
casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
if attention_mask is not None:
assert attention_mask.dim() == 4, "please extend the attention mask first"
casual_mask = casual_mask * attention_mask
return casual_mask
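# Illustrative note: for tgt_len = 4 and past_length = 0, the sliced mask above is the
# lower-triangular matrix
#     [[1, 0, 0, 0],
#      [1, 1, 0, 0],
#      [1, 1, 1, 0],
#      [1, 1, 1, 1]]
# expanded to shape (batch_size, 1, tgt_len, tgt_len), so each position can only attend to
# itself and earlier positions.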
class GPTModel(nn.Module):
"""GPT-2 language model. The output of the forward method is logits.
Args:
hidden_layers (int): The number of ``TransformerLayer`` in the gpt model.
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
ffn_hidden_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
        max_seq_length (int, optional):
            Max sequence length of input, which defines the shape of the position embeddings
            in ``GPTEmbedding``. Defaults to 1024.
embedding_dropout_prob (float, optional):
The dropout ratio for the output of GPTEmbedding Layer. Defaults to 0.0.
attention_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
output_dropout_prob (float, optional):
The dropout ratio for the output for each TransformerLayer. Defaults to 0.0.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
use_scaled_init_for_output_weights (bool, optional): Defaults to ``True``.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
            If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``False``.
        apply_residual_post_layernorm (bool, optional):
            If set to ``True``, use the original BERT residual connection ordering; otherwise,
            use the Megatron-style BERT residual connection, which is more stable when scaling
            model size, introduced in https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
        amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weight in the GPT model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
hidden_layers,
vocab_size,
hidden_size,
ffn_hidden_size,
num_attention_heads,
max_seq_length=1024,
embedding_dropout_prob=0.0,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
initializer_range=0.02,
use_scaled_init_for_output_weights=True,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(sigma=initializer_range)
if use_scaled_init_for_output_weights:
output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
else:
output_layer_init_method = init_method
self.embeddings = GPTEmbedding(
vocab_size,
hidden_size,
max_seq_length,
init_method=init_method,
embedding_dropout_prob=embedding_dropout_prob,
amp_enabled=amp_enabled,
)
self.transformer = Transformer(
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
)
self.lm_head = LMLogits(vocab_size, bias=False)
@classmethod
def from_config(cls, cfg):
return {
"hidden_layers": cfg.hidden_layers,
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"ffn_hidden_size": cfg.ffn_hidden_size,
"num_attention_heads": cfg.num_attention_heads,
"max_seq_length": cfg.max_seq_length,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"attention_dropout_prob": cfg.attention_dropout_prob,
"output_dropout_prob": cfg.output_dropout_prob,
"layernorm_epsilon": cfg.layernorm_epsilon,
"initializer_range": cfg.initializer_range,
"use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
Returns:
flow.Tensor: logits
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
input_embeds = self.embeddings(input_ids, 0)
transformer_output = self.transformer(input_embeds, attention_mask=None)
output = self.lm_head(transformer_output, self.embeddings.token_embeddings.weight)
return output
class GPTEmbedding(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_seq_length,
init_method=init.xavier_normal_,
embedding_dropout_prob=0.0,
amp_enabled=False,
):
super().__init__()
self.token_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_seq_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.dropout = nn.Dropout(embedding_dropout_prob)
self.position_ids = flow.arange(
max_seq_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
input_embeds = token_embeds + position_embeds
input_embeds = self.dropout(input_embeds)
return input_embeds
class Transformer(nn.Module):
def __init__(
self,
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
init_method=init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
):
super().__init__()
self.hidden_layers = hidden_layers
def build_layer(layer_number):
return TransformerLayer(
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.causal,
layer_idx=layer_number,
)
self.layers = nn.ModuleList([build_layer(i) for i in range(self.hidden_layers)])
self.layernorm_f = LayerNorm(hidden_size, eps=layernorm_epsilon, layer_idx=-1)
def forward(self, hidden_states, attention_mask):
# hidden_states shape: (batch_size, seq_length, hidden_size)
# sbp: [S(0), B]
for i, layer in enumerate(self.layers):
hidden_states = layer(hidden_states, attention_mask)
output = self.layernorm_f(hidden_states)
return output
class GPTLoss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels):
lm_loss = self.lm_loss(logits, lm_labels)
lm_loss = lm_loss.mean()
return {"lm_loss": lm_loss}
class GPTForPreTraining(nn.Module):
"""
    GPT Model with a language modeling head on top.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.GPT_model = GPTModel(cfg)
self.loss_func = GPTLoss()
def forward(
self,
input_ids,
labels=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
labels (flow.LongTensor, optional): Labels for computing language modeling loss.
None for evaluating. Defaults to None.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation.
:code:`{"masked_lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.GPT_model(input_ids)
if labels is not None:
lm_loss = self.loss_func(logits, labels)
return lm_loss
else:
return {"prediction_scores": logits}
@staticmethod
def set_pipeline_stage_id(model: nn.Module):
dist_utils = dist.get_dist_util()
if hasattr(model.GPT_model.transformer.layernorm_f, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, (GPTEmbedding, CasualMask)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, (LMLogits, GPTLoss)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), (GPTEmbedding, CasualMask)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), (LMLogits, GPTLoss)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
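# Hedged sketch of what the stage assignment above achieves (config keys are
# assumptions, shown for illustration only): with 2 pipeline stages and 24
# transformer layers, get_layer_stage_id maps layers 0-11 to stage 0 and
# layers 12-23 to stage 1, while the embedding / causal mask stay on the first
# stage and the logits / loss land on the last.
#
#     # e.g. in a LiBai-style config (names assumed):
#     # train.dist.pipeline_parallel_size = 2
#     # train.dist.pipeline_num_layers = 24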
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ResMLP Model
# References:
# resmlp: https://github.com/facebookresearch/deit/blob/main/resmlp_models.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear, PatchEmbedding
class Affine(nn.Module):
def __init__(self, dim, *, layer_idx=0):
super().__init__()
self.alpha = nn.Parameter(
flow.ones(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.beta = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
return self.alpha * x + self.beta
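# Affine is a per-channel learnable scale and shift, y = alpha * x + beta,
# initialized to the identity (alpha = 1, beta = 0). Hedged toy sketch
# (ignoring placement/sbp; names are illustrative):
#
#     aff = Affine(dim=4)
#     # at initialization aff(x) == x, since alpha is all ones and beta all zeros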
class layers_scale_mlp_blocks(nn.Module):
def __init__(
self, dim, drop=0.0, drop_path=0.0, init_values=1e-4, num_patches=196, *, layer_idx=0
):
super().__init__()
self.norm1 = Affine(dim, layer_idx=layer_idx)
self.attn = Linear(num_patches, num_patches, layer_idx=layer_idx)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = Affine(dim, layer_idx=layer_idx)
self.mlp = MLP(hidden_size=dim, ffn_hidden_size=int(4.0 * dim), layer_idx=layer_idx)
self.gamma_1 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.gamma_2 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
x = x + self.drop_path(
self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
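# The block above performs ResMLP's two mixing steps: `self.attn` is a Linear
# over the patch axis (token mixing, hence the transpose(1, 2) before and
# after), and `self.mlp` mixes channels; gamma_1 / gamma_2 are per-channel
# layer-scale factors initialized to init_values. Hedged shape walk-through:
#
#     x: (B, N, D)                   # N patches, D channels
#     x.transpose(1, 2): (B, D, N)   # Linear(N, N) mixes across patches
#     ... .transpose(1, 2): (B, N, D)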
class ResMLP(nn.Module):
"""ResMLP in LiBai.
LiBai's implementation of:
`ResMLP: Feedforward networks for image classification with data-efficient training
<https://arxiv.org/abs/2105.03404>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
drop_rate (float): dropout rate
drop_path_rate (float): stochastic depth rate
init_scale (float): the layer scale ratio
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
drop_rate=0.0,
drop_path_rate=0.0,
init_scale=1e-4,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
        dpr = [drop_path_rate for i in range(depth)]  # uniform drop-path rate for every block
self.blocks = nn.ModuleList(
[
layers_scale_mlp_blocks(
dim=embed_dim,
drop=drop_rate,
drop_path=dpr[i],
init_values=init_scale,
num_patches=num_patches,
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = Affine(embed_dim, layer_idx=-1)
self.head = (
Linear(embed_dim, num_classes, layer_idx=-1) if num_classes > 0 else nn.Identity()
)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
self.apply(self._init_weights)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"init_scale": cfg.init_scale,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward_features(self, x):
x = self.patch_embed(x)
# layer scale mlp blocks
for i, blk in enumerate(self.blocks):
x = blk(x)
return x
def forward_head(self, x):
B = x.shape[0]
x = self.norm(x)
x = x.mean(dim=1).reshape(B, 1, -1)
return self.head(x[:, 0])
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.loss_func, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).activation_checkpointing = True
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.utils import distributed as dist
from .bert_model import BertEmbeddings, BertExtendedAttnMask, BertModel, BertPooler
from .utils import init_method_normal
class RobertaExtendedAttnMask(BertExtendedAttnMask):
"""
Same as BertExtendedAttnMask.
"""
class RobertaEmbeddings(BertEmbeddings):
"""
    Same as BertEmbeddings, with small tweaks to vocab_embeddings and position_embeddings
    (padding_idx support and RoBERTa-style position ids).
"""
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
pad_token_id=1,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=num_tokentypes,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.pad_token_id = pad_token_id
self.vocab_embeddings = VocabEmbedding(
vocab_size,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
self.position_embeddings = Embedding(
max_sequence_length,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
1,
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
else:
self.tokentype_embeddings = None
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
position_ids = self.create_position_ids_from_input_ids(input_ids, self.pad_token_id)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def create_position_ids_from_input_ids(self, input_ids, pad_token_id):
mask = input_ids.ne(pad_token_id).int()
position_ids = (flow.cumsum(mask, dim=1).type_as(mask)) * mask + pad_token_id
position_ids = position_ids.to_global(sbp=input_ids.sbp, placement=input_ids.placement)
return position_ids
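# Worked example of the RoBERTa-style position ids above (pad_token_id = 1):
#
#     input_ids      = [5, 8, 9, 1, 1]   # the last two tokens are padding
#     mask           = [1, 1, 1, 0, 0]
#     cumsum * mask  = [1, 2, 3, 0, 0]
#     + pad_token_id = [2, 3, 4, 1, 1]
#
# Real tokens get positions starting at pad_token_id + 1, while padding
# positions collapse onto pad_token_id, matching the padding_idx used by
# position_embeddings above.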
class RobertaPooler(BertPooler):
"""
Same as BertPooler.
"""
class RobertaLoss(nn.Module):
def __init__(self):
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
        # Change the sbp of loss_mask.sum() from [P, B] to [B, B],
        # because (lm_loss * loss_mask) / loss_mask.sum() cannot divide P by P.
denominator = loss_mask.sum().to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
return loss_dict
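# The loss above is the standard mask-weighted mean:
#     masked_lm_loss = sum(per_token_loss * loss_mask) / sum(loss_mask)
# Hedged toy example (3 masked positions out of 5):
#
#     per_token_loss = [2.0, 1.0, 3.0, 0.5, 0.5]
#     loss_mask      = [1,   1,   1,   0,   0  ]
#     masked_lm_loss = (2.0 + 1.0 + 3.0) / 3 = 2.0
#
# The to_global calls only adjust sbp signatures so the division is legal for
# OneFlow global tensors; they do not change the value.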
class RobertaModel(BertModel):
"""The bare Roberta Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int):
The size of vocabulary file.
hidden_size (int):
The size of hidden states.
hidden_layers (int):
The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each
``TransformerLayer``.
hidden_dropout_prob (float, optional):
            The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``RobertaEmbeddings``.
        num_tokentypes (int, optional):
            Number of token types (segment indices). Defaults to 2.
        add_pooling_layer (bool, optional):
            Whether or not to add a pooling layer that pools the sequence of
            hidden-states for the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
        layernorm_eps (float, optional):
            The epsilon of LayerNorm layers. Defaults to 1e-12.
pad_token_id (int, optional):
The token id used for padding. Defaults to 1.
        bias_gelu_fusion (bool, optional):
            Whether or not to fuse the computing of bias and gelu. Defaults to ``True``.
        bias_dropout_fusion (bool, optional):
            Whether or not to fuse the computing of dropout and bias. Defaults to ``True``.
        scale_mask_softmax_fusion (bool, optional):
            Whether to fuse the computing of mask and softmax in attention layers.
            Defaults to ``True``.
        apply_query_key_layer_scaling (bool, optional):
            Whether or not to use layer-index-dependent scaling when computing
            attention scores. If ``True``, the scaling factor equals
            sqrt(d) * (layer_index + 1). Defaults to ``True``.
        apply_residual_post_layernorm (bool, optional):
            If ``True``, use the original BERT (RoBERTa) residual connection ordering;
            otherwise use the Megatron-BERT residual connection, which is more stable
            when scaling model size (https://arxiv.org/pdf/1909.08053.pdf).
            Defaults to ``False``.
amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weights in the RoBERTa model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
pad_token_id=1,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=num_tokentypes,
add_pooling_layer=add_pooling_layer,
initializer_range=initializer_range,
layernorm_eps=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
amp_enabled=amp_enabled,
)
init_method = init_method_normal(initializer_range)
# Embeddings
self.embeddings = RobertaEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
pad_token_id,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = RobertaExtendedAttnMask()
self.pooler = RobertaPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"pad_token_id": cfg.pad_token_id,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
class RobertaLMHead(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, layer_norm_eps):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), eps=layer_norm_eps, layer_idx=-1)
        # NOTE(xzp): LMLogits acts as the decoder (an nn.Linear(hidden_size, vocab_size));
        # it shares roberta's word_embeddings weight instead of owning its own projection.
self.lm_logits = LMLogits(vocab_size, bias=True)
def forward(self, hidden_states, word_embeddings_weight):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
hidden_states = self.lm_logits(hidden_states, word_embeddings_weight)
return hidden_states
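# Hedged sketch of the weight tying above (the call is illustrative, not part
# of the original file): the head owns no vocab-sized projection; it reuses
# the input embedding matrix via LMLogits at the last pipeline stage.
#
#     lm_head = RobertaLMHead(vocab_size, hidden_size, init_method, layernorm_eps)
#     logits = lm_head(hidden_states, roberta.word_embeddings_weight())
#     # logits: (batch, seq_len, vocab_size)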
class RobertaPreTrainedModel(nn.Module):
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.roberta.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, RobertaEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, RobertaExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
                # `add_pooling_layer` defaults to False in RobertaForMaskedLM
                # and RobertaForCausalLM, so the pooler may be absent.
elif isinstance(module_block.origin, RobertaPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, RobertaLMHead):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.to(nn.Module), RobertaEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), RobertaExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
                # `add_pooling_layer` defaults to False in RobertaForMaskedLM
                # and RobertaForCausalLM, so the pooler may be absent.
elif isinstance(module_block.to(nn.Module), RobertaPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), RobertaLMHead):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class RobertaForPreTraining(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
            lm_labels (flow.LongTensor, optional): Labels for computing the masked
                language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
                Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.roberta(input_ids, attention_mask, tokentype_ids=tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if lm_labels is not None:
return self.loss_fc(prediction_scores, lm_labels, loss_mask)
return {"prediction_scores": prediction_scores}
class RobertaForCausalLM(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
position_ids=None,
labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
position_ids (flow.LongTensor, optional): Indices of positions of each input sequence
tokens in the position embeddings. Defaults to None.
labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
outputs = self.roberta(input_ids, attention_mask, position_ids, tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if labels is not None:
# next-token prediction task, shift prediction_scores and labels by one.
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
shifted_prediction_scores = shifted_prediction_scores.to_global(
sbp=prediction_scores.sbp
)
shifted_labels = labels[:, 1:].contiguous()
shifted_labels = shifted_labels.to_global(sbp=shifted_labels.sbp)
            # RobertaLoss already returns a dict: {"lm_loss": masked_lm_loss}
            return self.loss_fc(shifted_prediction_scores, shifted_labels, loss_mask)
return {"prediction_scores": prediction_scores}