Commit 9fdb7dab authored by yuguo960516

bloom
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.inference.basic import BasePipeline
from libai.utils import distributed as dist
class TextGenerationPipeline(BasePipeline):
def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
"""load pretrained model.
Args:
libai_cfg_model (libai.models): Lazy config Model in Libai, you can import it
by `from libai.config.configs.common.models.bert
import pretrain_model as libai_cfg_model`
model_path (str): The directory path of pretrained model,
"""
if mode == "huggingface":
from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace
model_loader = T5LoaderHuggerFace(
libai_cfg_model,
libai_cfg_model.cfg,
model_path,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
embedding_dropout_prob=0.0,
)
return model_loader.load()
elif mode == "libai":
from projects.MT5.utils.mt5_loader import T5LoaderLibai
model_loader = T5LoaderLibai(
libai_cfg_model,
libai_cfg_model.cfg,
model_path,
)
return model_loader.load()
elif mode == "random":
from libai.engine import DefaultTrainer
return DefaultTrainer.build_model(self.cfg)
else:
raise NotImplementedError
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {**pipeline_parameters}
postprocess_params = {}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
pad: bool = False,
**kwargs,
) -> dict:
# tokenizer encoder
encoder_ids = self.tokenizer.encode(inputs, return_tensors="of", is_global=True)
encoder_input_dict = {
"encoder_ids": encoder_ids,
}
return encoder_input_dict
def forward(self, encoder_input_dict, **kwargs) -> dict:
outputs = self.model.generate(encoder_input_dict["encoder_ids"], **kwargs)
return {"return_ids": outputs}
def postprocess(self, model_output_dict, **kwargs) -> dict:
return_ids = model_output_dict["return_ids"]
records = [
{"generated_text": self.tokenizer.decode(return_ids[i], skip_special_tokens=True)}
for i in range(return_ids.size(0))
]
return records
if __name__ == "__main__":
pipeline = TextGenerationPipeline(
"/path/to/libai/projects/MT5/configs/t5_inference.py",
data_parallel=1,
tensor_parallel=2,
pipeline_parallel=2,
pipeline_stage_id=[0] * 12 + [1] * 12,
pipeline_num_layers=12 * 2,
model_path="/path/to/t5-base",
mode="huggingface",
)
text = ["summarize: She is a student, She is tall, She loves study"]
dict1 = pipeline(text)
if dist.is_main_process():
print(dict1)
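# Note: `pipeline(text)` returns the list produced by `postprocess`, one dict per input,
# e.g. [{"generated_text": "..."}] (the exact text depends on the loaded weights).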
IMAGENET_LABELS = [
"tench, Tinca tinca",
"goldfish, Carassius auratus",
"great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", # noqa: E501
"tiger shark, Galeocerdo cuvieri",
"hammerhead, hammerhead shark",
"electric ray, crampfish, numbfish, torpedo",
"stingray",
"cock",
"hen",
"ostrich, Struthio camelus",
"brambling, Fringilla montifringilla",
"goldfinch, Carduelis carduelis",
"house finch, linnet, Carpodacus mexicanus",
"junco, snowbird",
"indigo bunting, indigo finch, indigo bird, Passerina cyanea",
"robin, American robin, Turdus migratorius",
"bulbul",
"jay",
"magpie",
"chickadee",
"water ouzel, dipper",
"kite",
"bald eagle, American eagle, Haliaeetus leucocephalus",
"vulture",
"great grey owl, great gray owl, Strix nebulosa",
"European fire salamander, Salamandra salamandra",
"common newt, Triturus vulgaris",
"eft",
"spotted salamander, Ambystoma maculatum",
"axolotl, mud puppy, Ambystoma mexicanum",
"bullfrog, Rana catesbeiana",
"tree frog, tree-frog",
"tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
"loggerhead, loggerhead turtle, Caretta caretta",
"leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", # noqa: E501
"mud turtle",
"terrapin",
"box turtle, box tortoise",
"banded gecko",
"common iguana, iguana, Iguana iguana",
"American chameleon, anole, Anolis carolinensis",
"whiptail, whiptail lizard",
"agama",
"frilled lizard, Chlamydosaurus kingi",
"alligator lizard",
"Gila monster, Heloderma suspectum",
"green lizard, Lacerta viridis",
"African chameleon, Chamaeleo chamaeleon",
"Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", # noqa: E501
"African crocodile, Nile crocodile, Crocodylus niloticus",
"American alligator, Alligator mississipiensis",
"triceratops",
"thunder snake, worm snake, Carphophis amoenus",
"ringneck snake, ring-necked snake, ring snake",
"hognose snake, puff adder, sand viper",
"green snake, grass snake",
"king snake, kingsnake",
"garter snake, grass snake",
"water snake",
"vine snake",
"night snake, Hypsiglena torquata",
"boa constrictor, Constrictor constrictor",
"rock python, rock snake, Python sebae",
"Indian cobra, Naja naja",
"green mamba",
"sea snake",
"horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
"diamondback, diamondback rattlesnake, Crotalus adamanteus",
"sidewinder, horned rattlesnake, Crotalus cerastes",
"trilobite",
"harvestman, daddy longlegs, Phalangium opilio",
"scorpion",
"black and gold garden spider, Argiope aurantia",
"barn spider, Araneus cavaticus",
"garden spider, Aranea diademata",
"black widow, Latrodectus mactans",
"tarantula",
"wolf spider, hunting spider",
"tick",
"centipede",
"black grouse",
"ptarmigan",
"ruffed grouse, partridge, Bonasa umbellus",
"prairie chicken, prairie grouse, prairie fowl",
"peacock",
"quail",
"partridge",
"African grey, African gray, Psittacus erithacus",
"macaw",
"sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
"lorikeet",
"coucal",
"bee eater",
"hornbill",
"hummingbird",
"jacamar",
"toucan",
"drake",
"red-breasted merganser, Mergus serrator",
"goose",
"black swan, Cygnus atratus",
"tusker",
"echidna, spiny anteater, anteater",
"platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", # noqa: E501
"wallaby, brush kangaroo",
"koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", # noqa: E501
"wombat",
"jellyfish",
"sea anemone, anemone",
"brain coral",
"flatworm, platyhelminth",
"nematode, nematode worm, roundworm",
"conch",
"snail",
"slug",
"sea slug, nudibranch",
"chiton, coat-of-mail shell, sea cradle, polyplacophore",
"chambered nautilus, pearly nautilus, nautilus",
"Dungeness crab, Cancer magister",
"rock crab, Cancer irroratus",
"fiddler crab",
"king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", # noqa: E501
"American lobster, Northern lobster, Maine lobster, Homarus americanus", # noqa: E501
"spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", # noqa: E501
"crayfish, crawfish, crawdad, crawdaddy",
"hermit crab",
"isopod",
"white stork, Ciconia ciconia",
"black stork, Ciconia nigra",
"spoonbill",
"flamingo",
"little blue heron, Egretta caerulea",
"American egret, great white heron, Egretta albus",
"bittern",
"crane",
"limpkin, Aramus pictus",
"European gallinule, Porphyrio porphyrio",
"American coot, marsh hen, mud hen, water hen, Fulica americana",
"bustard",
"ruddy turnstone, Arenaria interpres",
"red-backed sandpiper, dunlin, Erolia alpina",
"redshank, Tringa totanus",
"dowitcher",
"oystercatcher, oyster catcher",
"pelican",
"king penguin, Aptenodytes patagonica",
"albatross, mollymawk",
"grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", # noqa: E501
"killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
"dugong, Dugong dugon",
"sea lion",
"Chihuahua",
"Japanese spaniel",
"Maltese dog, Maltese terrier, Maltese",
"Pekinese, Pekingese, Peke",
"Shih-Tzu",
"Blenheim spaniel",
"papillon",
"toy terrier",
"Rhodesian ridgeback",
"Afghan hound, Afghan",
"basset, basset hound",
"beagle",
"bloodhound, sleuthhound",
"bluetick",
"black-and-tan coonhound",
"Walker hound, Walker foxhound",
"English foxhound",
"redbone",
"borzoi, Russian wolfhound",
"Irish wolfhound",
"Italian greyhound",
"whippet",
"Ibizan hound, Ibizan Podenco",
"Norwegian elkhound, elkhound",
"otterhound, otter hound",
"Saluki, gazelle hound",
"Scottish deerhound, deerhound",
"Weimaraner",
"Staffordshire bullterrier, Staffordshire bull terrier",
"American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", # noqa: E501
"Bedlington terrier",
"Border terrier",
"Kerry blue terrier",
"Irish terrier",
"Norfolk terrier",
"Norwich terrier",
"Yorkshire terrier",
"wire-haired fox terrier",
"Lakeland terrier",
"Sealyham terrier, Sealyham",
"Airedale, Airedale terrier",
"cairn, cairn terrier",
"Australian terrier",
"Dandie Dinmont, Dandie Dinmont terrier",
"Boston bull, Boston terrier",
"miniature schnauzer",
"giant schnauzer",
"standard schnauzer",
"Scotch terrier, Scottish terrier, Scottie",
"Tibetan terrier, chrysanthemum dog",
"silky terrier, Sydney silky",
"soft-coated wheaten terrier",
"West Highland white terrier",
"Lhasa, Lhasa apso",
"flat-coated retriever",
"curly-coated retriever",
"golden retriever",
"Labrador retriever",
"Chesapeake Bay retriever",
"German short-haired pointer",
"vizsla, Hungarian pointer",
"English setter",
"Irish setter, red setter",
"Gordon setter",
"Brittany spaniel",
"clumber, clumber spaniel",
"English springer, English springer spaniel",
"Welsh springer spaniel",
"cocker spaniel, English cocker spaniel, cocker",
"Sussex spaniel",
"Irish water spaniel",
"kuvasz",
"schipperke",
"groenendael",
"malinois",
"briard",
"kelpie",
"komondor",
"Old English sheepdog, bobtail",
"Shetland sheepdog, Shetland sheep dog, Shetland",
"collie",
"Border collie",
"Bouvier des Flandres, Bouviers des Flandres",
"Rottweiler",
"German shepherd, German shepherd dog, German police dog, alsatian",
"Doberman, Doberman pinscher",
"miniature pinscher",
"Greater Swiss Mountain dog",
"Bernese mountain dog",
"Appenzeller",
"EntleBucher",
"boxer",
"bull mastiff",
"Tibetan mastiff",
"French bulldog",
"Great Dane",
"Saint Bernard, St Bernard",
"Eskimo dog, husky",
"malamute, malemute, Alaskan malamute",
"Siberian husky",
"dalmatian, coach dog, carriage dog",
"affenpinscher, monkey pinscher, monkey dog",
"basenji",
"pug, pug-dog",
"Leonberg",
"Newfoundland, Newfoundland dog",
"Great Pyrenees",
"Samoyed, Samoyede",
"Pomeranian",
"chow, chow chow",
"keeshond",
"Brabancon griffon",
"Pembroke, Pembroke Welsh corgi",
"Cardigan, Cardigan Welsh corgi",
"toy poodle",
"miniature poodle",
"standard poodle",
"Mexican hairless",
"timber wolf, grey wolf, gray wolf, Canis lupus",
"white wolf, Arctic wolf, Canis lupus tundrarum",
"red wolf, maned wolf, Canis rufus, Canis niger",
"coyote, prairie wolf, brush wolf, Canis latrans",
"dingo, warrigal, warragal, Canis dingo",
"dhole, Cuon alpinus",
"African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
"hyena, hyaena",
"red fox, Vulpes vulpes",
"kit fox, Vulpes macrotis",
"Arctic fox, white fox, Alopex lagopus",
"grey fox, gray fox, Urocyon cinereoargenteus",
"tabby, tabby cat",
"tiger cat",
"Persian cat",
"Siamese cat, Siamese",
"Egyptian cat",
"cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", # noqa: E501
"lynx, catamount",
"leopard, Panthera pardus",
"snow leopard, ounce, Panthera uncia",
"jaguar, panther, Panthera onca, Felis onca",
"lion, king of beasts, Panthera leo",
"tiger, Panthera tigris",
"cheetah, chetah, Acinonyx jubatus",
"brown bear, bruin, Ursus arctos",
"American black bear, black bear, Ursus americanus, Euarctos americanus", # noqa: E501
"ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
"sloth bear, Melursus ursinus, Ursus ursinus",
"mongoose",
"meerkat, mierkat",
"tiger beetle",
"ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
"ground beetle, carabid beetle",
"long-horned beetle, longicorn, longicorn beetle",
"leaf beetle, chrysomelid",
"dung beetle",
"rhinoceros beetle",
"weevil",
"fly",
"bee",
"ant, emmet, pismire",
"grasshopper, hopper",
"cricket",
"walking stick, walkingstick, stick insect",
"cockroach, roach",
"mantis, mantid",
"cicada, cicala",
"leafhopper",
"lacewing, lacewing fly",
"dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", # noqa: E501
"damselfly",
"admiral",
"ringlet, ringlet butterfly",
"monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
"cabbage butterfly",
"sulphur butterfly, sulfur butterfly",
"lycaenid, lycaenid butterfly",
"starfish, sea star",
"sea urchin",
"sea cucumber, holothurian",
"wood rabbit, cottontail, cottontail rabbit",
"hare",
"Angora, Angora rabbit",
"hamster",
"porcupine, hedgehog",
"fox squirrel, eastern fox squirrel, Sciurus niger",
"marmot",
"beaver",
"guinea pig, Cavia cobaya",
"sorrel",
"zebra",
"hog, pig, grunter, squealer, Sus scrofa",
"wild boar, boar, Sus scrofa",
"warthog",
"hippopotamus, hippo, river horse, Hippopotamus amphibius",
"ox",
"water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
"bison",
"ram, tup",
"bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", # noqa: E501
"ibex, Capra ibex",
"hartebeest",
"impala, Aepyceros melampus",
"gazelle",
"Arabian camel, dromedary, Camelus dromedarius",
"llama",
"weasel",
"mink",
"polecat, fitch, foulmart, foumart, Mustela putorius",
"black-footed ferret, ferret, Mustela nigripes",
"otter",
"skunk, polecat, wood pussy",
"badger",
"armadillo",
"three-toed sloth, ai, Bradypus tridactylus",
"orangutan, orang, orangutang, Pongo pygmaeus",
"gorilla, Gorilla gorilla",
"chimpanzee, chimp, Pan troglodytes",
"gibbon, Hylobates lar",
"siamang, Hylobates syndactylus, Symphalangus syndactylus",
"guenon, guenon monkey",
"patas, hussar monkey, Erythrocebus patas",
"baboon",
"macaque",
"langur",
"colobus, colobus monkey",
"proboscis monkey, Nasalis larvatus",
"marmoset",
"capuchin, ringtail, Cebus capucinus",
"howler monkey, howler",
"titi, titi monkey",
"spider monkey, Ateles geoffroyi",
"squirrel monkey, Saimiri sciureus",
"Madagascar cat, ring-tailed lemur, Lemur catta",
"indri, indris, Indri indri, Indri brevicaudatus",
"Indian elephant, Elephas maximus",
"African elephant, Loxodonta africana",
"lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
"giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
"barracouta, snoek",
"eel",
"coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", # noqa: E501
"rock beauty, Holocanthus tricolor",
"anemone fish",
"sturgeon",
"gar, garfish, garpike, billfish, Lepisosteus osseus",
"lionfish",
"puffer, pufferfish, blowfish, globefish",
"abacus",
"abaya",
"academic gown, academic robe, judge's robe",
"accordion, piano accordion, squeeze box",
"acoustic guitar",
"aircraft carrier, carrier, flattop, attack aircraft carrier",
"airliner",
"airship, dirigible",
"altar",
"ambulance",
"amphibian, amphibious vehicle",
"analog clock",
"apiary, bee house",
"apron",
"ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", # noqa: E501
"assault rifle, assault gun",
"backpack, back pack, knapsack, packsack, rucksack, haversack",
"bakery, bakeshop, bakehouse",
"balance beam, beam",
"balloon",
"ballpoint, ballpoint pen, ballpen, Biro",
"Band Aid",
"banjo",
"bannister, banister, balustrade, balusters, handrail",
"barbell",
"barber chair",
"barbershop",
"barn",
"barometer",
"barrel, cask",
"barrow, garden cart, lawn cart, wheelbarrow",
"baseball",
"basketball",
"bassinet",
"bassoon",
"bathing cap, swimming cap",
"bath towel",
"bathtub, bathing tub, bath, tub",
"beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", # noqa: E501
"beacon, lighthouse, beacon light, pharos",
"beaker",
"bearskin, busby, shako",
"beer bottle",
"beer glass",
"bell cote, bell cot",
"bib",
"bicycle-built-for-two, tandem bicycle, tandem",
"bikini, two-piece",
"binder, ring-binder",
"binoculars, field glasses, opera glasses",
"birdhouse",
"boathouse",
"bobsled, bobsleigh, bob",
"bolo tie, bolo, bola tie, bola",
"bonnet, poke bonnet",
"bookcase",
"bookshop, bookstore, bookstall",
"bottlecap",
"bow",
"bow tie, bow-tie, bowtie",
"brass, memorial tablet, plaque",
"brassiere, bra, bandeau",
"breakwater, groin, groyne, mole, bulwark, seawall, jetty",
"breastplate, aegis, egis",
"broom",
"bucket, pail",
"buckle",
"bulletproof vest",
"bullet train, bullet",
"butcher shop, meat market",
"cab, hack, taxi, taxicab",
"caldron, cauldron",
"candle, taper, wax light",
"cannon",
"canoe",
"can opener, tin opener",
"cardigan",
"car mirror",
"carousel, carrousel, merry-go-round, roundabout, whirligig",
"carpenter's kit, tool kit",
"carton",
"car wheel",
"cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", # noqa: E501
"cassette",
"cassette player",
"castle",
"catamaran",
"CD player",
"cello, violoncello",
"cellular telephone, cellular phone, cellphone, cell, mobile phone",
"chain",
"chainlink fence",
"chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", # noqa: E501
"chain saw, chainsaw",
"chest",
"chiffonier, commode",
"chime, bell, gong",
"china cabinet, china closet",
"Christmas stocking",
"church, church building",
"cinema, movie theater, movie theatre, movie house, picture palace",
"cleaver, meat cleaver, chopper",
"cliff dwelling",
"cloak",
"clog, geta, patten, sabot",
"cocktail shaker",
"coffee mug",
"coffeepot",
"coil, spiral, volute, whorl, helix",
"combination lock",
"computer keyboard, keypad",
"confectionery, confectionary, candy store",
"container ship, containership, container vessel",
"convertible",
"corkscrew, bottle screw",
"cornet, horn, trumpet, trump",
"cowboy boot",
"cowboy hat, ten-gallon hat",
"cradle",
"crane",
"crash helmet",
"crate",
"crib, cot",
"Crock Pot",
"croquet ball",
"crutch",
"cuirass",
"dam, dike, dyke",
"desk",
"desktop computer",
"dial telephone, dial phone",
"diaper, nappy, napkin",
"digital clock",
"digital watch",
"dining table, board",
"dishrag, dishcloth",
"dishwasher, dish washer, dishwashing machine",
"disk brake, disc brake",
"dock, dockage, docking facility",
"dogsled, dog sled, dog sleigh",
"dome",
"doormat, welcome mat",
"drilling platform, offshore rig",
"drum, membranophone, tympan",
"drumstick",
"dumbbell",
"Dutch oven",
"electric fan, blower",
"electric guitar",
"electric locomotive",
"entertainment center",
"envelope",
"espresso maker",
"face powder",
"feather boa, boa",
"file, file cabinet, filing cabinet",
"fireboat",
"fire engine, fire truck",
"fire screen, fireguard",
"flagpole, flagstaff",
"flute, transverse flute",
"folding chair",
"football helmet",
"forklift",
"fountain",
"fountain pen",
"four-poster",
"freight car",
"French horn, horn",
"frying pan, frypan, skillet",
"fur coat",
"garbage truck, dustcart",
"gasmask, respirator, gas helmet",
"gas pump, gasoline pump, petrol pump, island dispenser",
"goblet",
"go-kart",
"golf ball",
"golfcart, golf cart",
"gondola",
"gong, tam-tam",
"gown",
"grand piano, grand",
"greenhouse, nursery, glasshouse",
"grille, radiator grille",
"grocery store, grocery, food market, market",
"guillotine",
"hair slide",
"hair spray",
"half track",
"hammer",
"hamper",
"hand blower, blow dryer, blow drier, hair dryer, hair drier",
"hand-held computer, hand-held microcomputer",
"handkerchief, hankie, hanky, hankey",
"hard disc, hard disk, fixed disk",
"harmonica, mouth organ, harp, mouth harp",
"harp",
"harvester, reaper",
"hatchet",
"holster",
"home theater, home theatre",
"honeycomb",
"hook, claw",
"hoopskirt, crinoline",
"horizontal bar, high bar",
"horse cart, horse-cart",
"hourglass",
"iPod",
"iron, smoothing iron",
"jack-o'-lantern",
"jean, blue jean, denim",
"jeep, landrover",
"jersey, T-shirt, tee shirt",
"jigsaw puzzle",
"jinrikisha, ricksha, rickshaw",
"joystick",
"kimono",
"knee pad",
"knot",
"lab coat, laboratory coat",
"ladle",
"lampshade, lamp shade",
"laptop, laptop computer",
"lawn mower, mower",
"lens cap, lens cover",
"letter opener, paper knife, paperknife",
"library",
"lifeboat",
"lighter, light, igniter, ignitor",
"limousine, limo",
"liner, ocean liner",
"lipstick, lip rouge",
"Loafer",
"lotion",
"loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", # noqa: E501
"loupe, jeweler's loupe",
"lumbermill, sawmill",
"magnetic compass",
"mailbag, postbag",
"mailbox, letter box",
"maillot",
"maillot, tank suit",
"manhole cover",
"maraca",
"marimba, xylophone",
"mask",
"matchstick",
"maypole",
"maze, labyrinth",
"measuring cup",
"medicine chest, medicine cabinet",
"megalith, megalithic structure",
"microphone, mike",
"microwave, microwave oven",
"military uniform",
"milk can",
"minibus",
"miniskirt, mini",
"minivan",
"missile",
"mitten",
"mixing bowl",
"mobile home, manufactured home",
"Model T",
"modem",
"monastery",
"monitor",
"moped",
"mortar",
"mortarboard",
"mosque",
"mosquito net",
"motor scooter, scooter",
"mountain bike, all-terrain bike, off-roader",
"mountain tent",
"mouse, computer mouse",
"mousetrap",
"moving van",
"muzzle",
"nail",
"neck brace",
"necklace",
"nipple",
"notebook, notebook computer",
"obelisk",
"oboe, hautboy, hautbois",
"ocarina, sweet potato",
"odometer, hodometer, mileometer, milometer",
"oil filter",
"organ, pipe organ",
"oscilloscope, scope, cathode-ray oscilloscope, CRO",
"overskirt",
"oxcart",
"oxygen mask",
"packet",
"paddle, boat paddle",
"paddlewheel, paddle wheel",
"padlock",
"paintbrush",
"pajama, pyjama, pj's, jammies",
"palace",
"panpipe, pandean pipe, syrinx",
"paper towel",
"parachute, chute",
"parallel bars, bars",
"park bench",
"parking meter",
"passenger car, coach, carriage",
"patio, terrace",
"pay-phone, pay-station",
"pedestal, plinth, footstall",
"pencil box, pencil case",
"pencil sharpener",
"perfume, essence",
"Petri dish",
"photocopier",
"pick, plectrum, plectron",
"pickelhaube",
"picket fence, paling",
"pickup, pickup truck",
"pier",
"piggy bank, penny bank",
"pill bottle",
"pillow",
"ping-pong ball",
"pinwheel",
"pirate, pirate ship",
"pitcher, ewer",
"plane, carpenter's plane, woodworking plane",
"planetarium",
"plastic bag",
"plate rack",
"plow, plough",
"plunger, plumber's helper",
"Polaroid camera, Polaroid Land camera",
"pole",
"police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", # noqa: E501
"poncho",
"pool table, billiard table, snooker table",
"pop bottle, soda bottle",
"pot, flowerpot",
"potter's wheel",
"power drill",
"prayer rug, prayer mat",
"printer",
"prison, prison house",
"projectile, missile",
"projector",
"puck, hockey puck",
"punching bag, punch bag, punching ball, punchball",
"purse",
"quill, quill pen",
"quilt, comforter, comfort, puff",
"racer, race car, racing car",
"racket, racquet",
"radiator",
"radio, wireless",
"radio telescope, radio reflector",
"rain barrel",
"recreational vehicle, RV, R.V.",
"reel",
"reflex camera",
"refrigerator, icebox",
"remote control, remote",
"restaurant, eating house, eating place, eatery",
"revolver, six-gun, six-shooter",
"rifle",
"rocking chair, rocker",
"rotisserie",
"rubber eraser, rubber, pencil eraser",
"rugby ball",
"rule, ruler",
"running shoe",
"safe",
"safety pin",
"saltshaker, salt shaker",
"sandal",
"sarong",
"sax, saxophone",
"scabbard",
"scale, weighing machine",
"school bus",
"schooner",
"scoreboard",
"screen, CRT screen",
"screw",
"screwdriver",
"seat belt, seatbelt",
"sewing machine",
"shield, buckler",
"shoe shop, shoe-shop, shoe store",
"shoji",
"shopping basket",
"shopping cart",
"shovel",
"shower cap",
"shower curtain",
"ski",
"ski mask",
"sleeping bag",
"slide rule, slipstick",
"sliding door",
"slot, one-armed bandit",
"snorkel",
"snowmobile",
"snowplow, snowplough",
"soap dispenser",
"soccer ball",
"sock",
"solar dish, solar collector, solar furnace",
"sombrero",
"soup bowl",
"space bar",
"space heater",
"space shuttle",
"spatula",
"speedboat",
"spider web, spider's web",
"spindle",
"sports car, sport car",
"spotlight, spot",
"stage",
"steam locomotive",
"steel arch bridge",
"steel drum",
"stethoscope",
"stole",
"stone wall",
"stopwatch, stop watch",
"stove",
"strainer",
"streetcar, tram, tramcar, trolley, trolley car",
"stretcher",
"studio couch, day bed",
"stupa, tope",
"submarine, pigboat, sub, U-boat",
"suit, suit of clothes",
"sundial",
"sunglass",
"sunglasses, dark glasses, shades",
"sunscreen, sunblock, sun blocker",
"suspension bridge",
"swab, swob, mop",
"sweatshirt",
"swimming trunks, bathing trunks",
"swing",
"switch, electric switch, electrical switch",
"syringe",
"table lamp",
"tank, army tank, armored combat vehicle, armoured combat vehicle",
"tape player",
"teapot",
"teddy, teddy bear",
"television, television system",
"tennis ball",
"thatch, thatched roof",
"theater curtain, theatre curtain",
"thimble",
"thresher, thrasher, threshing machine",
"throne",
"tile roof",
"toaster",
"tobacco shop, tobacconist shop, tobacconist",
"toilet seat",
"torch",
"totem pole",
"tow truck, tow car, wrecker",
"toyshop",
"tractor",
"trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", # noqa: E501
"tray",
"trench coat",
"tricycle, trike, velocipede",
"trimaran",
"tripod",
"triumphal arch",
"trolleybus, trolley coach, trackless trolley",
"trombone",
"tub, vat",
"turnstile",
"typewriter keyboard",
"umbrella",
"unicycle, monocycle",
"upright, upright piano",
"vacuum, vacuum cleaner",
"vase",
"vault",
"velvet",
"vending machine",
"vestment",
"viaduct",
"violin, fiddle",
"volleyball",
"waffle iron",
"wall clock",
"wallet, billfold, notecase, pocketbook",
"wardrobe, closet, press",
"warplane, military plane",
"washbasin, handbasin, washbowl, lavabo, wash-hand basin",
"washer, automatic washer, washing machine",
"water bottle",
"water jug",
"water tower",
"whiskey jug",
"whistle",
"wig",
"window screen",
"window shade",
"Windsor tie",
"wine bottle",
"wing",
"wok",
"wooden spoon",
"wool, woolen, woollen",
"worm fence, snake fence, snake-rail fence, Virginia fence",
"wreck",
"yawl",
"yurt",
"web site, website, internet site, site",
"comic book",
"crossword puzzle, crossword",
"street sign",
"traffic light, traffic signal, stoplight",
"book jacket, dust cover, dust jacket, dust wrapper",
"menu",
"plate",
"guacamole",
"consomme",
"hot pot, hotpot",
"trifle",
"ice cream, icecream",
"ice lolly, lolly, lollipop, popsicle",
"French loaf",
"bagel, beigel",
"pretzel",
"cheeseburger",
"hotdog, hot dog, red hot",
"mashed potato",
"head cabbage",
"broccoli",
"cauliflower",
"zucchini, courgette",
"spaghetti squash",
"acorn squash",
"butternut squash",
"cucumber, cuke",
"artichoke, globe artichoke",
"bell pepper",
"cardoon",
"mushroom",
"Granny Smith",
"strawberry",
"orange",
"lemon",
"fig",
"pineapple, ananas",
"banana",
"jackfruit, jak, jack",
"custard apple",
"pomegranate",
"hay",
"carbonara",
"chocolate sauce, chocolate syrup",
"dough",
"meat loaf, meatloaf",
"pizza, pizza pie",
"potpie",
"burrito",
"red wine",
"espresso",
"cup",
"eggnog",
"alp",
"bubble",
"cliff, drop, drop-off",
"coral reef",
"geyser",
"lakeside, lakeshore",
"promontory, headland, head, foreland",
"sandbar, sand bar",
"seashore, coast, seacoast, sea-coast",
"valley, vale",
"volcano",
"ballplayer, baseball player",
"groom, bridegroom",
"scuba diver",
"rapeseed",
"daisy",
"yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", # noqa: E501
"corn",
"acorn",
"hip, rose hip, rosehip",
"buckeye, horse chestnut, conker",
"coral fungus",
"agaric",
"gyromitra",
"stinkhorn, carrion fungus",
"earthstar",
"hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", # noqa: E501
"bolete",
"ear, spike, capitulum",
"toilet tissue, toilet paper, bathroom tissue",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .activation import build_activation
from .cross_entropy import ParallelCrossEntropyLoss
from .embedding import Embedding, SinePositionalEmbedding, VocabEmbedding, PatchEmbedding
from .layer_norm import LayerNorm, RMSLayerNorm
from .linear import Linear, Linear1D
from .conv import Conv1D
from .lm_logits import LMLogits
from .mlp import MLP
from .transformer_layer import TransformerLayer
from .attention import MultiheadAttention
from .droppath import DropPath, drop_path
__all__ = [
"Embedding",
"VocabEmbedding",
"SinePositionalEmbedding",
"PatchEmbedding",
"build_activation",
"Linear",
"Linear1D",
"Conv1D",
"MLP",
"LayerNorm",
"RMSLayerNorm",
"TransformerLayer",
"MultiheadAttention",
"ParallelCrossEntropyLoss",
"LMLogits",
"drop_path",
"DropPath",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from typing import Optional
import oneflow as flow
from oneflow import nn
class Activation(str, Enum):
SquaredReLU = "squared_relu"
GeLU = "gelu"
GeLUTanh = "gelu_tanh"
LeakyReLU = "leaky_relu"
ReLU = "relu"
Tanh = "tanh"
QuickGELU = "quick_gelu"
# For unit testing / parity comparisons, probably not the fastest way
class SquaredReLU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
x_ = flow._C.relu(x)
return x_ * x_
class Passthrough(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x
class GeLUTanh(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
"""When the approximate argument is 'tanh', Gelu is estimated with:
0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0))))
"""
return flow.nn.functional.gelu(x, approximate="tanh")
class QuickGELU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x * flow.sigmoid(1.702 * x)
def build_activation(activation: Optional[Activation]):
"""
Fetch an activation layer by name, e.g.,
``build_activation("gelu")`` returns an ``nn.GELU()`` module.
"""
if not activation:
return Passthrough()
return {
Activation.ReLU: nn.ReLU,
Activation.GeLU: nn.GELU,
Activation.GeLUTanh: GeLUTanh,
Activation.LeakyReLU: nn.LeakyReLU,
Activation.SquaredReLU: SquaredReLU,
Activation.Tanh: nn.Tanh,
Activation.QuickGELU: QuickGELU,
}[activation]()
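# A minimal usage sketch (illustrative, not part of the original file). Because
# ``Activation`` mixes in ``str``, the mapping above can be indexed with either an
# enum member or its string value, and a falsy argument falls back to ``Passthrough``:
#
#   act = build_activation("quick_gelu")   # -> QuickGELU()
#   y = act(flow.randn(2, 4))              # y = x * sigmoid(1.702 * x)
#   identity = build_activation(None)      # -> Passthrough()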
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import math
from typing import Tuple
import oneflow as flow
from oneflow import nn
from .linear import Linear
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
class MultiheadAttention(nn.Module):
"""Multi-head attention layer, support self attention and cross attention.
Args:
hidden_size: size of hidden state.
num_attention_heads: number of attention heads.
is_cross_attention: used to specify whether it is self attention or cross attention.
Defaults to False.
attention_dropout_prob: dropout probability of attention weights.
Defaults to 0.0.
output_dropout_prob: dropout probability of output. Defaults to 0.0.
init_method: method to initialize the input layer weights.
Defaults to ``init.xavier_normal_``.
output_layer_init_method: method to initialize the output layer weights.
If None, use ``init_method``.
bias_dropout_fusion: whether to fuse add bias and dropout.
Defaults to False.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax.
Defaults to False.
apply_query_key_layer_scaling: if `True`, scale the attention scores by the layer index.
Defaults to False.
layer_idx: a layer_idx sign which determines the placements.
It will be used in pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
num_attention_heads,
is_cross_attention=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
if output_layer_init_method is None:
output_layer_init_method = init_method
assert (
hidden_size % num_attention_heads == 0
), "hidden_size must be divisible by num_attention_heads."
self.num_heads = num_attention_heads
self.head_size = hidden_size // num_attention_heads
self.attn_mask_type = attn_mask_type
self.attention_dropout_prob = attention_dropout_prob
self.dropout = nn.Dropout(p=attention_dropout_prob)
self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
self.coeff = None
if apply_query_key_layer_scaling:
self.coeff = layer_idx + 1
self.norm_factor /= self.coeff
self.is_cross_attention = is_cross_attention
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if self.bias_dropout_fusion:
self.output_dropout_prob = output_dropout_prob
else:
self.output_dropout = nn.Dropout(p=output_dropout_prob)
if self.is_cross_attention:
self.query = Linear(
self.hidden_size,
self.hidden_size,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.key_value = Linear(
self.hidden_size,
self.hidden_size * 2,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
else:
self.query_key_value = Linear(
self.hidden_size,
self.hidden_size * 3,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.dense = Linear(
self.hidden_size,
self.hidden_size,
parallel="row",
init_method=output_layer_init_method,
skip_bias_add=self.bias_dropout_fusion,
layer_idx=layer_idx,
)
def forward(
self,
hidden_states: flow.Tensor,
encoder_states: flow.Tensor = None,
attention_mask: flow.Tensor = None,
past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
use_cache: bool = False,
):
"""
Args:
hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
Defaults to None.
attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
It should be the combination of padding mask and causal mask.
For self-attention in the encoder it is the padding mask of the source input;
for self-attention in the decoder it is the combination of the target padding mask
and the causal mask; for cross-attention in the decoder it is the padding mask of
the source input.
Defaults to None.
past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
use_cache (bool, optional): set to True when the model is in the inference phase and
used for incremental decoding. Defaults to False.
"""
# hidden_states, encoder_states: [S(0), B]
# attention_mask: [S(0), B]
if encoder_states is not None:
encoder_states = encoder_states.to_global(placement=hidden_states.placement)
if attention_mask is not None:
attention_mask = attention_mask.to_global(placement=hidden_states.placement)
bsz, tgt_len = hidden_states.size()[:2]
if self.is_cross_attention:
# if it is cross attention, key and value should be calculated only once, and the
# result can be reused.
query = self.query(hidden_states)
query = query.view(bsz, -1, self.num_heads, self.head_size)
query = query.permute(0, 2, 1, 3)
if past_key_value is not None:
key, value = past_key_value
elif encoder_states is not None:
key_value = self.key_value(encoder_states)
key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size)
key_value = key_value.permute(0, 2, 1, 3)
key, value = flow.chunk(key_value, chunks=2, dim=-1)
else:
raise ValueError(
"past_key_value and encoder_states cannot be None at the same time."
)
else:
# if it is self attention, query, key, and value are all obtained from hidden_states.
# when in the inference phase of an incremental decoder,
# hidden_states is the last-added state,
# the full key and value could be obtained by concatenating with past_key_value.
query_key_value = self.query_key_value(hidden_states)
query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
query_key_value = query_key_value.permute(
0, 2, 1, 3
) # [bsz, num_heads, src_len, 3 * head_size]
query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
if past_key_value is not None:
past_key, past_value = past_key_value
key = flow.cat((past_key.type_as(key), key), dim=2)
value = flow.cat((past_value.type_as(value), value), dim=2)
# query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size]
if use_cache:
past_key_value = (key, value)
# [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)]
attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor)
# [S(0), S(1)] x [S(0), B] = [S(0), S(1)]
if attention_mask is not None:
if self.scale_mask_softmax_fusion:
if self.attn_mask_type == AttnMaskType.padding:
attention_mask = (
attention_mask.expand_as(attention_scores) if use_cache else attention_mask
)
attention_weights = flow._C.fused_scale_mask_softmax_dropout(
attention_scores,
attention_mask,
fill_value=-10000.0,
scale=self.coeff,
p=self.attention_dropout_prob,
)[0]
else:
if self.coeff is not None:
attention_scores *= self.coeff
attention_scores = flow.mul(attention_scores, attention_mask)
attention_scores = attention_scores - 10000.0 * (1 - attention_mask)
# TODO(xingyu.liao): graph mode raises `where_scalar` errors
# when using `masked_fill`
# attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0)
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
else:
if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal:
attention_weights = flow._C.fused_scale_tril_softmax_mask_scale(
attention_scores,
p=self.attention_dropout_prob,
diagonal=0,
tril_scale_value=self.coeff,
tril_fill_value=-10000.0,
)[0]
else:
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
# Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)]
context = flow.matmul(attention_weights, value)
# Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size]
context = context.transpose(1, 2)
# Concat multi-head results from
# [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size]
# SBP sign: [S(0), S(2)]
# [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B]
output = self.dense(context.flatten(2))
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.output_dropout(output)
if use_cache:
output = (output, past_key_value)
return output
def extra_repr(self) -> str:
return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
self.hidden_size,
self.num_heads,
self.is_cross_attention,
)
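# Shape walkthrough (an illustrative sketch assuming hidden_size=768, num_attention_heads=12,
# self-attention without cache):
#   hidden_states:                         [bsz, tgt_len, 768]
#   query_key_value(hidden_states):        [bsz, tgt_len, 3 * 768]
#   view + permute:                        [bsz, 12, tgt_len, 3 * 64], chunked into q, k, v
#   attention_scores = q @ k^T / sqrt(64): [bsz, 12, tgt_len, tgt_len]
#   context = softmax(scores) @ v:         [bsz, 12, tgt_len, 64]
#   context.transpose(1, 2).flatten(2):    [bsz, tgt_len, 768] -> dense -> output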
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Conv1D(nn.Module):
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
dtype=flow.float32,
*,
layer_idx=0,
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "row":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only support ('data', 'row' and 'col')")
self.weight = flow.nn.Parameter(
flow.empty(
(in_features, out_features),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])):
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
):
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight)
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight)
else:
x = flow.matmul(x, self.weight)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
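# Parallelism note (an informal summary of the SBP cases above, not original code):
#   parallel="col":  the weight is split along out_features, so each rank produces a
#                    slice of the output features; x is made broadcast before the matmul.
#   parallel="row":  the weight is split along in_features, so x is split along its last
#                    dim and the partial matmul results are reduced back to a broadcast output.
#   parallel="data": weight and bias are broadcast; only the batch dimension is sharded.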
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
class ParallelCrossEntropyLoss(nn.Module):
"""This criterion acts like :class:`~flow.nn.CrossEntropyLoss` except it will
execute the distributed cross entropy loss computation across different GPUs.
"""
def forward(self, logits: flow.Tensor, target: flow.Tensor):
"""Function for the distributed cross entropy.
Args:
logits (flow.Tensor): vocab_parallel_logits with shape
(batch_size, seq_length, vocab_size) and sbp signature is [S(0), S(2)].
target (flow.Tensor): target with shape (batch_size, seq_length) and
sbp signature is [S(0), B].
"""
assert logits.ndim == 3
assert target.ndim == 2
assert logits.shape[0:2] == target.shape
target = target.to_global(placement=logits.placement)
# Change -1 in target to 0 because sparse_softmax_cross_entropy does not accept -1
target = target * (target >= 0)
lm_loss = flow._C.sparse_softmax_cross_entropy(
logits.view(-1, logits.shape[-1]),
target.view(-1),
)
return lm_loss
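# Worked example (hypothetical shapes, for illustration only): with logits of shape
# (2, 3, vocab_size) and target of shape (2, 3), the loss is computed per token after
# flattening logits to (6, vocab_size) and target to (6,). Padding positions marked
# with -1 are clamped to 0 by `target * (target >= 0)` before the kernel call, and are
# typically masked out by the caller when reducing the returned per-token loss.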
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
def drop_path(x, drop_prob: float = 0.5, training: bool = False, scale_by_keep: bool = True):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
if drop_prob == 0.0 or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
# similar operation to new_tensor(shape).bernoulli_(keep_prob)
random_tensor = flow.rand(*shape, dtype=x.dtype, sbp=x.sbp, placement=x.placement)
random_tensor = (random_tensor < keep_prob).to(flow.float32)
if keep_prob > 0.0 and scale_by_keep:
random_tensor = random_tensor / keep_prob
return x * random_tensor
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
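# Numeric sketch (illustrative only): with drop_prob=0.25 in training mode, each sample's
# residual branch is kept with probability 0.75 and, when kept, scaled by 1 / 0.75, so the
# output matches evaluation mode in expectation:
#   E[x * random_tensor] = x * (0.75 * (1 / 0.75) + 0.25 * 0) = x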
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.utils import distributed as dist
class Embedding(nn.Module):
"""Construct the trainable embedding module, which does not support parallelization.
This can be used for positional embedding and token type embedding.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
dtype=flow.float32,
layer_idx=0,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
assert num_embeddings > 0
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
self.init_method(self.weight)
# FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# embeddings with sbp sign: [B, B]
# [B, B] x [S(0), B] --> [S(0), B]
# ↑ ↑ ↑
# embed pos_ids pos_embed
input_embeds = flow._C.gather(weight, input_ids, axis=0)
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class VocabEmbedding(nn.Module):
"""Construct the word embeddings, which may be split along vocabulary dimension.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
# Word token embedding shape with (vocab_size, hidden_size)
# sbp: [B, S(0)]
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
# Initialize the word embedding
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
self.init_method(self.weight)
# FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# input_ids with shape (batch_size, seq_len), and sbp sign: [S(0), B]
# Gather forward sbp sign
# [B, S(0)] x [S(0), B] --> [S(0), P]
# ↑ ↑ ↑
# embed input_ids input_embeds
input_embeds = flow._C.gather(weight, input_ids, axis=0)
# Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class SinePositionalEmbedding(nn.Module):
"""Construct the sinusoidal positional embeddings.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
"""
def __init__(self, num_embeddings, embedding_dim):
super().__init__()
self.embedding_dim = embedding_dim
self.num_embeddings = num_embeddings
position_embedding = flow.zeros(
num_embeddings,
embedding_dim,
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
position = flow._C.global_arange(
start=0,
end=num_embeddings,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
).unsqueeze(1)
position_range = flow._C.global_arange(
start=0,
end=embedding_dim,
step=2,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
)
div_term = flow.exp(position_range * (-math.log(10000.0) / embedding_dim))
position_embedding[:, 0::2] = flow.sin(position * div_term)
position_embedding[:, 1::2] = flow.cos(position * div_term)
self.register_buffer("position_embedding", position_embedding)
def forward(self, position_ids):
position_embeds = flow._C.gather(self.position_embedding, position_ids, axis=0)
return position_embeds
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
return s.format(**self.__dict__)
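# For reference, the buffer built above follows the standard sinusoidal encoding
# (restated here, not original code):
#   PE[pos, 2i]   = sin(pos / 10000^(2i / embedding_dim))
#   PE[pos, 2i+1] = cos(pos / 10000^(2i / embedding_dim))
# where `div_term` equals exp(-2i * ln(10000) / embedding_dim) = 10000^(-2i / embedding_dim).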
class PatchEmbedding(nn.Module):
"""2D Image to Patch Embedding
Arguments:
img_size: size of the input image. Defaults to 224.
patch_size: embedded patch size. Defaults to 16.
in_chans: number of input channels. Defaults to 3.
embed_dim: dimension of the embedded patch. Defaults to 768.
norm_layer: normalization layer applied to the patch embedding, or None to skip it.
Defaults to None.
flatten: flatten the patch embedding or keep the 2-D shape. Defaults to True.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
*,
layer_idx=0,
):
super().__init__()
img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
patch_size = patch_size if isinstance(patch_size, tuple) else (patch_size, patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
B, C, H, W = x.shape
assert (
H == self.img_size[0]
), f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
assert (
W == self.img_size[1]
), f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
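# Shape sketch (assuming the defaults img_size=224, patch_size=16, embed_dim=768):
#   grid_size   = (224 // 16, 224 // 16) = (14, 14)
#   num_patches = 14 * 14 = 196
#   forward: [B, 3, 224, 224] -> proj -> [B, 768, 14, 14]
#            -> flatten(2).transpose(1, 2) -> [B, 196, 768]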
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: shape of the trailing dimensions over which normalization is applied,
given as an int or a tuple of ints.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
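# Illustrative usage sketch (assumes the distributed environment is already configured):
# LayerNorm keeps its affine parameters broadcast over both parallel dimensions, so it can be
# placed on any pipeline stage via `layer_idx`.
#
#     norm = LayerNorm(768, eps=1e-5, layer_idx=0)
#     # hidden_states: global tensor of shape (batch_size, seq_length, 768)
#     normed = norm(hidden_states)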
class RMSLayerNorm(nn.Module):
"""T5 uses a layer_norm which only scales and doesn't shift, which is also known as
Root Mean Square Layer Normalization thus varience is calculated w/o mean and
there is no bias. More details see: https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon)
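# Illustrative note: `flow._C.rms_norm` is assumed here to implement RMSNorm as described in
# https://arxiv.org/abs/1910.07467, i.e. scaling only, with no mean subtraction and no bias:
#
#     rms = sqrt(mean(hidden_states ** 2, dim=-1, keepdim=True) + eps)
#     out = weight * hidden_states / rms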
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
    In row parallelism, A^T is parallelized along the first dimension and x along its second
    dimension as:
.. math::
A^T = \begin{bmatrix}
A\_1 \\
. \\
. \\
. \\
A\_p
\end{bmatrix}
x = \begin{bmatrix}
x\_1 & ... & x\_p
\end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
        parallel: parallel mode, one of ``"data"``, ``"col"`` or ``"row"``. Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
dtype: the dtype of weight. Defaults to ``flow.float32``
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
dtype=flow.float32,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
            # weight sbp sign: [B, S(0)]; the weight is transposed when performing matmul,
            # so the effective weight sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
            # weight sbp sign: [B, S(1)]; the weight is transposed when performing matmul,
            # so the effective weight sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
            raise KeyError(f"{parallel} is not supported! Only 'data', 'row' and 'col' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=dtype,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
            # Unsupported weight sbp: let the sbp be deduced and communicated with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Give an alias for Linear1d
Linear = Linear1D
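# Illustrative usage sketch (assumes the distributed environment is already set up): a typical
# 1D tensor-parallel pattern is a column-parallel Linear followed by a row-parallel Linear, so
# the intermediate activation stays sharded and only the final output is reduced. This is the
# same wiring used by the MLP module later in this commit.
#
#     fc1 = Linear(1024, 4096, parallel="col", layer_idx=0)
#     fc2 = Linear(4096, 1024, parallel="row", layer_idx=0)
#     # x: global tensor of shape (batch_size, seq_length, 1024) with sbp (S(0), B)
#     y = fc2(fc1(x))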
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
        # When using input.grad as the head node for the backward pass, its sbp sign
        # needs to be converted from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
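# Illustrative note: LMLogits implements weight tying -- the output projection reuses the
# (vocab_size, hidden_size) word embedding matrix, so
#     logits = hidden_states @ word_embeddings^T   # (batch_size, seq_length, vocab_size)
# A usage sketch, assuming a model that exposes its embedding weight like the GPT model below:
#
#     lm_head = LMLogits(vocab_size, bias=False)
#     logits = lm_head(hidden_states, model.embeddings.token_embeddings.weight)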
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
    The MLP takes an input with hidden size h, projects it to the intermediate
    hidden dimension (ffn_hidden_size), applies a GELU transformation, and projects the
    state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
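# Illustrative usage sketch (assumes the distributed environment is already configured): with
# both fusion flags disabled, the forward pass reduces to
#     dropout(dense_4h_to_h(gelu(dense_h_to_4h(x))))
#
#     mlp = MLP(hidden_size=768, ffn_hidden_size=3072, output_dropout_prob=0.1, layer_idx=0)
#     # hidden_states: global tensor of shape (batch_size, seq_length, 768), sbp (S(0), B)
#     out = mlp(hidden_states)  # same shape as the input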
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
Transformer layer takes input with size [bsz, seq_length, hidden size] and returns an
output of the same size.
    The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
        ffn_hidden_size: size of the feed-forward neural network.
        num_attention_heads: number of attention heads.
        is_decoder: used to specify whether this is a transformer encoder layer or a transformer
            decoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
        apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
            Default: ``False``.
        apply_residual_post_layernorm: if ``True``, use the original BERT residual
            connection ordering. Otherwise, use the Megatron-style BERT residual connection,
            which is more stable when scaling model size, introduced in
            https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
            attention_mask: the combination of the key padding mask and the causal mask of hidden
                states with shape (batch_size, 1, seq_length, seq_length) and the sbp
                signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
            past_key_value: tuple of key and value, each of shape
                (seq_length, bsz, num_heads, head_size). For a decoder layer,
                past_key_value contains the states from both self attention
                and cross attention.
use_cache: it will be set to `True` when the model is in the inference phase and
used for incremental decoding.
"""
        # Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
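# Illustrative usage sketch (assumes the distributed environment is already configured): a
# stack is built by giving each layer its own `layer_idx`, which drives its pipeline-parallel
# placement; the GPT `Transformer` module later in this commit follows the same pattern.
#
#     layers = nn.ModuleList(
#         [TransformerLayer(768, 3072, num_attention_heads=12, layer_idx=i) for i in range(12)]
#     )
#     for layer in layers:
#         hidden_states = layer(hidden_states, attention_mask)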
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bert_model import BertForPreTraining, BertModel, BertForClassification
from .roberta_model import RobertaForPreTraining, RobertaForCausalLM, RobertaModel
from .build import build_graph, build_model
from .t5_model import T5ForPreTraining, T5Model
from .gpt_model import GPTForPreTraining, GPTModel
from .vision_transformer import VisionTransformer
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
from .resmlp import ResMLP
__all__ = [
"build_model",
"build_graph",
"BertModel",
"BertForPreTraining",
"BertForClassification",
"RobertaModel",
"RobertaForCausalLM",
"RobertaForPreTraining",
"T5Model",
"T5ForPreTraining",
"GPTModel",
"GPTForPreTraining",
"VisionTransformer",
"SwinTransformer",
"SwinTransformerV2",
"ResMLP",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class BertExtendedAttnMask(nn.Module):
def forward(self, attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
return extended_attention_mask
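# Illustrative note: for a single sequence with padding mask [1, 1, 0], the outer product
# above yields the 2D mask
#     [[1, 1, 0],
#      [1, 1, 0],
#      [0, 0, 0]]
# which is then unsqueezed to shape [batch_size, 1, seq_length, seq_length] so it broadcasts
# over the attention heads.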
class BertEmbeddings(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.vocab_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_sequence_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
        # NOTE(l1aoxingyu): Set position_ids sbp sign to [B, B] initially, because position_ids is a
        # 1D tensor from 0 to seq_length; if it were set to [S(0), B] at first, position_ids
        # would be split along the first dim of the hierarchy.
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
self.position_ids.size(),
dtype=flow.long,
sbp=self.position_ids.sbp,
placement=self.position_ids.placement,
)
else:
self.tokentype_embeddings = None
self.embedding_dropout = nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
# Change position_ids sbp sign: [B, B] -> [S(0), B]
position_ids = (
self.position_ids[:, :seq_length].expand_as(input_ids).to_global(sbp=input_ids.sbp)
)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def word_embeddings(self):
return self.vocab_embeddings.weight
class BertLMPredictionHead(nn.Module):
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), layer_idx=-1)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
grad_sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(2)])
)
# NOTE(l1aoxingyu): hidden_states shape is [B, S, H] whose sbp sign: [S(0), S(2)]
# Change from [S(0), S(2)] -> [S(0), B] because layernorm cannot get inputs with sbp S(2)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
return hidden_states
class BertPooler(nn.Module):
"""Pooler layer.
Pool hidden states of the first token and
add a linear transformation followed by a tanh.
Args:
hidden_size: hidden state feature dimension
"""
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="col",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("tanh")
def forward(self, hidden_states):
"""Just "pool" the model by simply taking the [CLS] token corresponding
to the first token."""
# hidden_states: [bsz, seq_len, hidden_size]
select_token_tensor = hidden_states[:, 0, :]
pooled_output = self.dense(select_token_tensor)
pooled_output = self.activation_func(pooled_output)
return pooled_output
class BertLoss(nn.Module):
def __init__(self, add_binary_head):
super().__init__()
self.add_binary_head = add_binary_head
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask, binary_logits, ns_labels):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
binary_logits = binary_logits.to_global(placement=lm_output.placement)
ns_labels = ns_labels.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = (
loss_mask.sum().to_global(sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
+ 1e-7
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
# NOTE(l1aoxingyu): Change lm loss sbp sign [P, P] -> [P, B] to add with sop loss
# whose sbp sign: [P, B]
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
if self.add_binary_head:
sop_loss = flow._C.cross_entropy(
binary_logits, ns_labels, ignore_index=-1, reduction="none"
).mean()
loss_dict["sop_loss"] = sop_loss
return loss_dict
class BertModel(nn.Module):
"""The bare Bert Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
hidden_dropout_prob (float, optional):
The dropout ratio for the output for each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
        max_position_embeddings (int):
            Max sequence length of input, which defines the shape of the position embeddings
            in ``BertEmbeddings``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not averaging or pooling the sequence of hidden-states for the
whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
            If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
        apply_residual_post_layernorm (bool, optional):
            If set to ``True``, use the original BERT residual connection ordering; otherwise,
            use the Megatron-style BERT residual connection, which is more stable when scaling
            model size, introduced in https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
        amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weight in the BERT model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
# Embeddings
self.embeddings = BertEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = BertExtendedAttnMask()
# Encoders
self.encoders = nn.ModuleList(
[
TransformerLayer(
hidden_size,
intermediate_size,
num_attention_heads,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
init_method=init_method,
output_layer_init_method=scaled_init_method,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding, # bert mask type
layer_idx=i,
)
for i in range(hidden_layers)
]
)
self.final_layernorm = LayerNorm((hidden_size,), eps=layernorm_eps, layer_idx=-1)
self.pooler = BertPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids, attention_mask, tokentype_ids=None):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention
on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first and
second portions of the inputs. Indices are selected in `[0, 1]`. Defaults to None.
"""
extended_attention_mask = self.extended_attn_mask(attention_mask)
embedding_output = self.embeddings(input_ids, tokentype_ids)
hidden_states = embedding_output
for layer in self.encoders:
hidden_states = layer(hidden_states, extended_attention_mask)
encoder_output = self.final_layernorm(hidden_states)
pooled_output = self.pooler(encoder_output) if self.pooler is not None else None
return encoder_output, pooled_output
def word_embeddings_weight(self):
return self.embeddings.word_embeddings()
class BertPreTrainingHeads(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, add_binary_head=True):
super().__init__()
self.predictions = BertLMPredictionHead(hidden_size, init_method)
self.seq_relationship = Linear(
hidden_size,
2,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.lm_logits = LMLogits(vocab_size, bias=True)
self.loss_func = BertLoss(add_binary_head)
def forward(
self,
sequence_output,
pooled_output,
word_embeddings_weight,
ns_labels,
lm_labels,
loss_mask,
):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
prediction_scores = self.lm_logits(prediction_scores, word_embeddings_weight)
if lm_labels is not None:
return self.loss_func(
prediction_scores, lm_labels, loss_mask, seq_relationship_score, ns_labels
)
return {
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
class BertForPreTraining(nn.Module):
"""Bert Model with two heads on top as done during the pretraining: a
`masked language modeling` head and a `next sentence prediction (classification)` head.
"""
def __init__(self, cfg):
super().__init__()
self.bert = BertModel(cfg)
self.cls_head = BertPreTrainingHeads(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.add_binary_head,
)
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
ns_labels=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
ns_labels (flow.LongTensor, optional): Labels for computing the next sequence prediction
(classification) loss. Input should be a sequence pair (see `input_ids` docstring).
Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
sequence_output, pooled_output = outputs[:2]
return self.cls_head(
sequence_output,
pooled_output,
self.bert.word_embeddings_weight(),
ns_labels,
lm_labels,
loss_mask,
)
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.bert.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, BertEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, BertExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, BertPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, BertPreTrainingHeads):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), BertEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), BertExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), BertPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), BertPreTrainingHeads):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class BertForClassification(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.num_labels = cfg.num_labels
self.bert = BertModel(cfg)
self.classifier = Linear(
cfg.hidden_size,
cfg.num_labels,
bias=True,
parallel="row",
init_method=init_method_normal(cfg.initializer_range),
layer_idx=-1,
)
classifier_dropout = (
cfg.classifier_dropout
if cfg.classifier_dropout is not None
else cfg.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
def forward(self, input_ids, attention_mask, tokentype_ids=None, labels=None, **kwargs):
labels = labels if labels is not None else kwargs.get("ns_labels")
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
loss = loss.to_global(sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]))
return {"cls_loss": loss}
else:
return {"logits": logits}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate, try_get_key
def build_model(cfg):
"""Build the whole model architecture, defined by ``cfg.model``.
Note that it does not load any weights from ``cfg``.
"""
model = instantiate(cfg)
return model
def build_graph(cfg, model, optimizer=None, lr_scheduler=None, is_train=False):
"""Build the `nn.Graph`, defined by ``cfg.graph``."""
auto_parallel_conf = try_get_key(cfg, "graph.auto_parallel", default=None)
if is_train:
# Set train graph
assert optimizer is not None, "optimizer must be set for train graph"
assert lr_scheduler is not None, "lr_scheduler must be set for train graph"
graph = cfg.graph.train_graph
graph.model = model
graph.optimizer = optimizer
graph.lr_scheduler = lr_scheduler
graph.fp16 = try_get_key(cfg, "train.amp.enabled", default=False)
graph.activation_checkpoint = try_get_key(
cfg, "train.activation_checkpoint.enabled", default=False
)
graph.zero_optim = try_get_key(cfg, "train.zero_optimization.enabled", default=False)
graph.zero_stage = try_get_key(cfg, "train.zero_optimization.stage", default=1)
graph.grad_acc_steps = try_get_key(cfg, "train.num_accumulation_steps", default=1)
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
else:
# Set eval graph
graph = cfg.graph.eval_graph
graph.model = model
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
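# Illustrative usage sketch (assumes `cfg` follows the LazyConfig layout referenced above,
# with `cfg.model`, `cfg.graph.train_graph`, `cfg.graph.eval_graph` and the `train.*` keys):
#
#     model = build_model(cfg.model)
#     train_graph = build_graph(cfg, model, optimizer, lr_scheduler, is_train=True)
#     eval_graph = build_graph(cfg, model, is_train=False)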
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class CasualMask(nn.Module):
"""
    Create a causal mask and combine it with the padding mask.
    It is used in the GPT model and the T5 decoder.
    When used in the T5 decoder, the argument `layer_idx` should be set to the first decoder
    layer index.
"""
def __init__(self, max_positions=1024, *, layer_idx=0):
super().__init__()
self.mask = flow.tril(
flow.ones(
(max_positions, max_positions),
dtype=flow.int8,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
def forward(self, input_ids, past_length=0, attention_mask=None):
bsz, tgt_len = input_ids.size()
casual_mask = self.mask[:tgt_len, :tgt_len]
if past_length > 0:
            # in case past_key_values are used, we need to add a prefix ones mask to the causal mask
casual_mask = flow.cat(
[flow.ones(tgt_len, past_length, dtype=flow.int8), casual_mask], dim=-1
)
casual_mask = (
casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
)
casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
if attention_mask is not None:
assert attention_mask.dim() == 4, "please extend the attention mask first"
casual_mask = casual_mask * attention_mask
return casual_mask
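# Illustrative note: for tgt_len = 4 and past_length = 0, the sliced mask above is the
# lower-triangular matrix
#     [[1, 0, 0, 0],
#      [1, 1, 0, 0],
#      [1, 1, 1, 0],
#      [1, 1, 1, 1]]
# expanded to shape (batch_size, 1, tgt_len, tgt_len), so each position can only attend to
# itself and earlier positions.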
class GPTModel(nn.Module):
"""GPT-2 language model. The output of the forward method is logits.
Args:
hidden_layers (int): The number of ``TransformerLayer`` in the gpt model.
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
ffn_hidden_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
        max_seq_length (int, optional):
            Max sequence length of input, which defines the shape of the position embeddings
            in ``GPTEmbedding``. Defaults to 1024.
embedding_dropout_prob (float, optional):
The dropout ratio for the output of GPTEmbedding Layer. Defaults to 0.0.
attention_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
output_dropout_prob (float, optional):
The dropout ratio for the output for each TransformerLayer. Defaults to 0.0.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
use_scaled_init_for_output_weights (bool, optional): Defaults to ``True``.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
            If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``False``.
        apply_residual_post_layernorm (bool, optional):
            If set to ``True``, use the original BERT residual connection ordering; otherwise,
            use the Megatron-style BERT residual connection, which is more stable when scaling
            model size, introduced in https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
        amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weight in the GPT model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
hidden_layers,
vocab_size,
hidden_size,
ffn_hidden_size,
num_attention_heads,
max_seq_length=1024,
embedding_dropout_prob=0.0,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
initializer_range=0.02,
use_scaled_init_for_output_weights=True,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(sigma=initializer_range)
if use_scaled_init_for_output_weights:
output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
else:
output_layer_init_method = init_method
self.embeddings = GPTEmbedding(
vocab_size,
hidden_size,
max_seq_length,
init_method=init_method,
embedding_dropout_prob=embedding_dropout_prob,
amp_enabled=amp_enabled,
)
self.transformer = Transformer(
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
)
self.lm_head = LMLogits(vocab_size, bias=False)
@classmethod
def from_config(cls, cfg):
return {
"hidden_layers": cfg.hidden_layers,
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"ffn_hidden_size": cfg.ffn_hidden_size,
"num_attention_heads": cfg.num_attention_heads,
"max_seq_length": cfg.max_seq_length,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"attention_dropout_prob": cfg.attention_dropout_prob,
"output_dropout_prob": cfg.output_dropout_prob,
"layernorm_epsilon": cfg.layernorm_epsilon,
"initializer_range": cfg.initializer_range,
"use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
Returns:
flow.Tensor: logits
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
input_embeds = self.embeddings(input_ids, 0)
transformer_output = self.transformer(input_embeds, attention_mask=None)
output = self.lm_head(transformer_output, self.embeddings.token_embeddings.weight)
return output
class GPTEmbedding(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_seq_length,
init_method=init.xavier_normal_,
embedding_dropout_prob=0.0,
amp_enabled=False,
):
super().__init__()
self.token_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_seq_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.dropout = nn.Dropout(embedding_dropout_prob)
self.position_ids = flow.arange(
max_seq_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
input_embeds = token_embeds + position_embeds
input_embeds = self.dropout(input_embeds)
return input_embeds
class Transformer(nn.Module):
def __init__(
self,
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
init_method=init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
):
super().__init__()
self.hidden_layers = hidden_layers
def build_layer(layer_number):
return TransformerLayer(
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.causal,
layer_idx=layer_number,
)
self.layers = nn.ModuleList([build_layer(i) for i in range(self.hidden_layers)])
self.layernorm_f = LayerNorm(hidden_size, eps=layernorm_epsilon, layer_idx=-1)
def forward(self, hidden_states, attention_mask):
# hidden_states shape: (batch_size, seq_length, hidden_size)
# sbp: [S(0), B]
for i, layer in enumerate(self.layers):
hidden_states = layer(hidden_states, attention_mask)
output = self.layernorm_f(hidden_states)
return output
class GPTLoss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels):
lm_loss = self.lm_loss(logits, lm_labels)
lm_loss = lm_loss.mean()
return {"lm_loss": lm_loss}
class GPTForPreTraining(nn.Module):
"""
    GPT Model with a language modeling head on top.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.GPT_model = GPTModel(cfg)
self.loss_func = GPTLoss()
def forward(
self,
input_ids,
labels=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
labels (flow.LongTensor, optional): Labels for computing language modeling loss.
None for evaluating. Defaults to None.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation.
:code:`{"masked_lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.GPT_model(input_ids)
if labels is not None:
lm_loss = self.loss_func(logits, labels)
return lm_loss
else:
return {"prediction_scores": logits}
@staticmethod
def set_pipeline_stage_id(model: nn.Module):
dist_utils = dist.get_dist_util()
if hasattr(model.GPT_model.transformer.layernorm_f, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, (GPTEmbedding, CasualMask)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, (LMLogits, GPTLoss)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), (GPTEmbedding, CasualMask)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), (LMLogits, GPTLoss)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
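# Hedged sketch of what the stage assignment above achieves (config keys are
# assumptions, shown for illustration only): with 2 pipeline stages and 24
# transformer layers, get_layer_stage_id maps layers 0-11 to stage 0 and
# layers 12-23 to stage 1, while the embedding / causal mask stay on the first
# stage and the logits / loss land on the last.
#
#     # e.g. in a LiBai-style config (names assumed):
#     # train.dist.pipeline_parallel_size = 2
#     # train.dist.pipeline_num_layers = 24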
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ResMLP Model
# References:
# resmlp: https://github.com/facebookresearch/deit/blob/main/resmlp_models.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear, PatchEmbedding
class Affine(nn.Module):
def __init__(self, dim, *, layer_idx=0):
super().__init__()
self.alpha = nn.Parameter(
flow.ones(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.beta = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
return self.alpha * x + self.beta
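# Affine is a per-channel learnable scale and shift, y = alpha * x + beta,
# initialized to the identity (alpha = 1, beta = 0). Hedged toy sketch
# (ignoring placement/sbp; names are illustrative):
#
#     aff = Affine(dim=4)
#     # at initialization aff(x) == x, since alpha is all ones and beta all zeros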
class layers_scale_mlp_blocks(nn.Module):
def __init__(
self, dim, drop=0.0, drop_path=0.0, init_values=1e-4, num_patches=196, *, layer_idx=0
):
super().__init__()
self.norm1 = Affine(dim, layer_idx=layer_idx)
self.attn = Linear(num_patches, num_patches, layer_idx=layer_idx)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = Affine(dim, layer_idx=layer_idx)
self.mlp = MLP(hidden_size=dim, ffn_hidden_size=int(4.0 * dim), layer_idx=layer_idx)
self.gamma_1 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.gamma_2 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
x = x + self.drop_path(
self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
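# The block above performs ResMLP's two mixing steps: `self.attn` is a Linear
# over the patch axis (token mixing, hence the transpose(1, 2) before and
# after), and `self.mlp` mixes channels; gamma_1 / gamma_2 are per-channel
# layer-scale factors initialized to init_values. Hedged shape walk-through:
#
#     x: (B, N, D)                   # N patches, D channels
#     x.transpose(1, 2): (B, D, N)   # Linear(N, N) mixes across patches
#     ... .transpose(1, 2): (B, N, D)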
class ResMLP(nn.Module):
"""ResMLP in LiBai.
LiBai's implementation of:
`ResMLP: Feedforward networks for image classification with data-efficient training
<https://arxiv.org/abs/2105.03404>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
drop_rate (float): dropout rate
drop_path_rate (float): stochastic depth rate
init_scale (float): the layer scale ratio
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
drop_rate=0.0,
drop_path_rate=0.0,
init_scale=1e-4,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
        dpr = [drop_path_rate for i in range(depth)]  # uniform drop-path rate for every block
self.blocks = nn.ModuleList(
[
layers_scale_mlp_blocks(
dim=embed_dim,
drop=drop_rate,
drop_path=dpr[i],
init_values=init_scale,
num_patches=num_patches,
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = Affine(embed_dim, layer_idx=-1)
self.head = (
Linear(embed_dim, num_classes, layer_idx=-1) if num_classes > 0 else nn.Identity()
)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
self.apply(self._init_weights)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"init_scale": cfg.init_scale,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward_features(self, x):
x = self.patch_embed(x)
# layer scale mlp blocks
for i, blk in enumerate(self.blocks):
x = blk(x)
return x
def forward_head(self, x):
B = x.shape[0]
x = self.norm(x)
x = x.mean(dim=1).reshape(B, 1, -1)
return self.head(x[:, 0])
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.loss_func, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).activation_checkpointing = True
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.utils import distributed as dist
from .bert_model import BertEmbeddings, BertExtendedAttnMask, BertModel, BertPooler
from .utils import init_method_normal
class RobertaExtendedAttnMask(BertExtendedAttnMask):
"""
Same as BertExtendedAttnMask.
"""
class RobertaEmbeddings(BertEmbeddings):
"""
    Same as BertEmbeddings, with small tweaks to vocab_embeddings and position_embeddings
    (padding_idx support and RoBERTa-style position ids).
"""
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
pad_token_id=1,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=num_tokentypes,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.pad_token_id = pad_token_id
self.vocab_embeddings = VocabEmbedding(
vocab_size,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
self.position_embeddings = Embedding(
max_sequence_length,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
1,
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
else:
self.tokentype_embeddings = None
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
position_ids = self.create_position_ids_from_input_ids(input_ids, self.pad_token_id)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def create_position_ids_from_input_ids(self, input_ids, pad_token_id):
mask = input_ids.ne(pad_token_id).int()
position_ids = (flow.cumsum(mask, dim=1).type_as(mask)) * mask + pad_token_id
position_ids = position_ids.to_global(sbp=input_ids.sbp, placement=input_ids.placement)
return position_ids
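# Worked example of the RoBERTa-style position ids above (pad_token_id = 1):
#
#     input_ids      = [5, 8, 9, 1, 1]   # the last two tokens are padding
#     mask           = [1, 1, 1, 0, 0]
#     cumsum * mask  = [1, 2, 3, 0, 0]
#     + pad_token_id = [2, 3, 4, 1, 1]
#
# Real tokens get positions starting at pad_token_id + 1, while padding
# positions collapse onto pad_token_id, matching the padding_idx used by
# position_embeddings above.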
class RobertaPooler(BertPooler):
"""
Same as BertPooler.
"""
class RobertaLoss(nn.Module):
def __init__(self):
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
        # Change the sbp of loss_mask.sum() from [P, B] to [B, B],
        # because (lm_loss * loss_mask) / loss_mask.sum() cannot divide P by P.
denominator = loss_mask.sum().to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
return loss_dict
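# The loss above is the standard mask-weighted mean:
#     masked_lm_loss = sum(per_token_loss * loss_mask) / sum(loss_mask)
# Hedged toy example (3 masked positions out of 5):
#
#     per_token_loss = [2.0, 1.0, 3.0, 0.5, 0.5]
#     loss_mask      = [1,   1,   1,   0,   0  ]
#     masked_lm_loss = (2.0 + 1.0 + 3.0) / 3 = 2.0
#
# The to_global calls only adjust sbp signatures so the division is legal for
# OneFlow global tensors; they do not change the value.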
class RobertaModel(BertModel):
"""The bare Roberta Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int):
The size of vocabulary file.
hidden_size (int):
The size of hidden states.
hidden_layers (int):
The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each
``TransformerLayer``.
hidden_dropout_prob (float, optional):
            The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``RobertaEmbeddings``.
        num_tokentypes (int, optional):
            Number of token types (segment indices). Defaults to 2.
        add_pooling_layer (bool, optional):
            Whether or not to add a pooling layer that pools the sequence of
            hidden-states for the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
        layernorm_eps (float, optional):
            The epsilon of LayerNorm layers. Defaults to 1e-12.
pad_token_id (int, optional):
The token id used for padding. Defaults to 1.
        bias_gelu_fusion (bool, optional):
            Whether or not to fuse the computing of bias and gelu. Defaults to ``True``.
        bias_dropout_fusion (bool, optional):
            Whether or not to fuse the computing of dropout and bias. Defaults to ``True``.
        scale_mask_softmax_fusion (bool, optional):
            Whether to fuse the computing of mask and softmax in attention layers.
            Defaults to ``True``.
        apply_query_key_layer_scaling (bool, optional):
            Whether or not to use layer-index-dependent scaling when computing
            attention scores. If ``True``, the scaling factor equals
            sqrt(d) * (layer_index + 1). Defaults to ``True``.
        apply_residual_post_layernorm (bool, optional):
            If ``True``, use the original BERT (RoBERTa) residual connection ordering;
            otherwise use the Megatron-BERT residual connection, which is more stable
            when scaling model size (https://arxiv.org/pdf/1909.08053.pdf).
            Defaults to ``False``.
amp_enabled (bool, optional):
            Whether or not to set fp16 for the embedding weights in the RoBERTa model.
            Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
pad_token_id=1,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=num_tokentypes,
add_pooling_layer=add_pooling_layer,
initializer_range=initializer_range,
layernorm_eps=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
amp_enabled=amp_enabled,
)
init_method = init_method_normal(initializer_range)
# Embeddings
self.embeddings = RobertaEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
pad_token_id,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = RobertaExtendedAttnMask()
self.pooler = RobertaPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"pad_token_id": cfg.pad_token_id,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
class RobertaLMHead(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, layer_norm_eps):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), eps=layer_norm_eps, layer_idx=-1)
        # NOTE(xzp): LMLogits acts as the decoder (an nn.Linear(hidden_size, vocab_size));
        # it shares roberta's word_embeddings weight instead of owning its own projection.
self.lm_logits = LMLogits(vocab_size, bias=True)
def forward(self, hidden_states, word_embeddings_weight):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
hidden_states = self.lm_logits(hidden_states, word_embeddings_weight)
return hidden_states
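# Hedged sketch of the weight tying above (the call is illustrative, not part
# of the original file): the head owns no vocab-sized projection; it reuses
# the input embedding matrix via LMLogits at the last pipeline stage.
#
#     lm_head = RobertaLMHead(vocab_size, hidden_size, init_method, layernorm_eps)
#     logits = lm_head(hidden_states, roberta.word_embeddings_weight())
#     # logits: (batch, seq_len, vocab_size)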
class RobertaPreTrainedModel(nn.Module):
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.roberta.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, RobertaEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, RobertaExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
                # `add_pooling_layer` defaults to False in RobertaForMaskedLM
                # and RobertaForCausalLM, so the pooler may be absent.
elif isinstance(module_block.origin, RobertaPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, RobertaLMHead):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.to(nn.Module), RobertaEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), RobertaExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
                # `add_pooling_layer` defaults to False in RobertaForMaskedLM
                # and RobertaForCausalLM, so the pooler may be absent.
elif isinstance(module_block.to(nn.Module), RobertaPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), RobertaLMHead):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class RobertaForPreTraining(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
            lm_labels (flow.LongTensor, optional): Labels for computing the masked
                language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
                Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.roberta(input_ids, attention_mask, tokentype_ids=tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if lm_labels is not None:
return self.loss_fc(prediction_scores, lm_labels, loss_mask)
return {"prediction_scores": prediction_scores}
class RobertaForCausalLM(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
position_ids=None,
labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
position_ids (flow.LongTensor, optional): Indices of positions of each input sequence
tokens in the position embeddings. Defaults to None.
labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
outputs = self.roberta(input_ids, attention_mask, position_ids, tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if labels is not None:
# next-token prediction task, shift prediction_scores and labels by one.
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
shifted_prediction_scores = shifted_prediction_scores.to_global(
sbp=prediction_scores.sbp
)
shifted_labels = labels[:, 1:].contiguous()
shifted_labels = shifted_labels.to_global(sbp=shifted_labels.sbp)
            # RobertaLoss already returns a dict: {"lm_loss": masked_lm_loss}
            return self.loss_fc(shifted_prediction_scores, shifted_labels, loss_mask)
return {"prediction_scores": prediction_scores}