No commit message

No commit message

No commit message
4e32f529 · laibao · bf014115 · 4e32f529 · 4e32f529 · 4e32f529
Commit 4e32f529 authored Dec 20, 2024 by laibao
20 changed files
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+
+# TP1 shapes
+WEIGHT_SHAPES = {
+    "mistralai/Mistral-7B-v0.1": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-7b-hf": [
+        ([4096, 12288], 1),
+        ([4096, 4096], 0),
+        ([4096, 22016], 1),
+        ([11008, 4096], 0),
+    ],
+    "meta-llama/Llama-3-8b": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-13b-hf": [
+        ([5120, 15360], 1),
+        ([5120, 5120], 0),
+        ([5120, 27648], 1),
+        ([13824, 5120], 0),
+    ],
+    "meta-llama/Llama-2-70b-hf": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+}
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
+           -v $PWD/data:/data \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
+           --model-id $MODEL \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens $TOKENS
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
+import cProfile
+import pstats
+
+from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser
+
+# A very long prompt, total number of tokens is about 15k.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"
+               ] * 1000
+LONG_PROMPT = ' '.join(LONG_PROMPT)
+
+
+def main(args):
+    llm = LLM(
+        model=args.model,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        tensor_parallel_size=args.tensor_parallel_size,
+        use_v2_block_manager=args.use_v2_block_manager,
+    )
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    profiler = cProfile.Profile()
+
+    print("------warm up------")
+    for i in range(3):
+        output = llm.generate(LONG_PROMPT, sampling_params)
+        print(output[0].outputs[0].text)
+
+    print("------start generating------")
+    for i in range(3):
+        profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
+                        globals(), locals())
+
+    # analyze the runtime of hashing function
+    stats = pstats.Stats(profiler)
+    stats.sort_stats('cumulative')
+    total_time = 0
+    total_calls = 0
+    for func in stats.stats:
+        if 'hash_of_block' in func[2]:
+            total_time = stats.stats[func][3]
+            total_calls = stats.stats[func][0]
+    percentage = (total_time / stats.total_tt) * 100
+    print(f"Hashing took {total_time:.2f} seconds,"
+          f"{percentage:.2f}% of the total runtime.")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Benchmark the performance of hashing function in'
+        'automatic prefix caching.')
+    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
+FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee.
+When forty winters shall beseige thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery, so gazed on now,
+Will be a tatter'd weed, of small worth held:
+Then being ask'd where all thy beauty lies,
+Where all the treasure of thy lusty days,
+To say, within thine own deep-sunken eyes,
+Were an all-eating shame and thriftless praise.
+How much more praise deserved thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.
+Look in thy glass, and tell the face thou viewest
+Now is the time that face should form another;
+Whose fresh repair if now thou not renewest,
+Thou dost beguile the world, unbless some mother.
+For where is she so fair whose unear'd womb
+Disdains the tillage of thy husbandry?
+Or who is he so fond will be the tomb
+Of his self-love, to stop posterity?
+Thou art thy mother's glass, and she in thee
+Calls back the lovely April of her prime:
+So thou through windows of thine age shall see
+Despite of wrinkles this thy golden time.
+But if thou live, remember'd not to be,
+Die single, and thine image dies with thee.
+Unthrifty loveliness, why dost thou spend
+Upon thyself thy beauty's legacy?
+Nature's bequest gives nothing but doth lend,
+And being frank she lends to those are free.
+Then, beauteous niggard, why dost thou abuse
+The bounteous largess given thee to give?
+Profitless usurer, why dost thou use
+So great a sum of sums, yet canst not live?
+For having traffic with thyself alone,
+Thou of thyself thy sweet self dost deceive.
+Then how, when nature calls thee to be gone,
+What acceptable audit canst thou leave?
+Thy unused beauty must be tomb'd with thee,
+Which, used, lives th' executor to be.
+Those hours, that with gentle work did frame
+The lovely gaze where every eye doth dwell,
+Will play the tyrants to the very same
+And that unfair which fairly doth excel:
+For never-resting time leads summer on
+To hideous winter and confounds him there;
+Sap cheque'd with frost and lusty leaves quite gone,
+Beauty o'ersnow'd and bareness every where:
+Then, were not summer's distillation left,
+A liquid prisoner pent in walls of glass,
+Beauty's effect with beauty were bereft,
+Nor it nor no remembrance what it was:
+But flowers distill'd though they with winter meet,
+Leese but their show; their substance still lives sweet.
+Then let not winter's ragged hand deface
+In thee thy summer, ere thou be distill'd:
+Make sweet some vial; treasure thou some place
+With beauty's treasure, ere it be self-kill'd.
+That use is not forbidden usury,
+Which happies those that pay the willing loan;
+That's for thyself to breed another thee,
+Or ten times happier, be it ten for one;
+Ten times thyself were happier than thou art,
+If ten of thine ten times refigured thee:
+Then what could death do, if thou shouldst depart,
+Leaving thee living in posterity?
+Be not self-will'd, for thou art much too fair
+To be death's conquest and make worms thine heir.
+Lo! in the orient when the gracious light
+Lifts up his burning head, each under eye
+Doth homage to his new-appearing sight,
+Serving with looks his sacred majesty;
+And having climb'd the steep-up heavenly hill,
+Resembling strong youth in his middle age,
+yet mortal looks adore his beauty still,
+Attending on his golden pilgrimage;
+But when from highmost pitch, with weary car,
+Like feeble age, he reeleth from the day,
+The eyes, 'fore duteous, now converted are
+From his low tract and look another way:
+So thou, thyself out-going in thy noon,
+Unlook'd on diest, unless thou get a son.
+Music to hear, why hear'st thou music sadly?
+Sweets with sweets war not, joy delights in joy.
+Why lovest thou that which thou receivest not gladly,
+Or else receivest with pleasure thine annoy?
+If the true concord of well-tuned sounds,
+By unions married, do offend thine ear,
+They do but sweetly chide thee, who confounds
+In singleness the parts that thou shouldst bear.
+Mark how one string, sweet husband to another,
+Strikes each in each by mutual ordering,
+Resembling sire and child and happy mother
+Who all in one, one pleasing note do sing:
+Whose speechless song, being many, seeming one,
+Sings this to thee: 'thou single wilt prove none.'
+Is it for fear to wet a widow's eye
+That thou consumest thyself in single life?
+Ah! if thou issueless shalt hap to die.
+The world will wail thee, like a makeless wife;
+The world will be thy widow and still weep
+That thou no form of thee hast left behind,
+When every private widow well may keep
+By children's eyes her husband's shape in mind.
+Look, what an unthrift in the world doth spend
+Shifts but his place, for still the world enjoys it;
+But beauty's waste hath in the world an end,
+And kept unused, the user so destroys it.
+No love toward others in that bosom sits
+That on himself such murderous shame commits.
+For shame! deny that thou bear'st love to any,
+Who for thyself art so unprovident.
+Grant, if thou wilt, thou art beloved of many,
+But that thou none lovest is most evident;
+For thou art so possess'd with murderous hate
+That 'gainst thyself thou stick'st not to conspire.
+Seeking that beauteous roof to ruinate
+Which to repair should be thy chief desire.
+O, change thy thought, that I may change my mind!
+Shall hate be fairer lodged than gentle love?
+Be, as thy presence is, gracious and kind,
+Or to thyself at least kind-hearted prove:
+Make thee another self, for love of me,
+That beauty still may live in thine or thee.
+As fast as thou shalt wane, so fast thou growest
+In one of thine, from that which thou departest;
+And that fresh blood which youngly thou bestowest
+Thou mayst call thine when thou from youth convertest.
+Herein lives wisdom, beauty and increase:
+Without this, folly, age and cold decay:
+If all were minded so, the times should cease
+And threescore year would make the world away.
+Let those whom Nature hath not made for store,
+Harsh featureless and rude, barrenly perish:
+Look, whom she best endow'd she gave the more;
+Which bounteous gift thou shouldst in bounty cherish:
+She carved thee for her seal, and meant thereby
+Thou shouldst print more, not let that copy die.
+When I do count the clock that tells the time,
+And see the brave day sunk in hideous night;
+When I behold the violet past prime,
+And sable curls all silver'd o'er with white;
+When lofty trees I see barren of leaves
+Which erst from heat did canopy the herd,
+And summer's green all girded up in sheaves
+Borne on the bier with white and bristly beard,
+Then of thy beauty do I question make,
+That thou among the wastes of time must go,
+Since sweets and beauties do themselves forsake
+And die as fast as they see others grow;
+And nothing 'gainst Time's scythe can make defence
+Save breed, to brave him when he takes thee hence.
+O, that you were yourself! but, love, you are
+No longer yours than you yourself here live:
+Against this coming end you should prepare,
+And your sweet semblance to some other give.
+So should that beauty which you hold in lease
+Find no determination: then you were
+Yourself again after yourself's decease,
+When your sweet issue your sweet form should bear.
+Who lets so fair a house fall to decay,
+Which husbandry in honour might uphold
+Against the stormy gusts of winter's day
+And barren rage of death's eternal cold?
+O, none but unthrifts! Dear my love, you know
+You had a father: let your son say so.
+Not from the stars do I my judgment pluck;
+And yet methinks I have astronomy,
+But not to tell of good or evil luck,
+Of plagues, of dearths, or seasons' quality;
+Nor can I fortune to brief minutes tell,
+Pointing to each his thunder, rain and wind,
+Or say with princes if it shall go well,
+By oft predict that I in heaven find:
+But from thine eyes my knowledge I derive,
+And, constant stars, in them I read such art
+As truth and beauty shall together thrive,
+If from thyself to store thou wouldst convert;
+Or else of thee this I prognosticate:
+Thy end is truth's and beauty's doom and date.
+When I consider every thing that grows
+Holds in perfection but a little moment,
+That this huge stage presenteth nought but shows
+Whereon the stars in secret influence comment;
+When I perceive that men as plants increase,
+Cheered and cheque'd even by the self-same sky,
+Vaunt in their youthful sap, at height decrease,
+And wear their brave state out of memory;
+Then the conceit of this inconstant stay
+Sets you most rich in youth before my sight,
+Where wasteful Time debateth with Decay,
+To change your day of youth to sullied night;
+And all in war with Time for love of you,
+As he takes from you, I engraft you new.
+But wherefore do not you a mightier way
+Make war upon this bloody tyrant, Time?
+And fortify yourself in your decay
+With means more blessed than my barren rhyme?
+Now stand you on the top of happy hours,
+And many maiden gardens yet unset
+With virtuous wish would bear your living flowers,
+Much liker than your painted counterfeit:
+So should the lines of life that life repair,
+Which this, Time's pencil, or my pupil pen,
+Neither in inward worth nor outward fair,
+Can make you live yourself in eyes of men.
+To give away yourself keeps yourself still,
+And you must live, drawn by your own sweet skill.
+Who will believe my verse in time to come,
+If it were fill'd with your most high deserts?
+Though yet, heaven knows, it is but as a tomb
+Which hides your life and shows not half your parts.
+If I could write the beauty of your eyes
+And in fresh numbers number all your graces,
+The age to come would say 'This poet lies:
+Such heavenly touches ne'er touch'd earthly faces.'
+So should my papers yellow'd with their age
+Be scorn'd like old men of less truth than tongue,
+And your true rights be term'd a poet's rage
+And stretched metre of an antique song:
+But were some child of yours alive that time,
+You should live twice; in it and in my rhyme.
+Shall I compare thee to a summer's day?
+Thou art more lovely and more temperate:
+Rough winds do shake the darling buds of May,
+And summer's lease hath all too short a date:
+Sometime too hot the eye of heaven shines,
+And often is his gold complexion dimm'd;
+And every fair from fair sometime declines,
+By chance or nature's changing course untrimm'd;
+But thy eternal summer shall not fade
+Nor lose possession of that fair thou owest;
+Nor shall Death brag thou wander'st in his shade,
+When in eternal lines to time thou growest:
+So long as men can breathe or eyes can see,
+So long lives this and this gives life to thee.
+Devouring Time, blunt thou the lion's paws,
+And make the earth devour her own sweet brood;
+Pluck the keen teeth from the fierce tiger's jaws,
+And burn the long-lived phoenix in her blood;
+Make glad and sorry seasons as thou fleets,
+And do whate'er thou wilt, swift-footed Time,
+To the wide world and all her fading sweets;
+But I forbid thee one most heinous crime:
+O, carve not with thy hours my love's fair brow,
+Nor draw no lines there with thine antique pen;
+Him in thy course untainted do allow
+For beauty's pattern to succeeding men.
+Yet, do thy worst, old Time: despite thy wrong,
+My love shall in my verse ever live young.
+A woman's face with Nature's own hand painted
+Hast thou, the master-mistress of my passion;
+A woman's gentle heart, but not acquainted
+With shifting change, as is false women's fashion;
+An eye more bright than theirs, less false in rolling,
+Gilding the object whereupon it gazeth;
+A man in hue, all 'hues' in his controlling,
+Much steals men's eyes and women's souls amazeth.
+And for a woman wert thou first created;
+Till Nature, as she wrought thee, fell a-doting,
+And by addition me of thee defeated,
+By adding one thing to my purpose nothing.
+But since she prick'd thee out for women's pleasure,
+Mine be thy love and thy love's use their treasure.
+So is it not with me as with that Muse
+Stirr'd by a painted beauty to his verse,
+Who heaven itself for ornament doth use
+And every fair with his fair doth rehearse
+Making a couplement of proud compare,
+With sun and moon, with earth and sea's rich gems,
+With April's first-born flowers, and all things rare
+That heaven's air in this huge rondure hems.
+O' let me, true in love, but truly write,
+And then believe me, my love is as fair
+As any mother's child, though not so bright
+As those gold candles fix'd in heaven's air:
+Let them say more than like of hearsay well;
+I will not praise that purpose not to sell.
+My glass shall not persuade me I am old,
+So long as youth and thou are of one date;
+But when in thee time's furrows I behold,
+Then look I death my days should expiate.
+For all that beauty that doth cover thee
+Is but the seemly raiment of my heart,
+Which in thy breast doth live, as thine in me:
+How can I then be elder than thou art?
+O, therefore, love, be of thyself so wary
+As I, not for myself, but for thee will;
+Bearing thy heart, which I will keep so chary
+As tender nurse her babe from faring ill.
+Presume not on thy heart when mine is slain;
+Thou gavest me thine, not to give back again.
+As an unperfect actor on the stage
+Who with his fear is put besides his part,
+Or some fierce thing replete with too much rage,
+Whose strength's abundance weakens his own heart.
+So I, for fear of trust, forget to say
+The perfect ceremony of love's rite,
+And in mine own love's strength seem to decay,
+O'ercharged with burden of mine own love's might.
+O, let my books be then the eloquence
+And dumb presagers of my speaking breast,
+Who plead for love and look for recompense
+More than that tongue that more hath more express'd.
+O, learn to read what silent love hath writ:
+To hear with eyes belongs to love's fine wit.
+Mine eye hath play'd the painter and hath stell'd
+Thy beauty's form in table of my heart;
+My body is the frame wherein 'tis held,
+And perspective it is the painter's art.
+For through the painter must you see his skill,
+To find where your true image pictured lies;
+Which in my bosom's shop is hanging still,
+That hath his windows glazed with thine eyes.
+Now see what good turns eyes for eyes have done:
+Mine eyes have drawn thy shape, and thine for me
+Are windows to my breast, where-through the sun
+Delights to peep, to gaze therein on thee;
+Yet eyes this cunning want to grace their art;
+They draw but what they see, know not the heart.
+Let those who are in favour with their stars
+Of public honour and proud titles boast,
+Whilst I, whom fortune of such triumph bars,
+Unlook'd for joy in that I honour most.
+Great princes' favourites their fair leaves spread
+But as the marigold at the sun's eye,
+And in themselves their pride lies buried,
+For at a frown they in their glory die.
+The painful warrior famoused for fight,
+After a thousand victories once foil'd,
+Is from the book of honour razed quite,
+And all the rest forgot for which he toil'd:
+Then happy I, that love and am beloved
+Where I may not remove nor be removed.
+Lord of my love, to whom in vassalage
+Thy merit hath my duty strongly knit,
+To thee I send this written embassage,
+To witness duty, not to show my wit:
+Duty so great, which wit so poor as mine
+May make seem bare, in wanting words to show it,
+But that I hope some good conceit of thine
+In thy soul's thought, all naked, will bestow it;
+Till whatsoever star that guides my moving
+Points on me graciously with fair aspect
+And puts apparel on my tatter'd loving,
+To show me worthy of thy sweet respect:
+Then may I dare to boast how I do love thee;
+Till then not show my head where thou mayst prove me.
+Weary with toil, I haste me to my bed,
+The dear repose for limbs with travel tired;
+But then begins a journey in my head,
+To work my mind, when body's work's expired:
+For then my thoughts, from far where I abide,
+Intend a zealous pilgrimage to thee,
+And keep my drooping eyelids open wide,
+Looking on darkness which the blind do see
+Save that my soul's imaginary sight
+Presents thy shadow to my sightless view,
+Which, like a jewel hung in ghastly night,
+Makes black night beauteous and her old face new.
+Lo! thus, by day my limbs, by night my mind,
+For thee and for myself no quiet find.
+How can I then return in happy plight,
+That am debarr'd the benefit of rest?
+When day's oppression is not eased by night,
+But day by night, and night by day, oppress'd?
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deal heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried.
+How many a holy and obsequious tear
+Hath dear religious love stol'n from mine eye
+As interest of the dead, which now appear
+But things removed that hidden in thee lie!
+Thou art the grave where buried love doth live,
+Hung with the trophies of my lovers gone,
+Who all their parts of me to thee did give;
+That due of many now is thine alone:
+Their images I loved I view in thee,
+And thou, all they, hast all the all of me.
+If thou survive my well-contented day,
+When that churl Death my bones with dust shall cover,
+And shalt by fortune once more re-survey
+These poor rude lines of thy deceased lover,
+Compare them with the bettering of the time,
+And though they be outstripp'd by every pen,
+Reserve them for my love, not for their rhyme,
+Exceeded by the height of happier men.
+O, then vouchsafe me but this loving thought:
+'Had my friend's Muse grown with this growing age,
+A dearer birth than this his love had brought,
+To march in ranks of better equipage:
+But since he died and poets better prove,
+Theirs for their style I'll read, his for his love.'
+Full many a glorious morning have I seen
+Flatter the mountain-tops with sovereign eye,
+Kissing with golden face the meadows green,
+Gilding pale streams with heavenly alchemy;
+Anon permit the basest clouds to ride
+With ugly rack on his celestial face,
+And from the forlorn world his visage hide,
+Stealing unseen to west with this disgrace:
+Even so my sun one early morn did shine
+With all triumphant splendor on my brow;
+But out, alack! he was but one hour mine;
+The region cloud hath mask'd him from me now.
+Yet him for this my love no whit disdaineth;
+Suns of the world may stain when heaven's sun staineth.
+Why didst thou promise such a beauteous day,
+And make me travel forth without my cloak,
+To let base clouds o'ertake me in my way,
+Hiding thy bravery in their rotten smoke?
+'Tis not enough that through the cloud thou break,
+To dry the rain on my storm-beaten face,
+For no man well of such a salve can speak
+That heals the wound and cures not the disgrace:
+Nor can thy shame give physic to my grief;
+Though thou repent, yet I have still the loss:
+The offender's sorrow lends but weak relief
+To him that bears the strong offence's cross.
+Ah! but those tears are pearl which thy love sheds,
+And they are rich and ransom all ill deeds.
+No more be grieved at that which thou hast done:
+Roses have thorns, and silver fountains mud;
+Clouds and eclipses stain both moon and sun,
+And loathsome canker lives in sweetest bud.
+All men make faults, and even I in this,
+Authorizing thy trespass with compare,
+Myself corrupting, salving thy amiss,
+Excusing thy sins more than thy sins are;
+For to thy sensual fault I bring in sense--
+Thy adverse party is thy advocate--
+And 'gainst myself a lawful plea commence:
+Such civil war is in my love and hate
+That I an accessary needs must be
+To that sweet thief which sourly robs from me.
+Let me confess that we two must be twain,
+Although our undivided loves are one:
+So shall those blots that do with me remain
+Without thy help by me be borne alone.
+In our two loves there is but one respect,
+Though in our lives a separable spite,
+Which though it alter not love's sole effect,
+Yet doth it steal sweet hours from love's delight.
+I may not evermore acknowledge thee,
+Lest my bewailed guilt should do thee shame,
+Nor thou with public kindness honour me,
+Unless thou take that honour from thy name:
+But do not so; I love thee in such sort
+As, thou being mine, mine is thy good report.
+As a decrepit father takes delight
+To see his active child do deeds of youth,
+So I, made lame by fortune's dearest spite,
+Take all my comfort of thy worth and truth.
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me!
\ No newline at end of file
--- a/docs/GLM.png
+++ b/docs/GLM.png
--- a/docs/transformers.jpg
+++ b/docs/transformers.jpg
--- a/examples/cpu_offload.py
+++ b/examples/cpu_offload.py
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/gguf_inference.py
+++ b/examples/gguf_inference.py
+from huggingface_hub import hf_hub_download
+
+from vllm import LLM, SamplingParams
+
+
+def run_gguf_inference(model_path):
+    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
+    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
+    # Sample prompts.
+    prompts = [
+        "How many helicopters can a human eat in one sitting?",
+        "What's the future of AI?",
+    ]
+    prompts = [
+        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
+        for prompt in prompts
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0, max_tokens=128)
+
+    # Create an LLM.
+    llm = LLM(model=model_path,
+              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              gpu_memory_utilization=0.95)
+
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
+    model = hf_hub_download(repo_id, filename=filename)
+    run_gguf_inference(model)
--- a/examples/gradio_openai_chatbot_webserver.py
+++ b/examples/gradio_openai_chatbot_webserver.py
+import argparse
+
+import gradio as gr
+from openai import OpenAI
+
+# Argument parser setup
+parser = argparse.ArgumentParser(
+    description='Chatbot Interface with Customizable Parameters')
+parser.add_argument('--model-url',
+                    type=str,
+                    default='http://localhost:8000/v1',
+                    help='Model URL')
+parser.add_argument('-m',
+                    '--model',
+                    type=str,
+                    required=True,
+                    help='Model name for the chatbot')
+parser.add_argument('--temp',
+                    type=float,
+                    default=0.8,
+                    help='Temperature for text generation')
+parser.add_argument('--stop-token-ids',
+                    type=str,
+                    default='',
+                    help='Comma-separated stop token IDs')
+parser.add_argument("--host", type=str, default=None)
+parser.add_argument("--port", type=int, default=8001)
+
+# Parse the arguments
+args = parser.parse_args()
+
+# Set OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = args.model_url
+
+# Create an OpenAI client to interact with the API server
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+
+def predict(message, history):
+    # Convert chat history to OpenAI format
+    history_openai_format = [{
+        "role": "system",
+        "content": "You are a great ai assistant."
+    }]
+    for human, assistant in history:
+        history_openai_format.append({"role": "user", "content": human})
+        history_openai_format.append({
+            "role": "assistant",
+            "content": assistant
+        })
+    history_openai_format.append({"role": "user", "content": message})
+
+    # Create a chat completion request and send it to the API server
+    stream = client.chat.completions.create(
+        model=args.model,  # Model name to use
+        messages=history_openai_format,  # Chat history
+        temperature=args.temp,  # Temperature for text generation
+        stream=True,  # Stream response
+        extra_body={
+            'repetition_penalty':
+            1,
+            'stop_token_ids': [
+                int(id.strip()) for id in args.stop_token_ids.split(',')
+                if id.strip()
+            ] if args.stop_token_ids else []
+        })
+
+    # Read and return generated text from response stream
+    partial_message = ""
+    for chunk in stream:
+        partial_message += (chunk.choices[0].delta.content or "")
+        yield partial_message
+
+
+# Create and launch a chat interface with Gradio
+gr.ChatInterface(predict).queue().launch(server_name=args.host,
+                                         server_port=args.port,
+                                         share=True)
--- a/examples/gradio_webserver.py
+++ b/examples/gradio_webserver.py
+import argparse
+import json
+
+import gradio as gr
+import requests
+
+
+def http_bot(prompt):
+    headers = {"User-Agent": "vLLM Client"}
+    pload = {
+        "prompt": prompt,
+        "stream": True,
+        "max_tokens": 128,
+    }
+    response = requests.post(args.model_url,
+                             headers=headers,
+                             json=pload,
+                             stream=True)
+
+    for chunk in response.iter_lines(chunk_size=8192,
+                                     decode_unicode=False,
+                                     delimiter=b"\0"):
+        if chunk:
+            data = json.loads(chunk.decode("utf-8"))
+            output = data["text"][0]
+            yield output
+
+
+def build_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# vLLM text completion demo\n")
+        inputbox = gr.Textbox(label="Input",
+                              placeholder="Enter text and press ENTER")
+        outputbox = gr.Textbox(label="Output",
+                               placeholder="Generated result from the model")
+        inputbox.submit(http_bot, [inputbox], [outputbox])
+    return demo
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default=None)
+    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--model-url",
+                        type=str,
+                        default="http://localhost:8000/generate")
+    args = parser.parse_args()
+
+    demo = build_demo()
+    demo.queue().launch(server_name=args.host,
+                        server_port=args.port,
+                        share=True)
--- a/examples/medusa/README.md
+++ b/examples/medusa/README.md
+# Medusa Decoding
+本文说明如何使用vllm构建和运行medusa模型
+
+## Overview
+Medusa是一种大模型并行解码算法，除了支持官方提供的Top1-proposer,我们还支持tree-style并行解码，target model和draft model均可多卡推理
+
+与其他模型不同，medusa解码需要一个base model和若干Medusa heads.
+
+Vllm medusa model的实现在[vllm/model_executor/models/medusa.py]
+
+## Support Matrix
+  * FP16
+  * BF16
+  * PAGED_KV_CACHE
+  * Tensor Parallel
+
+### convert Medusa model weights
+# medusa 模型需要转换为vllm中Medusa的模型格式
+
+```bash
+python medusa_weight_converter.py --medusa_num_heads 4 --medusa_num_layers 1 --medusa_model_path /work/model.bin --vocab_size 152064 --hidden_size 8192 --output_dir /work/medusa/vllm-medusa-qwen2-72b-head-4 --medusa_choices="[(0), (0, 0), (0, 0, 0), (0, 1), (1), (1, 0), (0, 0, 0, 0), (0, 0, 1), (0, 2), (0, 1, 0), (2), (0, 0, 2), (0, 3), (1, 0, 0), (2, 0), (0, 2, 0), (0, 4), (0, 0, 3), (3), (0, 0, 0, 1), (0, 5), (0, 0, 1, 0), (0, 0, 4)]"
+```
+此处model.bin是训练后保存的medusa head权重，如果希望采用Top1-proposer，medusa_choices可以不设置
+
+
+### Run tree-style generation server
+
+```bash
+VLLM_TREE_DECODING=1 python3 -m vllm.entrypoints.openai.api_server \
+  --served-model-name qwen_medusa \
+  --model /models/Qwen2-72B-Instruct/ -tp 4 \
+  --max-model-len 1024 --max-num-seqs 8 --gpu-memory-utilization 0.8 \
+  --speculative-model /work/medusa/vllm-medusa-qwen2-72b-head-4 \
+  --speculative-draft-tensor-parallel-size 4 \
+  --speculative-disable-by-batch-size 9 \
+  --use-v2-block-manager \
+  --spec-decoding-acceptance-method typical_acceptance_sampler \
+  --dtype float16 --trust-remote-code --port 8086\
+  --num-speculative-heads 4 --num-speculative-tokens 24
+```
+注意：
+num_speculative_tokens = len(medusa_choices) + 1
+medusa_choices个数不能太多，否则多batch下会降低推理速度
+speculative-disable-by-batch-size要大于max-num-seqs，否则当batch等于max-num-seqs时，不会走并行解码
+
+### Run Top1-proposer server
+python3 -m vllm.entrypoints.openai.api_server \
+  --served-model-name qwen_medusa \
+  --model /models/Qwen2-72B-Instruct/ -tp 4 \
+  --max-model-len 1024 --max-num-seqs 8 --gpu-memory-utilization 0.8 \
+  --speculative-model /work/medusa/vllm-medusa-qwen2-72b-head-4 \
+  --speculative-draft-tensor-parallel-size 4 \
+  --speculative-disable-by-batch-size 9 \
+  --use-v2-block-manager \
+  --spec-decoding-acceptance-method typical_acceptance_sampler \
+  --dtype float16 --trust-remote-code --port 8086\
+  --num-speculative-tokens 4
+注意:
+使用Top1-proposer时，num-speculative-tokens就是medusa head的个数
+
+# do request
+```bash
+curl http://localhost:8086/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "qwen_medusa",
+"prompt": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n帮我写一个C++的快速排序算法<|im_end|>\n<|im_start|>assistant\n",
+"max_tokens": 256,
+"temperature": 0.0
+}'
+```
+
+### Run tree-style benchmark
+```bash
+VLLM_TREE_DECODING=1 python /work/test/medusa_benchmark_throughput.py --model /models/Qwen2-72B-Instruct/ -tp 4 --dtype float16 --trust-remote-code --max-num-seqs 4 --speculative-model /work/medusa/vllm-medusa1-qwen2-72b-head-4 --speculative-draft-tensor-parallel-size 4 --speculative-disable-by-batch-size 9 --use-v2-block-manager --spec-decoding-acceptance-method typical_acceptance_sampler --max-model-len 1024 --dataset /work/medusa_benchmark_data.json --num-speculative-heads 4 --num-speculative-tokens 24 --gpu-memory-utilization 0.95
+```
+
+### Run Top1-proposer benchmark
+```bash
+python /work/test/medusa_benchmark_throughput.py --model /models/Qwen2-72B-Instruct/ -tp 4 --dtype float16 --trust-remote-code --max-num-seqs 4 --speculative-model /work/medusa/vllm-medusa1-qwen2-72b-head-4 --speculative-draft-tensor-parallel-size 4 --speculative-disable-by-batch-size 9 --use-v2-block-manager --spec-decoding-acceptance-method typical_acceptance_sampler --max-model-len 1024 --dataset /work/medusa_benchmark_data.json --num-speculative-tokens 4 --gpu-memory-utilization 0.95
+```
+可设置max-num-seqs对不同的batch进行性能测试
+
--- a/examples/medusa/medusa_benchmark_throughput.py
+++ b/examples/medusa/medusa_benchmark_throughput.py
+"""Benchmark offline inference throughput."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+import uvloop
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+from vllm.inputs import PromptInputs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+from vllm.lora.request import LoRARequest
+
+
+def nullable_str(val: str):
+    if not val or val == "None":
+        return None
+    return val
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Only keep the first two turns of each conversation.
+    dataset = [data["prompt"] for data in dataset]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = fixed_output_len
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    num_scheduler_steps: int = 1,
+    use_v2_block_manager: bool = False,
+    download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
+    disable_async_output_proc: bool = False,
+    max_num_seqs: int = 8,
+    speculative_model: str=None,
+    speculative_draft_tensor_parallel_size: int = 1,
+    speculative_disable_by_batch_size: int = 4,
+    spec_decoding_acceptance_method: str = None,
+    enable_lora: bool = False,
+    max_lora_rank: int = 32,
+    merge_lora: bool = False,
+    lora_extra_vocab_size: int = 0,
+    lora_target_modules: List[str] = None,
+    num_speculative_heads: int = 5,
+    num_speculative_tokens: int = 64,
+    use_new_beam_search_impl: bool = False,
+    lora_modules: str = None
+) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+        num_scheduler_steps=num_scheduler_steps,
+        use_v2_block_manager=use_v2_block_manager,
+        disable_async_output_proc=disable_async_output_proc,
+        max_num_seqs=max_num_seqs,
+        speculative_model=speculative_model,
+        speculative_draft_tensor_parallel_size=speculative_draft_tensor_parallel_size,
+        speculative_disable_by_batch_size=speculative_disable_by_batch_size,
+        spec_decoding_acceptance_method=spec_decoding_acceptance_method,
+        enable_lora=enable_lora,
+        max_lora_rank=max_lora_rank,
+        merge_lora=merge_lora,
+        lora_extra_vocab_size=lora_extra_vocab_size,
+        lora_target_modules=lora_target_modules,
+        num_speculative_heads=num_speculative_heads,
+        num_speculative_tokens=num_speculative_tokens
+    )
+
+    # Add the requests to the engine.
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
+    for prompt, _, output_len in requests:
+        prompts.append(prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=False,
+                max_tokens=output_len,
+            ))
+
+    # warmup
+    warmup_prompts = []
+    warmup_sampling_params = []
+    for prompt, _, output_len in warmup_requests:
+        warmup_prompts.append(prompt)
+        warmup_sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=False,
+                max_tokens=output_len,
+            ))
+        
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        if lora_modules is None:
+            llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
+        else:
+            llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True,
+                         lora_request=LoRARequest("medusa-lora", 1, lora_modules))
+
+    total_out_tokens = 0
+    start = time.perf_counter()
+
+    if lora_modules is None:
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+    else:
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=False, 
+                               lora_request=LoRARequest("medusa-lora", 1, lora_modules))
+    for output in outputs:
+        print("token_ids len:{} text:{}".format(len(output.outputs[0].token_ids), output.outputs[0].text))
+        total_out_tokens += len(output.outputs[0].token_ids)
+
+    end = time.perf_counter()
+    return end - start, total_out_tokens
+
+
+async def run_vllm_async(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    num_scheduler_steps: int = 1,
+    use_v2_block_manager: bool = False,
+    download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
+    disable_async_output_proc: bool = False,
+    disable_frontend_multiprocessing: bool = False,
+    max_num_seqs: int = 8,
+    speculative_model: str=None,
+    speculative_draft_tensor_parallel_size: int = 1,
+    speculative_disable_by_batch_size: int = 4,
+    spec_decoding_acceptance_method: str = None,
+    enable_lora: bool = False,
+    max_lora_rank: int = 32,
+    merge_lora: bool = False,
+    lora_extra_vocab_size: int = 0,
+    lora_target_modules: List[str] = None,
+    num_speculative_heads: int = 5,
+    num_speculative_tokens: int = 64,
+    use_new_beam_search_impl: bool = False,
+    lora_modules: str = None
+) -> float:
+    from vllm import SamplingParams
+    engine_args = AsyncEngineArgs(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+        num_scheduler_steps=num_scheduler_steps,
+        use_v2_block_manager=use_v2_block_manager,
+        disable_async_output_proc=disable_async_output_proc,
+        worker_use_ray=False,
+        disable_log_requests=True,
+        max_num_seqs=max_num_seqs,
+        speculative_model=speculative_model,
+        speculative_draft_tensor_parallel_size=speculative_draft_tensor_parallel_size,
+        speculative_disable_by_batch_size=speculative_disable_by_batch_size,
+        spec_decoding_acceptance_method=spec_decoding_acceptance_method,
+        enable_lora=enable_lora,
+        max_lora_rank=max_lora_rank,
+        merge_lora=merge_lora,
+        lora_extra_vocab_size=lora_extra_vocab_size,
+        lora_target_modules=lora_target_modules,
+        num_speculative_heads=num_speculative_heads,
+        num_speculative_tokens=num_speculative_tokens
+    )
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[str] = []
+        sampling_params: List[SamplingParams] = []
+        for prompt, _, output_len in requests:
+            prompts.append(prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=0.0 if use_beam_search else 1.0,
+                    top_p=1.0,
+                    use_beam_search=use_beam_search,
+                    ignore_eos=False,
+                    max_tokens=output_len,
+                ))
+
+        generators = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+            generators.append(generator)
+        all_gens = merge_async_iterators(*generators)
+        out_dict = {}
+        async for i, res in all_gens:
+            #print("res:", res)
+            out_dict[res.request_id] = len(res.outputs[0].token_ids)
+        end = time.perf_counter()
+        total_out_tokens = 0
+        for token_num in out_dict.values():
+            total_out_tokens += token_num
+        return end - start, total_out_tokens
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    warmup_prompt = "hi" * 10
+    warmup_requests = [(warmup_prompt, 10, 10)
+                for _ in range(1)]
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+
+    if args.async_engine:
+        run_args = [
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.num_scheduler_steps,
+            args.use_v2_block_manager, args.download_dir, args.load_format,
+            args.disable_async_output_proc, False, args.max_num_seqs,
+            args.speculative_model, args.speculative_draft_tensor_parallel_size,
+            args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
+            args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
+            args.lora_target_modules, args.num_speculative_heads,
+            args.num_speculative_tokens
+        ]
+    else:
+        run_args = [
+            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.num_scheduler_steps,
+            args.use_v2_block_manager, args.download_dir, args.load_format,
+            args.disable_async_output_proc, args.max_num_seqs,
+            args.speculative_model, args.speculative_draft_tensor_parallel_size,
+            args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
+            args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
+            args.lora_target_modules, args.num_speculative_heads,
+            args.num_speculative_tokens
+        ]
+    
+    if args.async_engine:
+        run_args.append(args.disable_frontend_multiprocessing)
+        elapsed_time, total_out_tokens = uvloop.run(run_vllm_async(*run_args))
+    else:
+        elapsed_time, total_out_tokens = run_vllm(*run_args, args.use_new_beam_search_impl, args.lora_modules)
+
+    total_num_tokens = total_out_tokens + sum(prompt_len
+                           for _, prompt_len, _ in requests)
+    
+    print(f"Latency: {elapsed_time:.2f} s")
+    print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+    print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=256,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument("--use-new-beam-search-impl", action="store_true")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (hcu) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (hcu), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument("--device",
+                        type=str,
+                        default="auto",
+                        choices=DEVICE_OPTIONS,
+                        help='device type for vLLM execution')
+    parser.add_argument(
+        "--num-scheduler-steps",
+        type=int,
+        default=1,
+        help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument("--use-v2-block-manager",
+                        action='store_true',
+                        help="Enable block manager v2.")
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="Enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        "--disable-async-output-proc",
+        action='store_true',
+        default=False,
+        help="Disable async output processor for vLLM backend.")
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    parser.add_argument('--max-num-seqs',
+                            type=int,
+                            default=EngineArgs.max_num_seqs,
+                            help='Maximum number of sequences per iteration.')
+    parser.add_argument(
+            '--speculative-model',
+            type=nullable_str,
+            default=EngineArgs.speculative_model,
+            help=
+            'The name of the draft model to be used in speculative decoding.')
+    parser.add_argument(
+            '--speculative-draft-tensor-parallel-size',
+            '-spec-draft-tp',
+            type=int,
+            default=EngineArgs.speculative_draft_tensor_parallel_size,
+            help='Number of tensor parallel replicas for '
+            'the draft model in speculative decoding.')
+    
+    parser.add_argument(
+            '--speculative-disable-by-batch-size',
+            type=int,
+            default=EngineArgs.speculative_disable_by_batch_size,
+            help='Disable speculative decoding for new incoming requests '
+            'if the number of enqueue requests is larger than this value.')
+    
+    parser.add_argument(
+            '--spec-decoding-acceptance-method',
+            type=str,
+            default=EngineArgs.spec_decoding_acceptance_method,
+            choices=['rejection_sampler', 'typical_acceptance_sampler'],
+            help='Specify the acceptance method to use during draft token '
+            'verification in speculative decoding. Two types of acceptance '
+            'routines are supported: '
+            '1) RejectionSampler which does not allow changing the '
+            'acceptance rate of draft tokens, '
+            '2) TypicalAcceptanceSampler which is configurable, allowing for '
+            'a higher acceptance rate at the cost of lower quality, '
+            'and vice versa.')
+
+    # LoRA related configs
+    parser.add_argument('--enable-lora',
+                            action='store_true',
+                            help='If True, enable handling of LoRA adapters.')
+    
+    parser.add_argument('--max-lora-rank',
+                            type=int,
+                            default=EngineArgs.max_lora_rank,
+                            help='Max LoRA rank.')
+    parser.add_argument('--merge-lora',
+                        type=bool,
+                        default=False,
+                        help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
+    
+    parser.add_argument(
+            '--lora-extra-vocab-size',
+            type=int,
+            default=EngineArgs.lora_extra_vocab_size,
+            help=('Maximum size of extra vocabulary that can be '
+                  'present in a LoRA adapter (added to the base '
+                  'model vocabulary).'))
+    
+    parser.add_argument('--lora-target-modules',
+                            nargs='*',
+                            default=None,
+                            help='List of lora module name, If not specified, modules will be chosen according to the model architecture.')
+    
+    parser.add_argument(
+            '--num-speculative-heads',
+            type=int,
+            default=EngineArgs.num_speculative_heads,
+            help='The number of speculative heads to sample from '
+                 'the draft model in speculative decoding.')
+    
+    parser.add_argument(
+            '--num-speculative-tokens',
+            type=int,
+            default=EngineArgs.num_speculative_tokens,
+            help='The number of speculative tokens to sample from '
+            'the draft model in speculative decoding.')
+    
+    parser.add_argument(
+            '--lora-modules',
+            type=nullable_str,
+            default=None,
+            help=
+            'Path of lora model.')
+    
+
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+
+    main(args)
\ No newline at end of file
--- a/examples/medusa/medusa_weight_converter.py
+++ b/examples/medusa/medusa_weight_converter.py
+import os
+import ast
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple, Union
+from addict import Dict
+import yaml
+import argparse
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+from transformers import PretrainedConfig
+from safetensors.torch import save_model, safe_open
+
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.utils import set_weight_attrs
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.weight'
+TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE = 'medusa_head.{}.1.weight'
+TRAINED_BLOCK_BIAS_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.bias'
+
+VLLM_BLOCK_WEIGHT_NAME_TEMPLATE = 'blocks.{}.layers.{}.weight'
+VLLM_BLOCK_BIAS_NAME_TEMPLATE = 'blocks.{}.layers.{}.bias'
+VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE = 'lm_heads.{}.weight'
+
+
+def default_weight_loader(param: torch.Tensor,
+                          loaded_weight: torch.Tensor) -> None:
+    """Default weight loader."""
+    assert param.size() == loaded_weight.size()
+    param.data.copy_(loaded_weight)
+
+def pad_vocab_size(vocab_size: int,
+                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size to the given value."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+class MedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(self,
+                 hidden_size: int = 4096,
+                 vocab_size: int = 32001,
+                 num_heads: int = 5,
+                 num_hidden_layers: int = 1,
+                 max_paths: int = 64,
+                 topk: int = 10,
+                 truncated_vocab_size: Optional[int] = None,
+                 **kwargs):
+
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.max_paths = max_paths
+        self.topk = topk
+        self.max_seq_len = int(2**20)
+        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
+            else truncated_vocab_size
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["MedusaModel"]
+
+        super().__init__(**kwargs)
+
+    @property
+    def num_attention_heads(self):
+        return 0
+
+    @property
+    def num_lookahead_tokens(self):
+        return self.num_heads
+
+    @num_lookahead_tokens.setter
+    def num_lookahead_tokens(self, num_lookahead_tokens: int):
+        self.num_heads = num_lookahead_tokens
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1015 |  -1  | ... |  -1  |
+                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1000 | ... | 1015 |  -1  | ... |  -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  520 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+
+        self.num_embeddings = num_embeddings
+        self.padding_size = padding_size
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
+                                                    self.padding_size)
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings,
+            self.padding_size)
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.embedding_dim = embedding_dim
+
+        linear_method = None
+        if quant_config is not None:
+            linear_method = quant_config.get_quant_method(self, prefix=prefix)
+        if linear_method is None:
+            linear_method = UnquantizedLinearMethod()
+        self.linear_method: QuantizeMethodBase = linear_method
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        self.linear_method.create_weights(self,
+                                          self.embedding_dim,
+                                          [self.num_embeddings_padded],
+                                          self.embedding_dim,
+                                          self.num_embeddings_padded,
+                                          params_dtype=params_dtype,
+                                          weight_loader=self.weight_loader)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        assert param.data.shape == loaded_weight.shape
+        param.data.copy_(loaded_weight)
+
+    def forward(self, input_):
+        masked_input = input_
+        # Get the embeddings.
+        output = F.embedding(masked_input.long(), self.weight)
+        return output
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 bias: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__(num_embeddings, embedding_dim, params_dtype,
+                         org_num_embeddings, padding_size, quant_config,
+                         prefix)
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
+
+
+class ResidualBlock(nn.Module):
+
+    def __init__(self, hidden_size: int, num_layers: int) -> None:
+        super().__init__()
+
+        self.layers = nn.ModuleList([
+            nn.Linear(hidden_size, hidden_size)
+            for _ in range(num_layers)
+        ])
+        self.act = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for layer in self.layers:
+            x = x + self.act(layer(x))
+        return x
+
+class Medusa(nn.Module):
+
+    def __init__(self, config: MedusaConfig, **_) -> None:
+        super().__init__()
+        self.config = config
+        self.blocks = nn.ModuleList([
+            ResidualBlock(hidden_size=self.config.hidden_size,
+                          num_layers=self.config.num_hidden_layers)
+            for _ in range(self.config.num_heads)
+        ])
+        self.orig_vocab_size = config.vocab_size
+        self.truncated_vocab_size = config.truncated_vocab_size
+        self.unpadded_vocab_size = self.truncated_vocab_size
+
+        self.lm_heads = nn.ModuleList([
+            ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=self.truncated_vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            ) for _ in range(self.config.num_heads)
+        ]) 
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+
+        self.token_map = None
+
+    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
+        return [block(hidden_states) for block in self.blocks]
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        weights_map = {}
+
+        for name, loaded_weight in weights:
+            name = name.replace("medusa_heads.", "")
+
+            if name == "token_map":
+                if self.truncated_vocab_size < self.orig_vocab_size:
+                    self.token_map = nn.Parameter(loaded_weight,
+                                                  requires_grad=False)
+            elif name in params_dict:
+                weights_map[name] = loaded_weight
+
+        for name, loaded_weight in weights_map.items():
+            if "lm_head" in name and self.token_map is not None and\
+                loaded_weight.shape[0] > self.token_map.shape[0]:
+
+                loaded_weight = loaded_weight[self.token_map]
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        if self.token_map is not None:
+            self.token_map.to(device=self.lm_heads[0].weight.device)
+
+        assert (self.truncated_vocab_size
+                == self.orig_vocab_size) or (self.token_map is not None)
+
+class CustomMedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(self,
+                 name_or_path: str = "S-3000/vllm-medusa-qwen1.5-7b-chat",
+                 architectures: list[str] = ["MedusaModel"],
+                 hidden_size: int = 4096,
+                 model_type: str = "medusa",
+                 num_heads: int = 5,
+                 num_hidden_layers: int = 1,
+                 transformers_version: str = "4.41.2",
+                 truncated_vocab_size: Optional[int] = None,
+                 vocab_size: int = 151936,
+                 medusa_choices:List[List[int]] = None,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self._name_or_path = name_or_path
+        self.architectures = architectures
+        self.hidden_size = hidden_size
+        self.model_type = model_type
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.transformers_version = transformers_version
+        self.truncated_vocab_size = truncated_vocab_size
+        self.vocab_size = vocab_size
+        self.medusa_choices = medusa_choices
+
+
+def main(args):
+    medusa_head_num = args.medusa_num_heads
+    medusa_num_layers = args.medusa_num_layers
+
+    config = MedusaConfig(hidden_size=args.hidden_size, vocab_size=args.vocab_size, num_heads=medusa_head_num)
+    medusa_model = Medusa(config)
+
+    params_dict = dict(medusa_model.named_parameters())
+
+    trained_medusa_model = torch.load(args.medusa_model_path)
+
+    for i in range(medusa_head_num):
+        vllm_medusa_head_weight_name = VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE.format(i)
+        trained_medusa_head_weight_name = TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE.format(i)
+
+        vllm_medusa_head_param = params_dict[vllm_medusa_head_weight_name]
+        trained_medusa_head_param = trained_medusa_model[trained_medusa_head_weight_name]
+        weight_loader = getattr(vllm_medusa_head_param, "weight_loader",
+                                    default_weight_loader)
+        weight_loader(vllm_medusa_head_param, trained_medusa_head_param)
+
+    for i in range(medusa_head_num):
+        for j in range(medusa_num_layers):
+            # load linear weight
+            vllm_medusa_block_weight_name = VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j)
+            trained_medusa_block_weight_name = TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j)
+
+            vllm_medusa_block_param = params_dict[vllm_medusa_block_weight_name]
+            trained_medusa_block_param = trained_medusa_model[trained_medusa_block_weight_name]
+
+            weight_loader = getattr(vllm_medusa_block_param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(vllm_medusa_block_param, trained_medusa_block_param)
+
+            # load linear bias
+            vllm_medusa_block_bias_name = VLLM_BLOCK_BIAS_NAME_TEMPLATE.format(i, j)
+            trained_medusa_block_bias_name = TRAINED_BLOCK_BIAS_NAME_TEMPLATE.format(i, j)
+
+            vllm_medusa_block_bias_param = params_dict[vllm_medusa_block_bias_name]
+            trained_medusa_block_bias_param = trained_medusa_model[trained_medusa_block_bias_name]
+
+            weight_loader = getattr(vllm_medusa_block_bias_param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(vllm_medusa_block_bias_param, trained_medusa_block_bias_param)
+    
+
+    if not Path(args.output_dir).is_dir():
+        os.makedirs(args.output_dir, exist_ok=True)
+    save_model(medusa_model, os.path.join(args.output_dir, "model.safetensors"))
+    
+    medusa_choices = ast.literal_eval(args.medusa_choices) if args.medusa_choices is not None else None
+    to_save_config = CustomMedusaConfig(name_or_path=os.path.join(args.output_dir, "config.json"),
+                                        hidden_size=args.hidden_size,
+                                        num_heads=medusa_head_num,
+                                        num_hidden_layers=medusa_num_layers,
+                                        vocab_size=args.vocab_size,
+                                        medusa_choices=medusa_choices)
+    to_save_config.save_pretrained(args.output_dir)
+
+    # validate weight
+    # with safe_open(os.path.join(args.output_dir, "model.safetensors"), framework="pt") as f:
+    #     param = f.get_tensor(VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0))
+    #     trained_param = trained_medusa_model[TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0)]
+    #     mse_value = torch.nn.functional.mse_loss(param.cpu(), trained_param.cpu())
+    #     print("weight mes:", mse_value)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Medusa Model Evaluator")
+    parser.add_argument("--medusa_model_path", type=str, required=True,
+                        help="Path to the medusa model file.")
+    parser.add_argument("--vocab_size", type=int, required=True,
+                        help="Vocab size")
+    parser.add_argument("--medusa_num_heads", type=int, required=True,
+                        help="Number of Medusa heads")
+    parser.add_argument("--medusa_num_layers", type=int, required=True,
+                        help="Number of Medusa layers")
+    parser.add_argument("--hidden_size", type=int, required=True,
+                        help="Hidden size")
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="Output dir")
+    parser.add_argument(
+        '--medusa_choices',
+        type=str,
+        default=None,
+        help="Medusa choice to use, if not none, will use Medusa decoding."
+        "   E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens."
+    )
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_chat_with_tools.py
+++ b/examples/offline_chat_with_tools.py
+# ruff: noqa
+import json
+import random
+import string
+
+from vllm import LLM
+from vllm.sampling_params import SamplingParams
+
+# This script is an offline demo for function calling
+#
+# If you want to run a server/client setup, please follow this code:
+#
+# - Server:
+#
+# ```bash
+# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral
+# ```
+#
+# - Client:
+#
+# ```bash
+# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
+# --header 'Content-Type: application/json' \
+# --header 'Authorization: Bearer token' \
+# --data '{
+#     "model": "mistralai/Mistral-7B-Instruct-v0.3"
+#     "messages": [
+#       {
+#         "role": "user",
+#         "content": [
+#             {"type" : "text", "text": "Describe this image in detail please."},
+#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
+#             {"type" : "text", "text": "and this one as well. Answer in French."},
+#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
+#         ]
+#       }
+#     ]
+#   }'
+# ```
+#
+# Usage:
+#     python demo.py simple
+#     python demo.py advanced
+
+model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+# or switch to "mistralai/Mistral-Nemo-Instruct-2407"
+# or "mistralai/Mistral-Large-Instruct-2407"
+# or any other mistral model with function calling ability
+
+sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
+llm = LLM(model=model_name,
+          tokenizer_mode="mistral",
+          config_format="mistral",
+          load_format="mistral")
+
+
+def generate_random_id(length=9):
+    characters = string.ascii_letters + string.digits
+    random_id = ''.join(random.choice(characters) for _ in range(length))
+    return random_id
+
+
+# simulate an API that can be called
+def get_current_weather(city: str, state: str, unit: 'str'):
+    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+            "partly cloudly, with highs in the 90's.")
+
+
+tool_funtions = {"get_current_weather": get_current_weather}
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state that the city is"
+                    " in, e.g. 'CA' which would mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            },
+            "required": ["city", "state", "unit"]
+        }
+    }
+}]
+
+messages = [{
+    "role":
+    "user",
+    "content":
+    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+}]
+
+outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
+output = outputs[0].outputs[0].text.strip()
+
+# append the assistant message
+messages.append({
+    "role": "assistant",
+    "content": output,
+})
+
+# let's now actually parse and execute the model's output simulating an API call by using the
+# above defined function
+tool_calls = json.loads(output)
+tool_answers = [
+    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+]
+
+# append the answer as a tool message and let the LLM give you an answer
+messages.append({
+    "role": "tool",
+    "content": "\n\n".join(tool_answers),
+    "tool_call_id": generate_random_id(),
+})
+
+outputs = llm.chat(messages, sampling_params, tools=tools)
+
+print(outputs[0].outputs[0].text.strip())
+# yields
+#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
+#   'It is partly cloudly, with highs in the 90's.'
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
+"""
+This example shows how to use vLLM for running offline inference 
+with the correct prompt format on audio language models.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.utils import FlexibleArgumentParser
+
+audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+question_per_audio_count = [
+    "What is recited in the audio?",
+    "What sport and what nursery rhyme are referenced?"
+]
+
+
+# Ultravox 0.3
+def run_ultravox(question, audio_count):
+    model_name = "fixie-ai/ultravox-v0_3"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [{
+        'role':
+        'user',
+        'content':
+        "<|reserved_special_token_0|>\n" * audio_count + question
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    llm = LLM(model=model_name,
+              enforce_eager=True,
+              enable_chunked_prefill=False,
+              max_model_len=8192,
+              limit_mm_per_prompt={"audio": audio_count})
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "ultravox": run_ultravox,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    audio_count = args.num_audios
+    llm, prompt, stop_token_ids = model_example_map[model](
+        question_per_audio_count[audio_count - 1], audio_count)
+
+    # We set temperature to 0.2 so that outputs can be different
+    # even when all prompts are identical when running batch inference.
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=64,
+                                     stop_token_ids=stop_token_ids)
+
+    assert args.num_prompts > 0
+    inputs = {
+        "prompt": prompt,
+        "multi_modal_data": {
+            "audio": [
+                asset.audio_and_sample_rate
+                for asset in audio_assets[:audio_count]
+            ]
+        },
+    }
+    if args.num_prompts > 1:
+        # Batch inference
+        inputs = [inputs] * args.num_prompts
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'audio language models')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="ultravox",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to run.')
+    parser.add_argument("--num-audios",
+                        type=int,
+                        default=1,
+                        choices=[1, 2],
+                        help="Number of audio items per prompt.")
+
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_inference_chat.py
+++ b/examples/offline_inference_chat.py
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+sampling_params = SamplingParams(temperature=0.5)
+
+
+def print_outputs(outputs):
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print("-" * 80)
+
+
+print("=" * 80)
+
+# In this script, we demonstrate how to pass input to the chat method:
+
+conversation = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant"
+    },
+    {
+        "role": "user",
+        "content": "Hello"
+    },
+    {
+        "role": "assistant",
+        "content": "Hello! How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": "Write an essay about the importance of higher education.",
+    },
+]
+outputs = llm.chat(conversation,
+                   sampling_params=sampling_params,
+                   use_tqdm=False)
+print_outputs(outputs)
+
+# You can run batch inference with llm.chat API
+conversation = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant"
+    },
+    {
+        "role": "user",
+        "content": "Hello"
+    },
+    {
+        "role": "assistant",
+        "content": "Hello! How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": "Write an essay about the importance of higher education.",
+    },
+]
+conversations = [conversation for _ in range(10)]
+
+# We turn on tqdm progress bar to verify it's indeed running batch inference
+outputs = llm.chat(messages=conversations,
+                   sampling_params=sampling_params,
+                   use_tqdm=True)
+print_outputs(outputs)
+
+# A chat template can be optionally supplied.
+# If not, the model will use its default chat template.
+
+# with open('template_falcon_180b.jinja', "r") as f:
+#     chat_template = f.read()
+
+# outputs = llm.chat(
+#     conversations,
+#     sampling_params=sampling_params,
+#     use_tqdm=False,
+#     chat_template=chat_template,
+# )
--- a/examples/offline_inference_encoder_decoder.py
+++ b/examples/offline_inference_encoder_decoder.py
+'''
+Demonstrate prompting of text-to-text
+encoder/decoder models, specifically BART
+'''
+
+from vllm import LLM, SamplingParams
+from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
+                         TokensPrompt, zip_enc_dec_prompts)
+
+dtype = "float"
+
+# Create a BART encoder/decoder model instance
+llm = LLM(
+    model="facebook/bart-large-cnn",
+    dtype=dtype,
+)
+
+# Get BART tokenizer
+tokenizer = llm.llm_engine.get_tokenizer_group()
+
+# Test prompts
+#
+# This section shows all of the valid ways to prompt an
+# encoder/decoder model.
+#
+# - Helpers for building prompts
+text_prompt_raw = "Hello, my name is"
+text_prompt = TextPrompt(prompt="The president of the United States is")
+tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
+    prompt="The capital of France is"))
+# - Pass a single prompt to encoder/decoder model
+#   (implicitly encoder input prompt);
+#   decoder input prompt is assumed to be None
+
+single_text_prompt_raw = text_prompt_raw  # Pass a string directly
+single_text_prompt = text_prompt  # Pass a TextPrompt
+single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt
+
+# - Pass explicit encoder and decoder input prompts within one data structure.
+#   Encoder and decoder prompts can both independently be text or tokens, with
+#   no requirement that they be the same prompt type. Some example prompt-type
+#   combinations are shown below, note that these are not exhaustive.
+
+enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
+    # Pass encoder prompt string directly, &
+    # pass decoder prompt tokens
+    encoder_prompt=single_text_prompt_raw,
+    decoder_prompt=single_tokens_prompt,
+)
+enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
+    # Pass TextPrompt to encoder, and
+    # pass decoder prompt string directly
+    encoder_prompt=single_text_prompt,
+    decoder_prompt=single_text_prompt_raw,
+)
+enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
+    # Pass encoder prompt tokens directly, and
+    # pass TextPrompt to decoder
+    encoder_prompt=single_tokens_prompt,
+    decoder_prompt=single_text_prompt,
+)
+
+# - Finally, here's a useful helper function for zipping encoder and
+#   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
+#   instances
+zipped_prompt_list = zip_enc_dec_prompts(
+    ['An encoder prompt', 'Another encoder prompt'],
+    ['A decoder prompt', 'Another decoder prompt'])
+
+# - Let's put all of the above example prompts together into one list
+#   which we will pass to the encoder/decoder LLM.
+prompts = [
+    single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
+    enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
+] + zipped_prompt_list
+
+print(prompts)
+
+# Create a sampling params object.
+sampling_params = SamplingParams(
+    temperature=0,
+    top_p=1.0,
+    min_tokens=0,
+    max_tokens=20,
+)
+
+# Generate output tokens from the prompts. The output is a list of
+# RequestOutput objects that contain the prompt, generated
+# text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    encoder_prompt = output.encoder_prompt
+    generated_text = output.outputs[0].text
+    print(f"Encoder prompt: {encoder_prompt!r}, "
+          f"Decoder prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
+import gc
+import time
+from typing import List
+
+from vllm import LLM, SamplingParams
+
+
+def time_generation(llm: LLM, prompts: List[str],
+                    sampling_params: SamplingParams):
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    # Warmup first
+    llm.generate(prompts, sampling_params)
+    llm.generate(prompts, sampling_params)
+    start = time.time()
+    outputs = llm.generate(prompts, sampling_params)
+    end = time.time()
+    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    # Print the outputs.
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+
+    template = (
+        "Below is an instruction that describes a task. Write a response "
+        "that appropriately completes the request.\n\n### Instruction:\n{}"
+        "\n\n### Response:\n")
+
+    # Sample prompts.
+    prompts = [
+        "Write about the president of the United States.",
+    ]
+    prompts = [template.format(prompt) for prompt in prompts]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)
+
+    # Create an LLM without spec decoding
+    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
+
+    print("Without speculation")
+    time_generation(llm, prompts, sampling_params)
+
+    del llm
+    gc.collect()
+
+    # Create an LLM with spec decoding
+    llm = LLM(
+        model="meta-llama/Llama-2-13b-chat-hf",
+        speculative_model="ibm-fms/llama-13b-accelerator",
+        # These are currently required for MLPSpeculator decoding
+        use_v2_block_manager=True,
+    )
+
+    print("With speculation")
+    time_generation(llm, prompts, sampling_params)
--- a/examples/offline_inference_neuron_int8_quantization.py
+++ b/examples/offline_inference_neuron_int8_quantization.py
+import os
+
+from vllm import LLM, SamplingParams
+
+# creates XLA hlo graphs for all the context length buckets.
+os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+# creates XLA hlo graphs for all the token gen buckets.
+os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+# Quantizes neuron model weight to int8 ,
+# The default config for quantization is int8 dtype.
+os.environ['NEURON_QUANT_DTYPE'] = "s8"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(
+    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    max_num_seqs=8,
+    # The max_model_len and block_size arguments are required to be same as
+    # max sequence length when targeting neuron device.
+    # Currently, this is a known limitation in continuous batching support
+    # in transformers-neuronx.
+    # TODO(liangfu): Support paged-attention in transformers-neuronx.
+    max_model_len=2048,
+    block_size=2048,
+    # The device can be automatically detected when AWS Neuron SDK is installed.
+    # The device argument can be either unspecified for automated detection,
+    # or explicitly assigned.
+    device="neuron",
+    quantization="neuron_quant",
+    override_neuron_config={
+        "cast_logits_dtype": "bfloat16",
+    },
+    tensor_parallel_size=2)
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference_pixtral.py
+++ b/examples/offline_inference_pixtral.py
+# ruff: noqa
+import argparse
+
+from vllm import LLM
+from vllm.sampling_params import SamplingParams
+
+# This script is an offline demo for running Pixtral.
+#
+# If you want to run a server/client setup, please follow this code:
+#
+# - Server:
+#
+# ```bash
+# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# ```
+#
+# - Client:
+#
+# ```bash
+# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
+# --header 'Content-Type: application/json' \
+# --header 'Authorization: Bearer token' \
+# --data '{
+#     "model": "mistralai/Pixtral-12B-2409",
+#     "messages": [
+#       {
+#         "role": "user",
+#         "content": [
+#             {"type" : "text", "text": "Describe this image in detail please."},
+#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
+#             {"type" : "text", "text": "and this one as well. Answer in French."},
+#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
+#         ]
+#       }
+#     ]
+#   }'
+# ```
+#
+# Usage:
+#     python demo.py simple
+#     python demo.py advanced
+
+
+def run_simple_demo():
+    model_name = "mistralai/Pixtral-12B-2409"
+    sampling_params = SamplingParams(max_tokens=8192)
+
+    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
+    llm = LLM(model=model_name, tokenizer_mode="mistral")
+
+    prompt = "Describe this image in one sentence."
+    image_url = "https://picsum.photos/id/237/200/300"
+
+    messages = [
+        {
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+            ],
+        },
+    ]
+    outputs = llm.chat(messages, sampling_params=sampling_params)
+
+    print(outputs[0].outputs[0].text)
+
+
+def run_advanced_demo():
+    model_name = "mistralai/Pixtral-12B-2409"
+    max_img_per_msg = 5
+    max_tokens_per_img = 4096
+
+    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral",
+        limit_mm_per_prompt={"image": max_img_per_msg},
+        max_model_len=max_img_per_msg * max_tokens_per_img,
+    )
+
+    prompt = "Describe the following image."
+
+    url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
+    url_2 = "https://picsum.photos/seed/picsum/200/300"
+    url_3 = "https://picsum.photos/id/32/512/512"
+
+    messages = [
+        {
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_1
+                    }
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_2
+                    }
+                },
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": "The images show nature.",
+        },
+        {
+            "role": "user",
+            "content": "More details please and answer only in French!.",
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_3
+                    }
+                },
+            ],
+        },
+    ]
+
+    outputs = llm.chat(messages=messages, sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run a demo in simple or advanced mode.")
+
+    parser.add_argument(
+        "mode",
+        choices=["simple", "advanced"],
+        help="Specify the demo mode: 'simple' or 'advanced'",
+    )
+
+    args = parser.parse_args()
+
+    if args.mode == "simple":
+        print("Running simple demo...")
+        run_simple_demo()
+    elif args.mode == "advanced":
+        print("Running advanced demo...")
+        run_advanced_demo()
+
+
+if __name__ == "__main__":
+    main()