Adding Idefics multi modal model. (#842)

Co-Authored-By: Victor Sanh <victorsanh@gmail.com> # What does this PR do?   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation ). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: Victor Sanh <victorsanh@gmail.com>

Adding Idefics multi modal model. (#842)
Co-Authored-By: Victor Sanh <victorsanh@gmail.com> # What does this PR do?   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation ). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: Victor Sanh <victorsanh@gmail.com>
bce5e224 · Nicolas Patry · GitHub · b9e33c49 · bce5e224 · bce5e224
Unverified Commit bce5e224 authored Aug 17, 2023 by Nicolas Patry Committed by GitHub Aug 17, 2023
17 changed files
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4911,
+        "logprob": -5.3632812,
+        "text": "User"
+      },
+      {
+        "id": 29901,
+        "logprob": -0.00762558,
+        "text": ":"
+      },
+      {
+        "id": 32000,
+        "logprob": -0.7739258,
+        "text": "<fake_token_around_image>"
+      },
+      {
+        "id": 32001,
+        "logprob": -9.775162e-05,
+        "text": "<image>"
+      },
+      {
+        "id": 32000,
+        "logprob": -1.1920929e-07,
+        "text": "<fake_token_around_image>"
+      },
+      {
+        "id": 1815,
+        "logprob": -4.4140625,
+        "text": "Can"
+      },
+      {
+        "id": 366,
+        "logprob": -0.01436615,
+        "text": "you"
+      },
+      {
+        "id": 2649,
+        "logprob": -4.9414062,
+        "text": "tell"
+      },
+      {
+        "id": 592,
+        "logprob": -0.3005371,
+        "text": "me"
+      },
+      {
+        "id": 263,
+        "logprob": -3.5703125,
+        "text": "a"
+      },
+      {
+        "id": 1407,
+        "logprob": -9.4296875,
+        "text": "very"
+      },
+      {
+        "id": 3273,
+        "logprob": -1.9111328,
+        "text": "short"
+      },
+      {
+        "id": 5828,
+        "logprob": -0.28881836,
+        "text": "story"
+      },
+      {
+        "id": 2729,
+        "logprob": -3.4179688,
+        "text": "based"
+      },
+      {
+        "id": 373,
+        "logprob": -0.00056886673,
+        "text": "on"
+      },
+      {
+        "id": 278,
+        "logprob": -0.14123535,
+        "text": "the"
+      },
+      {
+        "id": 1967,
+        "logprob": -0.053985596,
+        "text": "image"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.15771484,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 32002,
+        "logprob": -0.004295349,
+        "special": true,
+        "text": "<end_of_utterance>"
+      },
+      {
+        "id": 29871,
+        "logprob": -7.43866e-05,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 13,
+        "logprob": -2.3126602e-05,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 7900,
+        "logprob": -3.9339066e-06,
+        "special": false,
+        "text": "Ass"
+      },
+      {
+        "id": 22137,
+        "logprob": 0.0,
+        "special": false,
+        "text": "istant"
+      },
+      {
+        "id": 29901,
+        "logprob": -2.6226044e-06,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 319,
+        "logprob": -0.87841797,
+        "special": false,
+        "text": " A"
+      },
+      {
+        "id": 521,
+        "logprob": -1.3837891,
+        "special": false,
+        "text": " ch"
+      },
+      {
+        "id": 21475,
+        "logprob": -0.00051641464,
+        "special": false,
+        "text": "icken"
+      },
+      {
+        "id": 338,
+        "logprob": -1.1435547,
+        "special": false,
+        "text": " is"
+      }
+    ]
+  },
+  "generated_text": "\nAssistant: A chicken is"
+}
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -5.3476562,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.0075531006,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.7729492,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.787083e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -1.1920929e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.4296875,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.01424408,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.9335938,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2993164,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -3.5664062,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.4453125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9306641,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.2836914,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.4179688,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00056934357,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.13928223,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.05355835,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.15771484,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 32002,
+          "logprob": -0.004333496,
+          "special": true,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 29871,
+          "logprob": -7.426739e-05,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -2.348423e-05,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 7900,
+          "logprob": -3.9339066e-06,
+          "special": false,
+          "text": "Ass"
+        },
+        {
+          "id": 22137,
+          "logprob": 0.0,
+          "special": false,
+          "text": "istant"
+        },
+        {
+          "id": 29901,
+          "logprob": -2.861023e-06,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 319,
+          "logprob": -0.8828125,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 521,
+          "logprob": -1.3759766,
+          "special": false,
+          "text": " ch"
+        },
+        {
+          "id": 21475,
+          "logprob": -0.0005083084,
+          "special": false,
+          "text": "icken"
+        },
+        {
+          "id": 338,
+          "logprob": -1.1367188,
+          "special": false,
+          "text": " is"
+        }
+      ]
+    },
+    "generated_text": "\nAssistant: A chicken is"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -5.3476562,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.0075531006,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.7729492,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.787083e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -1.1920929e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.4296875,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.01423645,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.9335938,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2993164,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -3.5664062,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.4453125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9306641,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.2836914,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.4179688,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00056934357,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.13928223,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.05355835,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.15771484,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 32002,
+          "logprob": -0.004333496,
+          "special": true,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 29871,
+          "logprob": -7.4505806e-05,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -2.3722649e-05,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 7900,
+          "logprob": -3.9339066e-06,
+          "special": false,
+          "text": "Ass"
+        },
+        {
+          "id": 22137,
+          "logprob": 0.0,
+          "special": false,
+          "text": "istant"
+        },
+        {
+          "id": 29901,
+          "logprob": -2.861023e-06,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 319,
+          "logprob": -0.8828125,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 521,
+          "logprob": -1.3759766,
+          "special": false,
+          "text": " ch"
+        },
+        {
+          "id": 21475,
+          "logprob": -0.00050878525,
+          "special": false,
+          "text": "icken"
+        },
+        {
+          "id": 338,
+          "logprob": -1.1367188,
+          "special": false,
+          "text": " is"
+        }
+      ]
+    },
+    "generated_text": "\nAssistant: A chicken is"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -5.3476562,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.0075531006,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.7729492,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.775162e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -1.1920929e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.4296875,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.01424408,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.9335938,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.2993164,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -3.5664062,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.4453125,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9306641,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.2836914,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.4179688,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00056934357,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.13928223,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.05355835,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.15771484,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 32002,
+          "logprob": -0.004333496,
+          "special": true,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 29871,
+          "logprob": -7.43866e-05,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -2.360344e-05,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 7900,
+          "logprob": -3.9339066e-06,
+          "special": false,
+          "text": "Ass"
+        },
+        {
+          "id": 22137,
+          "logprob": 0.0,
+          "special": false,
+          "text": "istant"
+        },
+        {
+          "id": 29901,
+          "logprob": -2.7418137e-06,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 319,
+          "logprob": -0.8828125,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 521,
+          "logprob": -1.3759766,
+          "special": false,
+          "text": " ch"
+        },
+        {
+          "id": 21475,
+          "logprob": -0.00050878525,
+          "special": false,
+          "text": "icken"
+        },
+        {
+          "id": 338,
+          "logprob": -1.1367188,
+          "special": false,
+          "text": " is"
+        }
+      ]
+    },
+    "generated_text": "\nAssistant: A chicken is"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4911,
+          "logprob": -5.3632812,
+          "text": "User"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.00762558,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -0.7739258,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -9.775162e-05,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -1.1920929e-07,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 1815,
+          "logprob": -4.4140625,
+          "text": "Can"
+        },
+        {
+          "id": 366,
+          "logprob": -0.01436615,
+          "text": "you"
+        },
+        {
+          "id": 2649,
+          "logprob": -4.9414062,
+          "text": "tell"
+        },
+        {
+          "id": 592,
+          "logprob": -0.3005371,
+          "text": "me"
+        },
+        {
+          "id": 263,
+          "logprob": -3.5703125,
+          "text": "a"
+        },
+        {
+          "id": 1407,
+          "logprob": -9.4296875,
+          "text": "very"
+        },
+        {
+          "id": 3273,
+          "logprob": -1.9111328,
+          "text": "short"
+        },
+        {
+          "id": 5828,
+          "logprob": -0.28881836,
+          "text": "story"
+        },
+        {
+          "id": 2729,
+          "logprob": -3.4179688,
+          "text": "based"
+        },
+        {
+          "id": 373,
+          "logprob": -0.00056886673,
+          "text": "on"
+        },
+        {
+          "id": 278,
+          "logprob": -0.14123535,
+          "text": "the"
+        },
+        {
+          "id": 1967,
+          "logprob": -0.053985596,
+          "text": "image"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.15771484,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 32002,
+          "logprob": -0.004295349,
+          "special": true,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 29871,
+          "logprob": -7.43866e-05,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -2.3126602e-05,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 7900,
+          "logprob": -3.9339066e-06,
+          "special": false,
+          "text": "Ass"
+        },
+        {
+          "id": 22137,
+          "logprob": 0.0,
+          "special": false,
+          "text": "istant"
+        },
+        {
+          "id": 29901,
+          "logprob": -2.6226044e-06,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 319,
+          "logprob": -0.87841797,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 521,
+          "logprob": -1.3837891,
+          "special": false,
+          "text": " ch"
+        },
+        {
+          "id": 21475,
+          "logprob": -0.00051641464,
+          "special": false,
+          "text": "icken"
+        },
+        {
+          "id": 338,
+          "logprob": -1.1435547,
+          "special": false,
+          "text": " is"
+        }
+      ]
+    },
+    "generated_text": "\nAssistant: A chicken is"
+  }
+]
--- a/integration-tests/models/test_idefics.py
+++ b/integration-tests/models/test_idefics.py
+import pytest
+@pytest.fixture(scope="module")
+def idefics_handle(launcher):
+    with launcher(
+        "HuggingFaceM4/idefics-9b-instruct", num_shard=2
+    ) as handle:
+        yield handle
+@pytest.fixture(scope="module")
+async def idefics(idefics_handle):
+    await idefics_handle.health(300)
+    return idefics_handle.client
+@pytest.mark.asyncio
+async def test_idefics(idefics, response_snapshot):
+    response = await idefics.generate(
+        "User:![](https://temp-5681.s3.us-west-2.amazonaws.com/chicken_on_money.png)Can you tell me a very short story based on the image?",
+        max_new_tokens=10,
+        decoder_input_details=True,
+    )
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+@pytest.mark.asyncio
+async def test_idefics_load(idefics, generate_load, response_snapshot):
+    responses = await generate_load(
+        idefics,
+        "User:![](https://temp-5681.s3.us-west-2.amazonaws.com/chicken_on_money.png)Can you tell me a very short story based on the image?",
+        max_new_tokens=10,
+        n=4,
+    )
+    generated_texts = [r.generated_text for r in responses]
+    assert len(generated_texts) == 4
+    assert generated_texts, all(
+        [text == generated_texts[0] for text in generated_texts]
+    )
+    assert responses == response_snapshot
--- a/server/poetry.lock
+++ b/server/poetry.lock
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -31,8 +31,9 @@ einops = "^0.6.1"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
 peft = "^0.4.0"
-torch = {version = "^2.0.1+cu118", source = "pytorch-gpu-src"}
+torch = { version = "^2.0.1" }
 scipy = "^1.11.1"
+pillow = "^10.0.0"
 [tool.poetry.extras]
 accelerate = ["accelerate"]

--- a/server/requirements.txt
+++ b/server/requirements.txt
--extra-index-url https://download.pytorch.org/whl/cu118
 accelerate==0.20.3 ; python_version >= "3.9" and python_version < "3.13"
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
 certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.6 ; python_version >= "3.9" and python_version < "3.13"
-cmake==3.27.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
@@ -13,14 +10,13 @@ filelock==3.12.2 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.56.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.56.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.56.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
 huggingface-hub==0.14.1 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
 jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
-lit==16.0.6 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 markupsafe==2.1.3 ; python_version >= "3.9" and python_version < "3.13"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -37,10 +33,11 @@ opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
 packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
 peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
-protobuf==4.23.4 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.24.0 ; python_version >= "3.9" and python_version < "3.13"
 psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2023.6.3 ; python_version >= "3.9" and python_version < "3.13"
+regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.3.2 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.1 ; python_version >= "3.9" and python_version < "3.13"
@@ -48,10 +45,9 @@ sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==68.0.0 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
-torch==2.0.1+cu118 ; python_version >= "3.9" and python_version < "3.13"
+torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.65.0 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
 transformers==4.31.0 ; python_version >= "3.9" and python_version < "3.13"
-triton==2.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"

--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -54,6 +54,7 @@ try:
    from text_generation_server.models.flash_santacoder import (
        FlashSantacoderSharded,
    )
+    from text_generation_server.models.idefics import IDEFICSSharded
 except ImportError as e:
    logger.warning(f"Could not import Flash Attention enabled models: {e}")
@@ -64,6 +65,7 @@ if FLASH_ATTENTION:
    __all__.append(FlashRWSharded)
    __all__.append(FlashSantacoderSharded)
    __all__.append(FlashLlama)
+    __all__.append(IDEFICSSharded)
 def get_model(
@@ -248,6 +250,17 @@ def get_model(
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
+    elif model_type == "idefics":
+        if FLASH_ATTENTION:
+           return IDEFICSSharded(
+               model_id,
+               revision,
+               quantize=quantize,
+               dtype=dtype,
+               trust_remote_code=trust_remote_code,
+           )
+        else:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
    if sharded:
        raise ValueError("sharded is not supported for AutoModel")

--- a/server/text_generation_server/models/custom_modeling/idefics_config.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_config.py
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Idefics model configuration"""
+import copy
+from transformers import PretrainedConfig
+IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json",
+    "HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json",
+}
+class IdeficsVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
+    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Idefics-9B.
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_num_channels (`int`, *optional*, defaults to `3`):
+            Number of image channels.
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
+            testing).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+    model_type = "idefics"
+    attribute_map = {
+        "hidden_size": "embed_dim",
+    }
+    def __init__(
+        self,
+        embed_dim=768,
+        image_size=224,
+        intermediate_size=5120,
+        patch_size=14,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        num_channels=3,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.intermediate_size = intermediate_size
+        self.patch_size = patch_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+class IdeficsPerceiverConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
+    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Idefics-9B.
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        use_resampler (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the resampler
+        resampler_n_latents (`int`, *optional*, defaults to ):
+            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+        resampler_depth (`int`, *optional*, defaults to 6):
+            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+        resampler_n_heads (`int`, *optional*, defaults to 16):
+            Number of heads in each Transformer block (for multi-headed self-attention).
+        resampler_head_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of each head projection in the Transformer block.
+        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
+            Whether or not to use qk layer norms in perceiver
+    """
+    model_type = "idefics"
+    def __init__(
+        self,
+        use_resampler=False,
+        resampler_n_latents=64,
+        resampler_depth=6,
+        resampler_n_heads=16,
+        resampler_head_dim=96,
+        qk_layer_norms_perceiver=False,
+        **kwargs,
+    ):
+        self.use_resampler = use_resampler
+        self.resampler_n_latents = resampler_n_latents
+        self.resampler_depth = resampler_depth
+        self.resampler_n_heads = resampler_n_heads
+        self.resampler_head_dim = resampler_head_dim
+        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver
+        super().__init__(**kwargs)
+class IdeficsConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
+    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Idefics-9B.
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        additional_vocab_size (`int`, *optional`, defaults to 0):
+            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
+            are always trainable whereas regular vocab tokens can be frozen or not.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~IdeficsModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
+            Initialization type for the alphas.
+        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
+            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
+            Attention.
+        alpha_type (`str`, *optional*, defaults to `"float"`):
+            Whether the gating alphas should be vectors or single floats.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0)
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1)
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2)
+            End of stream token id.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        cross_layer_interval (`int`, *optional*, default to 1)
+            Interval for cross attention (from text to image) layers.
+        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
+        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
+        freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
+            Exceptions to freezing text layers when `freeze_text_layers` is `True`
+        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
+        freeze_vision_layers (`bool`, *optional*, defaults to `True`):  Whether to freeze vision layers
+        freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
+            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
+        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
+        vision_config (`IdeficsVisionConfig`,  *optional*): Custom vision config or dict
+        perceiver_config (`IdeficsPerceiverConfig`,  *optional*): Custom perceiver config or dict
+    Example:
+    ```python
+    >>> from transformers import IdeficsModel, IdeficsConfig
+    >>> # Initializing a Idefics idefics-9b style configuration
+    >>> configuration = IdeficsConfig()
+    >>> # Initializing a model from the idefics-9b style configuration
+    >>> model = IdeficsModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "idefics"
+    is_composition = True
+    def __init__(
+        self,
+        vocab_size=32000,
+        additional_vocab_size=0,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        dropout=0.0,
+        hidden_act="silu",
+        initializer_range=0.02,
+        alpha_initializer="zeros",
+        alphas_initializer_range=0.0,
+        alpha_type="float",
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        cross_layer_interval=1,
+        qk_layer_norms=False,
+        freeze_text_layers=True,
+        freeze_text_module_exceptions=[],
+        freeze_lm_head=False,
+        freeze_vision_layers=True,
+        freeze_vision_module_exceptions=[],
+        use_resampler=False,
+        vision_config=None,
+        perceiver_config=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.dropout = dropout
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.alpha_initializer = alpha_initializer
+        self.alphas_initializer_range = alphas_initializer_range
+        self.alpha_type = alpha_type
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.cross_layer_interval = cross_layer_interval
+        self.qk_layer_norms = qk_layer_norms
+        self.freeze_vision_layers = freeze_vision_layers
+        self.freeze_text_layers = freeze_text_layers
+        self.freeze_text_module_exceptions = freeze_text_module_exceptions
+        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
+        self.freeze_lm_head = freeze_lm_head
+        self.use_resampler = use_resampler
+        if perceiver_config is None:
+            self.perceiver_config = IdeficsPerceiverConfig()
+        elif isinstance(perceiver_config, dict):
+            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
+            self.perceiver_config = perceiver_config
+        if vision_config is None:
+            self.vision_config = IdeficsVisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = IdeficsVisionConfig(**vision_config)
+        elif isinstance(vision_config, IdeficsVisionConfig):
+            self.vision_config = vision_config
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
+        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
+        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
+        # of this object many attributes have default values and haven't yet been overridden.
+        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["vision_config"] = self.vision_config.to_dict()
+        output["perceiver_config"] = self.perceiver_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
--- a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Idefics."""
+from typing import Callable, Dict, List, Optional, Union, Iterable
+import numpy as np
+from PIL import Image
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_transforms import resize, to_channel_dimension_format, rescale, normalize
+from transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from io import BytesIO
+import requests
+from transformers import TensorType, is_torch_available
+IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
+IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+class IdeficsImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Idefics image processor.
+    Args:
+        image_size (`int`, *optional*, defaults to `224`):
+            Resize to image size
+        image_num_channels (`int`, *optional*, defaults to `3`):
+            Number of image channels.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        image_size: int = 224,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        image_num_channels: Optional[int] = 3,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.image_num_channels = image_num_channels
+        self.image_mean = image_mean
+        self.image_std = image_std
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_num_channels: Optional[int] = 3,
+        image_size: Optional[Dict[str, int]] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        transform: Callable = None,
+        **kwargs,
+    ) -> TensorType.PYTORCH:
+        """
+        Preprocess a batch of images.
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            image_size (`int`, *optional*, defaults to `self.image_size`):
+                Resize to image size
+            image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
+                Number of image channels.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+                channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
+                be overridden by the `image_mean` parameter in the `preprocess` method.
+            image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+                Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+                number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+                method. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            transform (`Callable`, *optional*, defaults to `None`):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
+                assumed - and then a preset of inference-specific transforms will be applied to the images
+        Returns:
+            a PyTorch tensor of the processed images
+        """
+        image_size = image_size if image_size is not None else self.image_size
+        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        size = (image_size, image_size)
+        if len(images) == 0:
+            return []
+        images = make_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # For training a user needs to pass their own set of transforms as a Callable.
+        # For reference this is what was used in the original IDEFICS training:
+        # transform = transforms.Compose([
+        #     convert_to_rgb,
+        #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
+        #     transforms.ToTensor(),
+        #     transforms.Normalize(mean=image_mean, std=image_std),
+        # ])
+        if transform is not None:
+            if not is_torch_available():
+                raise ImportError("To pass in `transform` torch must be installed")
+            import torch
+            images = [transform(x) for x in images]
+            return torch.stack(images)
+        # for inference we do the exact transforms that were used to train IDEFICS
+        images = [convert_to_rgb(x) for x in images]
+        # further transforms expect numpy arrays
+        images = [to_numpy_array(x) for x in images]
+        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
+        images = [self.rescale(image=image, scale=1 / 255) for image in images]
+        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
+        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
+        images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
+        return images
+    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
+        """
+        Convert a single or a list of urls into the corresponding `PIL.Image` objects.
+        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
+        returned.
+        """
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
+                " Safari/537.36"
+            )
+        }
+        if isinstance(image_url_or_urls, list):
+            return [self.fetch_images(x) for x in image_url_or_urls]
+        elif isinstance(image_url_or_urls, str):
+            response = requests.get(image_url_or_urls, stream=True, headers=headers)
+            response.raise_for_status()
+            return Image.open(BytesIO(response.content))
+        else:
+            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+    def rescale(
+        self,
+        image: np.ndarray,
+        scale: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Rescale an image by a scale factor. image = image * scale.
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            scale (`float`):
+                The scaling factor to rescale pixel values by.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        Returns:
+            `np.ndarray`: The rescaled image.
+        """
+        # return rescale(image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs)
+        # requires 4.32
+        return rescale(image, scale=scale, data_format=data_format, **kwargs)
+    def normalize(
+        self,
+        image: np.ndarray,
+        mean: Union[float, Iterable[float]],
+        std: Union[float, Iterable[float]],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Normalize an image. image = (image - image_mean) / image_std.
+        Args:
+            image (`np.ndarray`):
+                Image to normalize.
+            mean (`float` or `Iterable[float]`):
+                Image mean to use for normalization.
+            std (`float` or `Iterable[float]`):
+                Image standard deviation to use for normalization.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        Returns:
+            `np.ndarray`: The normalized image.
+        """
+        # TODO 4.32
+        return normalize(
+            image, mean=mean, std=std, data_format=data_format, **kwargs
+        )
+import transformers
+transformers.IdeficsImageProcessor = IdeficsImageProcessor
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
--- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
+# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
+#
+# MIT License
+#
+# Copyright (c) 2020  The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
+time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
+that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
+prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
+to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.
+References:
+    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
+    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
+"""
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+from text_generation_server.utils.layers import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+)
+EPS=1e-5
+class IdeficsPerceiverResampler(nn.Module):
+    def __init__(
+        self,
+        prefix,
+        config,
+        embed_dim: int,
+        depth: int,
+        n_heads: int,
+        head_dim: int,
+        n_latents: int,
+        weights,
+    ) -> None:
+        """
+        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
+        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
+        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
+        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
+        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.
+        Args:
+            config (`IdeficsConfig`): config object
+            embed_dim (`int`): The size of each embedding vector
+            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
+            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
+            n_latents (`int`):
+                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+        """
+        super().__init__()
+        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver
+        # Create Latents for Perceiver
+        self.latents = nn.Parameter(weights.get_tensor(f"{prefix}.latents"))
+        self.intermediate_dim = (
+            self.embed_dim * 4
+            if not hasattr(config.vision_config, "embed_dim")
+            else config.vision_config.embed_dim * 4
+        )
+        # Create Transformer Blocks
+        self.blocks = nn.ModuleList(
+            [
+                nn.ModuleList(
+                    [
+                        IdeficsPerceiverAttention(
+                            prefix=f"{prefix}.blocks.{layer_id}.0",
+                            config=config,
+                            embed_dim=self.embed_dim,
+                            n_heads=self.n_heads,
+                            head_dim=self.head_dim,
+                            qk_layer_norms=self.qk_layer_norms,
+                            weights=weights,
+                        ),
+                        IdeficsMLP(
+                            prefix=f"{prefix}.blocks.{layer_id}.1",
+                            intermediate_size=self.intermediate_dim,
+                            config=config,
+                            weights=weights
+                        ),
+                    ]
+                )
+                for layer_id in range(depth)
+            ]
+        )
+        self.layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS)
+    def forward(self, context: torch.Tensor) -> torch.Tensor:
+        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
+        # einsum.repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
+        latents = self.latents.repeat(context.shape[0], 1, 1)
+        # Feed through Perceiver Attention blocks...
+        for attn, ff in self.blocks:
+            latents = attn(context, latents) + latents
+            latents = ff(latents) + latents
+        return self.layer_norm(latents)
+class IdeficsPerceiverAttention(nn.Module):
+    def __init__(self,
+            prefix,
+            config,
+            embed_dim: int,
+            n_heads: int,
+            head_dim: int,
+            qk_layer_norms: bool,
+            weights
+        ) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+        super().__init__()
+        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
+        self.qk_layer_norms = qk_layer_norms
+        # Normalization & Scaling
+        self.context_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS)
+        self.latents_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS)
+        if self.qk_layer_norms:
+            self.q_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS)
+            self.k_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS)
+        self.qk_scale = self.head_dim**-0.5
+        process_group = weights.process_group
+        if n_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {n_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.n_heads //= weights.process_group.size()
+        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
+        self.q_proj = TensorParallelColumnLinear.load(
+            config=config, prefix=f"{prefix}.q_proj", weights=weights, bias=False
+        )
+        self.k_proj =  TensorParallelColumnLinear.load(
+            config=config, prefix=f"{prefix}.k_proj", weights=weights, bias=False
+        )
+        self.v_proj =  TensorParallelColumnLinear.load(
+            config=config, prefix=f"{prefix}.v_proj", weights=weights, bias=False
+        )
+        self.output_proj = TensorParallelRowLinear.load(
+            config=config, prefix=f"{prefix}.output_proj", weights=weights, bias=False
+        )
+    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
+        """
+        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
+        Args:
+            context (`torch.Tensor`):
+                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
+            latents (`torch.Tensor`):
+                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.
+        Returns:
+            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
+            from context.
+        """
+        context = self.context_layer_norm(context)
+        latents = self.latents_layer_norm(latents)
+        batch_size, seq_length, embed_dim = context.shape[:3]
+        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
+        #   Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
+        q = self.q_proj(latents)
+        k = self.k_proj(torch.cat([context, latents], dim=-2))
+        v = self.v_proj(torch.cat([context, latents], dim=-2))
+        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
+        #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
+        # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
+        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
+        if self.qk_layer_norms:
+            q = self.q_layer_norm(q)
+            k = self.k_layer_norm(k)
+        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
+        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
+        attn = stabilized_scores.softmax(dim=-1)
+        # Attend & project back to output...
+        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
+        # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
+        return self.output_proj(resampled.transpose(1, 2).flatten(-2))
+class IdeficsMLP(nn.Module):
+    def __init__(self,
+            prefix,
+            intermediate_size,
+            config,
+            weights,
+        ):
+        """Simple MLP block with intermediate_size and embedding size"""
+        super().__init__()
+        self.embed_dim = config.vision_config.embed_dim
+        self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS)
+        self.fc = TensorParallelColumnLinear.load(
+            config=config, prefix=f"{prefix}.fc", weights=weights, bias=False,
+        )
+        self.act = nn.ReLU()
+        self.c_proj = TensorParallelRowLinear.load(
+            config=config, prefix=f"{prefix}.c_proj", weights=weights, bias=False,
+        )
+    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.ln(hidden_states)
+        hidden_states = self.fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        return hidden_states
--- a/server/text_generation_server/models/custom_modeling/idefics_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_processing.py
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for IDEFICS.
+"""
+from typing import Callable, List, Optional, Union
+from urllib.parse import urlparse
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from transformers.utils import TensorType, is_torch_available
+from text_generation_server.models.custom_modeling.idefics_image_processing import IdeficsImageProcessor
+if is_torch_available():
+    import torch
+IMAGE_TOKEN = "<image>"
+# copied from m4.training.packing
+def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
+    # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]
+    # If any of images index are more than num_classes, set them to -1.
+    # Words after the max number of images allowed have been seen don't attend on anything
+    if num_classes != -1:
+        incremental_mask[incremental_mask >= num_classes] = -1
+    negatives = incremental_mask == -1
+    incremental_mask[negatives] = 0
+    attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
+    attn_mask[negatives, :] = 0
+    return attn_mask
+# copied from m4.training.packing
+def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
+    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
+    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
+    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+    eod_token_id = tokenizer.eos_token_id
+    for batch_idx in range(input_ids.size(0)):
+        count = -1
+        seen_eod = False
+        for idx, token_id in enumerate(input_ids[batch_idx]):
+            if token_id == image_token_id:
+                count += 1
+                image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                image_attention_mask[batch_idx][idx] = count
+            if seen_eod:
+                image_attention_mask[batch_idx][idx] = -1
+            if token_id == eod_token_id:
+                seen_eod = True
+    for batch_idx in range(input_ids.size(0)):
+        count = -1
+        seen_eod = False
+        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
+            token_id = input_ids[batch_idx][idx]
+            if token_id == image_token_id:
+                count += 1
+                next_image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                next_image_attention_mask[batch_idx][idx] = count
+            if token_id == eod_token_id:
+                seen_eod = True
+            if seen_eod:
+                next_image_attention_mask[batch_idx][idx] = -1
+        non_negative_indices = next_image_attention_mask[batch_idx] != -1
+        next_image_attention_mask[batch_idx][non_negative_indices] -= count
+        next_image_attention_mask[batch_idx][non_negative_indices] *= -1
+    return image_attention_mask, next_image_attention_mask
+def is_url(string):
+    """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
+    invalidated the url"""
+    if " " in string:
+        return False
+    result = urlparse(string)
+    return all([result.scheme, result.netloc])
+class IdeficsProcessor(ProcessorMixin):
+    r"""
+    Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.
+    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
+    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.
+    Args:
+        image_processor (`IdeficsImageProcessor`):
+            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
+        tokenizer (`LlamaTokenizerFast`):
+            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
+        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "IdeficsImageProcessor"
+    tokenizer_class = "LlamaTokenizerFast"
+    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+        self.default_image_dims = (
+            self.image_processor.image_num_channels,
+            self.image_processor.image_size,
+            self.image_processor.image_size,
+        )
+        self.tokenizer_was_trained_with_end_of_utterance_token = (
+            True
+            if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+            else False
+        )
+    def __call__(
+        self,
+        prompts: Union[List[TextInput], List[List[TextInput]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        transform: Callable = None,
+        add_eos_token=False,
+        add_end_of_utterance_token=None,
+        debug=False,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchEncoding:
+        """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
+        the model was trained on and prepares the image pixel values for the model to process.
+        Args:
+            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
+                either a single prompt or a batched list of prompts - see the detailed description immediately after
+                the end of the arguments doc section.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            transform (`Callable`, *optional*):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
+                set of transforms will be applied to the images
+            add_eos_token (`bool`, *optional*, defaults to `False`):
+                Adds `eos_token` at the end of the final prompt if True`
+            add_end_of_utterance_token (`bool`, *optional*)
+                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
+                image). If `None` the tokenizer will be checked instead and if this token is found in
+                `additional_special_tokens` then the value will be `True`.
+            debug (`bool`, *optional*, defaults to `False`):
+                `True` value will help debug prompt generation by dumping useful information
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
+                The type of tensors to return. Can be one of:
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+        Returns:
+            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
+            directly passed to `model.generate`
+        Detailed explanation:
+        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
+        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
+        entry into the prompt.
+        Example:
+        ```python
+        checkpoint = "HuggingFaceM4/idefics-9b"
+        processor = AutoProcessor.from_pretrained(checkpoint)
+        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
+        img = processor.image_processor.fetch_images([url])[0]
+        prompts = [
+            "User:",
+            img,
+            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
+            "User:",
+            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
+            "Describe this image.\nAssistant:",
+        ]
+        inputs = processor(prompts, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_length=100)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```
+        In this example the `prompts` will be converted into:
+        ```
+        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant: An image of two kittens in grass.
+        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant:'
+        ```
+        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
+        `pixel_values` dict entry of the return value.
+        This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
+        first image is passed as object and the second one as a url.
+        To do training do:
+        ```python
+        image_transform = transforms.Compose(
+            [
+                transforms.RandomResizedCrop(
+                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=self.image_mean, std=self.image_std),
+            ]
+        )
+        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
+        ```
+        In order to help debug prompt generation enable `debug=True` which will show you what's happening.
+        """
+        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
+        if add_end_of_utterance_token is None:
+            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+        # turn non-batched prompts into batched
+        if not any(isinstance(i, list) for i in prompts):
+            prompts = [prompts]
+        fake_token = "<fake_token_around_image>"
+        image_token = "<image>"
+        end_of_utterance_token = "<end_of_utterance>"
+        def image_tokens(last_was_image):
+            if last_was_image:
+                return image_token + fake_token
+            else:
+                return fake_token + image_token + fake_token
+        all_texts = []
+        all_images = []
+        for sample in prompts:
+            # the model was trained on samples starting with <s>
+            full_text = f"{self.tokenizer.bos_token}"
+            # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
+            image_objects = []
+            last_was_image = False
+            last_was_text = False
+            for i, item in enumerate(sample):
+                if i > 0:
+                    last_was_text = True if not last_was_image else False
+                if isinstance(item, str):
+                    item = item.strip(" ")
+                    if is_url(item):
+                        image = self.image_processor.fetch_images(item)
+                        full_text += image_tokens(last_was_image)
+                        image_objects.append(image)
+                        last_was_image = True
+                    else:
+                        # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
+                        if add_end_of_utterance_token and last_was_text:
+                            full_text += end_of_utterance_token
+                        full_text += item
+                        last_was_image = False
+                else:
+                    # must be an image obj
+                    full_text += image_tokens(last_was_image)
+                    image_objects.append(item)
+                    last_was_image = True
+            if add_eos_token:
+                full_text += self.tokenizer.eos_token
+            if debug is True:
+                print(f"{full_text=}")
+            image_objects = self.image_processor(image_objects, transform=transform)
+            text_encoding = self.tokenizer(
+                text=full_text,
+                add_special_tokens=False,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+            )
+            all_texts.append(text_encoding["input_ids"])
+            all_images.append(image_objects)
+        max_seq_len = max(len(x) for x in all_texts)
+        # max_num_images has to be at least 1 even when there are no images
+        max_num_images = max(len(x) for x in all_images)
+        max_num_images = max(1, max_num_images)
+        at_least_one_image = sum(len(x) for x in all_images) > 0
+        output_input_ids = []
+        output_images = []
+        output_attention_masks = []
+        for text, images in zip(all_texts, all_images):
+            padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
+            unpadded_seq_len = len(text)
+            start = max_seq_len - unpadded_seq_len
+            padded_input_ids[start:] = text[:max_seq_len]
+            attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
+            attention_mask[start:] = 1
+            image_count = padded_input_ids.count(self.image_token_id)
+            local_max_num_images = min(image_count, max_num_images)
+            current_images = images[:local_max_num_images]
+            if len(current_images) > 0:
+                padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+                padded_image_tensor[: current_images.size(0)] = current_images
+            else:
+                padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+            output_images.append(padded_image_tensor)
+            output_input_ids.append(torch.tensor(padded_input_ids))
+            output_attention_masks.append(attention_mask)
+        output_input_ids = torch.stack(output_input_ids)
+        output_images = torch.stack(output_images)
+        output_attention_masks = torch.stack(output_attention_masks)
+        if at_least_one_image:
+            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
+            image_attention_mask = incremental_to_binary_attention_mask(
+                image_attention_mask, num_classes=max_num_images
+            )
+        else:
+            # in full language mode we set the image mask to all-0s
+            image_attention_mask = torch.zeros(
+                output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+            )
+        return BatchFeature(
+            data={
+                "input_ids": output_input_ids,
+                "attention_mask": output_attention_masks,
+                "pixel_values": output_images,
+                "image_attention_mask": image_attention_mask,
+            }
+        )
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
--- a/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py
--- a/server/text_generation_server/models/idefics.py
+++ b/server/text_generation_server/models/idefics.py
+import torch
+import torch.distributed
+from typing import List, Optional, Tuple
+from transformers import (
+    AutoTokenizer,
+    AutoConfig,
+    AutoProcessor,
+)
+from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
+from text_generation_server.models.custom_modeling.idefics_processing import (
+    IdeficsProcessor,
+)
+from transformers import LlamaTokenizerFast
+from text_generation_server.models.custom_modeling.idefics_modeling import (
+    IdeficsForVisionText2Text,
+)
+from text_generation_server.models.idefics_causal_lm import IdeficsCausalLM
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
+class IDEFICSSharded(IdeficsCausalLM):
+    def __init__(
+        self,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        trust_remote_code: bool = False,
+    ):
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            # 9b seems to work correctly enough in float16, but 80b seems
+            # to be really saturating for f16.
+            dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            device = torch.device("cpu")
+            dtype = torch.float32
+        self.device, self.dtype = device, dtype
+        config = IdeficsConfig.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+        config.quantize = quantize
+        config.vision_config.quantize = quantize
+        tokenizer = LlamaTokenizerFast.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        self.processor = IdeficsProcessor.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        torch.distributed.barrier(group=self.process_group)
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames,
+            device=device,
+            dtype=dtype,
+            process_group=self.process_group,
+        )
+        model = IdeficsForVisionText2Text(config, weights)
+        torch.distributed.barrier(group=self.process_group)
+        super(IdeficsCausalLM, self).__init__(
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            rank=rank,
+            world_size=world_size,
+        )
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -14,7 +14,7 @@ from text_generation_server.interceptor import ExceptionInterceptor
 from text_generation_server.models import Model, get_model
 from text_generation_server.pb import generate_pb2_grpc, generate_pb2
 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
+from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
 class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(self, model: Model, cache: Cache, server_urls: List[str]):
@@ -26,6 +26,7 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
            # Force inference mode for the lifetime of TextGenerationService
            self._inference_mode_raii_guard = torch._C._InferenceMode(True)
    async def Info(self, request, context):
        return self.model.info
@@ -54,9 +55,14 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
        return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
    async def Warmup(self, request, context):
-        batch = self.model.batch_type.from_pb(
+        if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
-            request.batch, self.model.tokenizer, self.model.dtype, self.model.device
+            batch = self.model.batch_type.from_pb(
-        )
+                request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+            )
+        else:
+            batch = self.model.batch_type.from_pb(
+                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
+            )
        max_supported_total_tokens = self.model.warmup(batch)
        return generate_pb2.WarmupResponse(
@@ -64,9 +70,14 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
        )
    async def Prefill(self, request, context):
-        batch = self.model.batch_type.from_pb(
+        if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
-            request.batch, self.model.tokenizer, self.model.dtype, self.model.device
+            batch = self.model.batch_type.from_pb(
-        )
+                request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+            )
+        else:
+            batch = self.model.batch_type.from_pb(
+                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
+            )
        generations, next_batch = self.model.generate_token(batch)
        self.cache.set(next_batch)

--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py