"Plugson/www/vscode:/vscode.git/clone" did not exist on "c57717aea24be2ff4ed8c9b095ae74f3a3220de1"
Unverified Commit bb6f6d53 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add X-CLIP (#18852)

* First draft

* Improve conversion script

* Make vision encoder work

* More improvements

* Improve conversion script

* Fix quality

* Add MultiframeIntegrationTransformer

* More improvements

* Make MiT output work

* Fix quality

* Add prompts generator

* Add tests

* Fix some tests

* Fix some more tests

* Fix more tests

* Improve conversion script

* Fix model outputs

* Fix more tests

* Add XClipProcessor

* Use processor in conversion script

* Fix integration test

* Update README, fix docs

* Fix all tests

* Add MIT output to XClipOutput

* Create better variable names

* Rename XClip to XCLIP

* Extend conversion script

* Add support for large models

* Add support for 16 frame models

* Add another model'

* Fix module issue

* Apply suggestions from code review

* Add figure to docs

* Fix CLIPProcessor issue

* Apply suggestions from code review

* Delete file

* Convert more checkpoints

* Convert last checkpoint

* Update nielsr to microsoft
parent 9832ac7c
......@@ -5202,6 +5202,37 @@ class WavLMPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class XCLIPModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPTextModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPVisionModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -24,13 +24,6 @@ class CLIPFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class CLIPProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ConvNextFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
This diff is collapsed.
......@@ -49,6 +49,7 @@ CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = {
"SpeechEncoderDecoderConfig",
"VisionEncoderDecoderConfig",
"VisionTextDualEncoderConfig",
"XCLIPConfig",
}
......
......@@ -207,6 +207,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"TFWav2Vec2ForCTC",
"TFHubertForCTC",
"MaskFormerForInstanceSegmentation",
"XCLIPVisionModel",
"XCLIPTextModel",
]
# Update this list for models that have multiple model types for the same
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment