[ { "number": 0, "type": 0, "bbox": [ 163.52999877929688, 87.15061950683594, 432.6361389160156, 123.74800872802734 ], "lines": [ { "spans": [ { "size": 14.346199989318848, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "ViLT: Vision-and-Language Transformer", "origin": [ 172.30999755859375, 100.9229736328125 ], "bbox": [ 172.30999755859375, 87.15061950683594, 424.8892517089844, 105.81502532958984 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 172.30999755859375, 87.15061950683594, 424.8892517089844, 105.81502532958984 ] }, { "spans": [ { "size": 14.346199989318848, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "Without Convolution or Region Supervision", "origin": [ 163.52999877929688, 118.85595703125 ], "bbox": [ 163.52999877929688, 105.08360290527344, 432.6361389160156, 123.74800872802734 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 163.52999877929688, 105.08360290527344, 432.6361389160156, 123.74800872802734 ] } ] }, { "number": 1, "type": 0, "bbox": [ 198.60899353027344, 155.3678436279297, 396.6109313964844, 171.94522094726562 ], "lines": [ { "spans": [ { "size": 9.962599754333496, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "Wonjae Kim", "origin": [ 198.60899353027344, 168.5479736328125 ], "bbox": [ 198.60899353027344, 158.98387145996094, 252.37713623046875, 171.94522094726562 ] }, { "size": 6.973800182342529, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 252.37713623046875, 168.5479736328125 ], "bbox": [ 252.37713623046875, 158.4881591796875, 254.03799438476562, 170.5076141357422 ] }, { "size": 6.973800182342529, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "* 1 \u2020", "origin": [ 254.03799438476562, 164.93194580078125 ], "bbox": [ 254.03799438476562, 158.4881591796875, 267.8182373046875, 166.89158630371094 ] }, { "size": 9.962599754333496, "flags": 21, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": " ", "origin": [ 267.8182373046875, 164.93194580078125 ], "bbox": [ 267.8182373046875, 155.3678436279297, 274.4620056152344, 171.94522094726562 ] }, { "size": 9.962599754333496, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "Bokyung Son", "origin": [ 274.4620056152344, 168.5479736328125 ], "bbox": [ 274.4620056152344, 158.98387145996094, 331.06951904296875, 171.94522094726562 ] }, { "size": 6.973800182342529, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 331.06951904296875, 168.5479736328125 ], "bbox": [ 331.06951904296875, 158.4881591796875, 332.7300109863281, 170.5076141357422 ] }, { "size": 6.973800182342529, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "* 1", "origin": [ 332.7300109863281, 164.93194580078125 ], "bbox": [ 332.7300109863281, 158.4881591796875, 341.36358642578125, 166.89158630371094 ] }, { "size": 9.962599754333496, "flags": 21, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": " ", "origin": [ 341.36358642578125, 164.93194580078125 ], "bbox": [ 341.36358642578125, 155.3678436279297, 348.00701904296875, 171.94522094726562 ] }, { "size": 9.962599754333496, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "Ildoo Kim", "origin": [ 348.00701904296875, 168.5479736328125 ], "bbox": [ 348.00701904296875, 158.98387145996094, 391.4638977050781, 171.94522094726562 ] }, { "size": 6.973800182342529, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 391.4638977050781, 168.5479736328125 ], "bbox": [ 391.4638977050781, 158.4881591796875, 393.1240234375, 170.5076141357422 ] }, { "size": 6.973800182342529, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2", "origin": [ 393.1240234375, 164.93194580078125 ], "bbox": [ 393.1240234375, 158.4881591796875, 396.6109313964844, 166.89158630371094 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 198.60899353027344, 155.3678436279297, 396.6109313964844, 171.94522094726562 ] } ] }, { "number": 2, "type": 0, "bbox": [ 150.19802856445312, 191.6519775390625, 194.68331909179688, 207.2056884765625 ], "lines": [ { "spans": [ { "size": 11.9552001953125, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "Abstract", "origin": [ 150.19802856445312, 203.12896728515625 ], "bbox": [ 150.19802856445312, 191.6519775390625, 194.68331909179688, 207.2056884765625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 150.19802856445312, 191.6519775390625, 194.68331909179688, 207.2056884765625 ] } ] }, { "number": 3, "type": 0, "bbox": [ 75.00700378417969, 214.5135498046875, 271.2561340332031, 501.48748779296875 ], "lines": [ { "spans": [ { "size": 9.947644233703613, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Vision-and-Language Pre-training (VLP) has im-", "origin": [ 75.36599731445312, 223.718994140625 ], "bbox": [ 75.36599731445312, 214.5135498046875, 271.1695251464844, 226.5184783935547 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 214.5135498046875, 271.1695251464844, 226.5184783935547 ] }, { "spans": [ { "size": 10.041984558105469, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "proved performance on various joint vision-and-", "origin": [ 75.36599731445312, 235.67401123046875 ], "bbox": [ 75.36599731445312, 226.46856689453125, 271.16607666015625, 238.47349548339844 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 226.46856689453125, 271.16607666015625, 238.47349548339844 ] }, { "spans": [ { "size": 10.002370834350586, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "language downstream tasks. Current approaches", "origin": [ 75.36599731445312, 247.6290283203125 ], "bbox": [ 75.36599731445312, 238.423583984375, 269.5137939453125, 250.4285125732422 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 238.423583984375, 269.5137939453125, 250.4285125732422 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "to VLP heavily rely on image feature extraction", "origin": [ 75.36599731445312, 259.583984375 ], "bbox": [ 75.36599731445312, 250.3785400390625, 269.5183410644531, 262.38348388671875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 250.3785400390625, 269.5183410644531, 262.38348388671875 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "processes, most of which involve region super-", "origin": [ 75.36599731445312, 271.53900146484375 ], "bbox": [ 75.36599731445312, 262.33355712890625, 271.16461181640625, 274.3385009765625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 262.33355712890625, 271.16461181640625, 274.3385009765625 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "vision (e.g., object detection) and the convolu-", "origin": [ 75.11699676513672, 283.4949951171875 ], "bbox": [ 75.11699676513672, 274.28955078125, 271.1696472167969, 286.29449462890625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.11699676513672, 274.28955078125, 271.1696472167969, 286.29449462890625 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "tional architecture (e.g., ResNet). Although dis-", "origin": [ 75.36599731445312, 295.45001220703125 ], "bbox": [ 75.36599731445312, 286.24456787109375, 271.16461181640625, 298.24951171875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 286.24456787109375, 271.16461181640625, 298.24951171875 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "regarded in the literature, we \ufb01nd it problem-", "origin": [ 75.36599731445312, 307.4049987792969 ], "bbox": [ 75.36599731445312, 298.1995544433594, 271.1645812988281, 310.2044982910156 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 298.1995544433594, 271.1645812988281, 310.2044982910156 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "atic in terms of both (1) ef\ufb01ciency/speed, that", "origin": [ 75.36599731445312, 319.3599853515625 ], "bbox": [ 75.36599731445312, 310.154541015625, 269.5184020996094, 322.15948486328125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 310.154541015625, 269.5184020996094, 322.15948486328125 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "simply extracting input features requires much", "origin": [ 75.36599731445312, 331.31500244140625 ], "bbox": [ 75.36599731445312, 322.10955810546875, 269.5184020996094, 334.114501953125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 322.10955810546875, 269.5184020996094, 334.114501953125 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "more computation than the multimodal interac-", "origin": [ 75.36599731445312, 343.2699890136719 ], "bbox": [ 75.36599731445312, 334.0645446777344, 271.16461181640625, 346.0694885253906 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 334.0645446777344, 271.16461181640625, 346.0694885253906 ] }, { "spans": [ { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "tion steps; and (2) expressive power, as it is upper", "origin": [ 75.36599731445312, 355.22601318359375 ], "bbox": [ 75.36599731445312, 346.02056884765625, 269.6858825683594, 358.0255126953125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 346.02056884765625, 269.6858825683594, 358.0255126953125 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "bounded to the expressive power of the visual", "origin": [ 75.36599731445312, 367.1809997558594 ], "bbox": [ 75.36599731445312, 357.9755554199219, 269.51837158203125, 369.9804992675781 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 357.9755554199219, 269.51837158203125, 369.9804992675781 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "embedder and its prede\ufb01ned visual vocabulary.", "origin": [ 75.36599731445312, 379.135986328125 ], "bbox": [ 75.36599731445312, 369.9305419921875, 271.2561340332031, 381.93548583984375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 369.9305419921875, 271.2561340332031, 381.93548583984375 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "In this paper, we present a minimal VLP model,", "origin": [ 75.36599731445312, 391.09100341796875 ], "bbox": [ 75.36599731445312, 381.88555908203125, 270.7580871582031, 393.8905029296875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 381.88555908203125, 270.7580871582031, 393.8905029296875 ] }, { "spans": [ { "size": 9.887598037719727, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Vision-and-Language Transformer (ViLT), mono-", "origin": [ 75.00700378417969, 403.0459899902344 ], "bbox": [ 75.00700378417969, 393.8405456542969, 271.1720886230469, 405.8454895019531 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.00700378417969, 393.8405456542969, 271.1720886230469, 405.8454895019531 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "lithic in the sense that the processing of visual", "origin": [ 75.36599731445312, 415.0010070800781 ], "bbox": [ 75.36599731445312, 405.7955627441406, 269.51837158203125, 417.8005065917969 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 405.7955627441406, 269.51837158203125, 417.8005065917969 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "inputs is drastically simpli\ufb01ed to just the same", "origin": [ 75.36599731445312, 426.9570007324219 ], "bbox": [ 75.36599731445312, 417.7515563964844, 269.51837158203125, 429.7565002441406 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 417.7515563964844, 269.51837158203125, 429.7565002441406 ] }, { "spans": [ { "size": 10.037040710449219, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "convolution-free manner that we process textual", "origin": [ 75.36599731445312, 438.9119873046875 ], "bbox": [ 75.36599731445312, 429.70654296875, 269.5171203613281, 441.71148681640625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 429.70654296875, 269.5171203613281, 441.71148681640625 ] }, { "spans": [ { "size": 10.02714729309082, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "inputs. We show that ViLT is up to tens of times", "origin": [ 75.36599731445312, 450.86700439453125 ], "bbox": [ 75.36599731445312, 441.66156005859375, 269.5180358886719, 453.66650390625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 441.66156005859375, 269.5180358886719, 453.66650390625 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "faster than previous VLP models, yet with com-", "origin": [ 75.36599731445312, 462.8219909667969 ], "bbox": [ 75.36599731445312, 453.6165466308594, 271.1645812988281, 465.6214904785156 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 453.6165466308594, 271.1645812988281, 465.6214904785156 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "petitive or better downstream task performance.", "origin": [ 75.36599731445312, 474.7770080566406 ], "bbox": [ 75.36599731445312, 465.5715637207031, 271.25604248046875, 477.5765075683594 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 465.5715637207031, 271.25604248046875, 477.5765075683594 ] }, { "spans": [ { "size": 9.9176664352417, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Our code and pre-trained weights are available at", "origin": [ 75.36599731445312, 486.73199462890625 ], "bbox": [ 75.36599731445312, 477.52655029296875, 269.5173034667969, 489.531494140625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 477.52655029296875, 269.5173034667969, 489.531494140625 ] }, { "spans": [ { "size": 9.962599754333496, "flags": 4, "font": "NimbusMonL-Regu", "color": 5234, "ascender": 0.8109999895095825, "descender": -0.2370000034570694, "text": "https://github.com/dandelin/vilt", "origin": [ 75.36599731445312, 498.68798828125 ], "bbox": [ 75.36599731445312, 490.6083068847656, 266.6478271484375, 501.04913330078125 ] }, { "size": 9.962599754333496, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ".", "origin": [ 266.64801025390625, 498.68798828125 ], "bbox": [ 266.64801025390625, 489.4825439453125, 269.138671875, 501.48748779296875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 75.36599731445312, 489.4825439453125, 269.138671875, 501.48748779296875 ] } ] }, { "number": 4, "type": 0, "bbox": [ 55.44001770019531, 535.7109985351562, 132.27609252929688, 551.2647094726562 ], "lines": [ { "spans": [ { "size": 11.9552001953125, "flags": 20, "font": "NimbusRomNo9L-Medi", "color": 0, "ascender": 0.9599999785423279, "descender": -0.3409999907016754, "text": "1. Introduction", "origin": [ 55.44001770019531, 547.18798828125 ], "bbox": [ 55.44001770019531, 535.7109985351562, 132.27609252929688, 551.2647094726562 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.44001770019531, 535.7109985351562, 132.27609252929688, 551.2647094726562 ] } ] }, { "number": 5, "type": 0, "bbox": [ 55.11199951171875, 557.3555908203125, 291.0977478027344, 629.1365356445312 ], "lines": [ { "spans": [ { "size": 9.927669525146484, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "The pre-train-and-\ufb01ne-tune scheme has been expanded to a", "origin": [ 55.13100051879883, 566.56103515625 ], "bbox": [ 55.13100051879883, 557.3555908203125, 289.4433898925781, 569.3605346679688 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.13100051879883, 557.3555908203125, 289.4433898925781, 569.3605346679688 ] }, { "spans": [ { "size": 9.92266845703125, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "joint domain of vision and language, giving birth to the cat-", "origin": [ 55.439998626708984, 578.5169677734375 ], "bbox": [ 55.439998626708984, 569.3115234375, 291.0977478027344, 581.3164672851562 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 569.3115234375, 291.0977478027344, 581.3164672851562 ] }, { "spans": [ { "size": 10.017244338989258, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "egory of", "origin": [ 55.439998626708984, 590.4719848632812 ], "bbox": [ 55.439998626708984, 581.2665405273438, 89.09119415283203, 593.271484375 ] }, { "size": 10.017244338989258, "flags": 6, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": " Vision-and-Language Pre-training (VLP)", "origin": [ 89.09119415283203, 590.4719848632812 ], "bbox": [ 89.09119415283203, 581.2665405273438, 257.868896484375, 593.161865234375 ] }, { "size": 10.017244338989258, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " models", "origin": [ 257.868896484375, 590.4719848632812 ], "bbox": [ 257.868896484375, 581.2665405273438, 289.4405822753906, 593.271484375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 581.2665405273438, 289.4405822753906, 593.271484375 ] }, { "spans": [ { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "(", "origin": [ 55.11199951171875, 602.427001953125 ], "bbox": [ 55.11199951171875, 593.2215576171875, 58.436180114746094, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Lu et al.", "origin": [ 58.436180114746094, 602.427001953125 ], "bbox": [ 58.436180114746094, 593.2215576171875, 91.40845489501953, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 91.40845489501953, 602.427001953125 ], "bbox": [ 91.40845489501953, 593.2215576171875, 93.90408325195312, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2019", "origin": [ 93.90408325195312, 602.427001953125 ], "bbox": [ 93.90408325195312, 593.2215576171875, 116.3647689819336, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 116.3647689819336, 602.427001953125 ], "bbox": [ 116.3647689819336, 593.2215576171875, 119.13990783691406, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Chen et al.", "origin": [ 119.13990783691406, 602.427001953125 ], "bbox": [ 119.13990783691406, 593.2215576171875, 164.58033752441406, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 164.58033752441406, 602.427001953125 ], "bbox": [ 164.58033752441406, 593.2215576171875, 167.0759735107422, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2019", "origin": [ 167.0759735107422, 602.427001953125 ], "bbox": [ 167.0759735107422, 593.2215576171875, 189.52664184570312, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 189.52664184570312, 602.427001953125 ], "bbox": [ 189.52664184570312, 593.2215576171875, 192.30178833007812, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Su et al.", "origin": [ 192.30178833007812, 602.427001953125 ], "bbox": [ 192.30178833007812, 593.2215576171875, 227.22064208984375, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 227.22064208984375, 602.427001953125 ], "bbox": [ 227.22064208984375, 593.2215576171875, 229.71627807617188, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2019", "origin": [ 229.71627807617188, 602.427001953125 ], "bbox": [ 229.71627807617188, 593.2215576171875, 252.1669464111328, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 252.1669464111328, 602.427001953125 ], "bbox": [ 252.1669464111328, 593.2215576171875, 254.9420928955078, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Li et al.", "origin": [ 254.9420928955078, 602.427001953125 ], "bbox": [ 254.9420928955078, 593.2215576171875, 288.1939392089844, 605.2265014648438 ] }, { "size": 9.972557067871094, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 288.1939392089844, 602.427001953125 ], "bbox": [ 288.1939392089844, 593.2215576171875, 290.6895751953125, 605.2265014648438 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.11199951171875, 593.2215576171875, 290.6895751953125, 605.2265014648438 ] }, { "spans": [ { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2019", "origin": [ 55.439998626708984, 614.3820190429688 ], "bbox": [ 55.439998626708984, 605.1765747070312, 75.26556396484375, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 75.26556396484375, 614.3820190429688 ], "bbox": [ 75.26556396484375, 605.1765747070312, 78.02131652832031, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Tan & Bansal", "origin": [ 78.02131652832031, 614.3820190429688 ], "bbox": [ 78.02131652832031, 605.1765747070312, 134.8215789794922, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 134.8215789794922, 614.3820190429688 ], "bbox": [ 134.8215789794922, 605.1765747070312, 137.29977416992188, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2019", "origin": [ 137.29977416992188, 614.3820190429688 ], "bbox": [ 137.29977416992188, 605.1765747070312, 159.62335205078125, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 159.62335205078125, 614.3820190429688 ], "bbox": [ 159.62335205078125, 605.1765747070312, 162.3791046142578, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Li et al.", "origin": [ 162.3791046142578, 614.3820190429688 ], "bbox": [ 162.3791046142578, 605.1765747070312, 195.4679718017578, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 195.4679718017578, 614.3820190429688 ], "bbox": [ 195.4679718017578, 605.1765747070312, 197.9461669921875, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020a", "origin": [ 197.9461669921875, 614.3820190429688 ], "bbox": [ 197.9461669921875, 605.1765747070312, 224.66111755371094, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 224.66111755371094, 614.3820190429688 ], "bbox": [ 224.66111755371094, 605.1765747070312, 227.4168701171875, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Lu et al.", "origin": [ 227.4168701171875, 614.3820190429688 ], "bbox": [ 227.4168701171875, 605.1765747070312, 262.71630859375, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 262.71630859375, 614.3820190429688 ], "bbox": [ 262.71630859375, 605.1765747070312, 265.19451904296875, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 265.19451904296875, 614.3820190429688 ], "bbox": [ 265.19451904296875, 605.1765747070312, 287.5081787109375, 617.1815185546875 ] }, { "size": 9.937662124633789, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 287.5081787109375, 614.3820190429688 ], "bbox": [ 287.5081787109375, 605.1765747070312, 290.2639465332031, 617.1815185546875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 605.1765747070312, 290.2639465332031, 617.1815185546875 ] }, { "spans": [ { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Cho et al.", "origin": [ 55.439998626708984, 626.3370361328125 ], "bbox": [ 55.439998626708984, 617.131591796875, 94.53492736816406, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 94.53492736816406, 626.3370361328125 ], "bbox": [ 94.53492736816406, 617.131591796875, 97.0728988647461, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 97.0728988647461, 626.3370361328125 ], "bbox": [ 97.0728988647461, 617.131591796875, 119.86388397216797, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 119.86388397216797, 626.3370361328125 ], "bbox": [ 119.86388397216797, 617.131591796875, 122.68611145019531, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Qi et al.", "origin": [ 122.68611145019531, 626.3370361328125 ], "bbox": [ 122.68611145019531, 617.131591796875, 157.4969482421875, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 157.4969482421875, 626.3370361328125 ], "bbox": [ 157.4969482421875, 617.131591796875, 160.03492736816406, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 160.03492736816406, 626.3370361328125 ], "bbox": [ 160.03492736816406, 617.131591796875, 182.82591247558594, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 182.82591247558594, 626.3370361328125 ], "bbox": [ 182.82591247558594, 617.131591796875, 185.64813232421875, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Zhou et al.", "origin": [ 185.64813232421875, 626.3370361328125 ], "bbox": [ 185.64813232421875, 617.131591796875, 231.7377166748047, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 231.7377166748047, 626.3370361328125 ], "bbox": [ 231.7377166748047, 617.131591796875, 234.27569580078125, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 234.27569580078125, 626.3370361328125 ], "bbox": [ 234.27569580078125, 617.131591796875, 257.0666809082031, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 257.0666809082031, 626.3370361328125 ], "bbox": [ 257.0666809082031, 617.131591796875, 259.888916015625, 629.1365356445312 ] }, { "size": 10.056798934936523, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Huang", "origin": [ 259.888916015625, 626.3370361328125 ], "bbox": [ 259.888916015625, 617.131591796875, 289.44110107421875, 629.1365356445312 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 617.131591796875, 289.44110107421875, 629.1365356445312 ] } ] }, { "number": 6, "type": 0, "bbox": [ 54.426998138427734, 635.9896850585938, 290.9289855957031, 687.6925048828125 ], "lines": [ { "spans": [ { "size": 5.97760009765625, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "*", "origin": [ 64.95500183105469, 641.5130004882812 ], "bbox": [ 64.95500183105469, 635.9896850585938, 67.94380187988281, 643.1926879882812 ] }, { "size": 8.876283645629883, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Equal contribution", "origin": [ 68.44200134277344, 645.322021484375 ], "bbox": [ 68.44200134277344, 637.0370483398438, 133.5630340576172, 647.841552734375 ] }, { "size": 5.97760009765625, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " \u2020", "origin": [ 133.5630340576172, 645.322021484375 ], "bbox": [ 133.5630340576172, 635.9896850585938, 138.3968048095703, 647.001708984375 ] }, { "size": 8.876283645629883, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Current af\ufb01liation: NAVER AI Lab, Seong-", "origin": [ 138.89500427246094, 645.322021484375 ], "bbox": [ 138.89500427246094, 637.0370483398438, 290.9289855957031, 647.841552734375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 64.95500183105469, 635.9896850585938, 290.9289855957031, 647.841552734375 ] }, { "spans": [ { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "nam, Gyeonggi, Republic of Korea.", "origin": [ 55.439998626708984, 655.2850341796875 ], "bbox": [ 55.439998626708984, 647.0000610351562, 187.9615936279297, 657.8045654296875 ] }, { "size": 5.97760009765625, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 187.9615936279297, 655.2850341796875 ], "bbox": [ 187.9615936279297, 645.9526977539062, 192.20399475097656, 656.9647216796875 ] }, { "size": 5.97760009765625, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "1", "origin": [ 192.20399475097656, 651.4760131835938 ], "bbox": [ 192.20399475097656, 645.9526977539062, 195.1927947998047, 653.1557006835938 ] }, { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Kakao Enterprise, Seong-", "origin": [ 195.6909942626953, 655.2850341796875 ], "bbox": [ 195.6909942626953, 647.0000610351562, 290.92547607421875, 657.8045654296875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 645.9526977539062, 290.92547607421875, 657.8045654296875 ] }, { "spans": [ { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "nam, Gyeonggi, Republic of Korea", "origin": [ 55.439998626708984, 665.2479858398438 ], "bbox": [ 55.439998626708984, 656.9630126953125, 188.9858856201172, 667.7675170898438 ] }, { "size": 5.97760009765625, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 188.9858856201172, 665.2479858398438 ], "bbox": [ 188.9858856201172, 655.9156494140625, 192.4550018310547, 666.9276733398438 ] }, { "size": 5.97760009765625, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2", "origin": [ 192.4550018310547, 661.43896484375 ], "bbox": [ 192.4550018310547, 655.9156494140625, 195.4438018798828, 663.11865234375 ] }, { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Kakao Brain, Seongnam,", "origin": [ 195.94200134277344, 665.2479858398438 ], "bbox": [ 195.94200134277344, 656.9630126953125, 290.5637512207031, 667.7675170898438 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 655.9156494140625, 290.5637512207031, 667.7675170898438 ] }, { "spans": [ { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Gyeonggi, Republic of Korea. Correspondence to: Wonjae Kim", "origin": [ 55.439998626708984, 675.2100219726562 ], "bbox": [ 55.439998626708984, 666.925048828125, 289.44256591796875, 677.7295532226562 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.439998626708984, 666.925048828125, 289.44256591796875, 677.7295532226562 ] }, { "spans": [ { "size": 8.966400146484375, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ".", "origin": [ 54.426998138427734, 685.1729736328125 ], "bbox": [ 54.426998138427734, 676.8880004882812, 170.25494384765625, 687.6925048828125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 54.426998138427734, 676.8880004882812, 170.25494384765625, 687.6925048828125 ] } ] }, { "number": 7, "type": 0, "bbox": [ 55.1619987487793, 693.0040283203125, 289.4412841796875, 717.5805053710938 ], "lines": [ { "spans": [ { "size": 9.055620193481445, "flags": 6, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": "Proceedings of the", "origin": [ 55.1619987487793, 705.0980224609375 ], "bbox": [ 55.1619987487793, 696.8130493164062, 124.84329223632812, 707.5189208984375 ] }, { "size": 8.966400146484375, "flags": 6, "font": "CMTI9", "color": 0, "ascender": 0.75, "descender": -0.25, "text": " 38", "origin": [ 124.84329223632812, 705.0980224609375 ], "bbox": [ 124.84329223632812, 698.3732299804688, 137.28375244140625, 707.339599609375 ] }, { "size": 5.97760009765625, "flags": 6, "font": "CMMI6", "color": 0, "ascender": 0.75, "descender": -0.25, "text": " ", "origin": [ 137.28375244140625, 705.0980224609375 ], "bbox": [ 137.28375244140625, 696.8057861328125, 138.4969940185547, 706.5924072265625 ] }, { "size": 5.97760009765625, "flags": 7, "font": "CMMI6", "color": 0, "ascender": 0.75, "descender": -0.25, "text": "th", "origin": [ 138.4969940185547, 701.2890014648438 ], "bbox": [ 138.4969940185547, 696.8057861328125, 145.68206787109375, 702.7833862304688 ] }, { "size": 9.055620193481445, "flags": 7, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": " ", "origin": [ 145.68206787109375, 701.2890014648438 ], "bbox": [ 145.68206787109375, 693.0040283203125, 149.218994140625, 707.5189208984375 ] }, { "size": 9.055620193481445, "flags": 6, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": "International Conference on Machine", "origin": [ 149.218994140625, 705.0980224609375 ], "bbox": [ 149.218994140625, 696.8130493164062, 289.4412841796875, 707.5189208984375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.1619987487793, 693.0040283203125, 289.4412841796875, 707.5189208984375 ] }, { "spans": [ { "size": 8.966400146484375, "flags": 6, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": "Learning", "origin": [ 55.18899917602539, 715.0609741210938 ], "bbox": [ 55.18899917602539, 706.7760009765625, 88.06878662109375, 717.4818725585938 ] }, { "size": 8.966400146484375, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ", PMLR 139, 2021. Copyright 2021 by the author(s).", "origin": [ 88.06900024414062, 715.0609741210938 ], "bbox": [ 88.06900024414062, 706.7760009765625, 277.80706787109375, 717.5805053710938 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 55.18899917602539, 706.7760009765625, 277.80706787109375, 717.5805053710938 ] } ] }, { "number": 8, "type": 0, "bbox": [ 502.3902282714844, 251.63328552246094, 525.6985473632812, 263.7827453613281 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Modality ", "origin": [ 504.17840576171875, 256.42926025390625 ], "bbox": [ 504.17840576171875, 251.63328552246094, 525.2564697265625, 257.59344482421875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 504.17840576171875, 251.63328552246094, 525.2564697265625, 257.59344482421875 ] }, { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Interaction", "origin": [ 502.3902282714844, 262.6185607910156 ], "bbox": [ 502.3902282714844, 257.8226013183594, 525.6985473632812, 263.7827453613281 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 502.3902282714844, 257.8226013183594, 525.6985473632812, 263.7827453613281 ] } ] }, { "number": 9, "type": 0, "bbox": [ 459.72119140625, 292.3870849609375, 475.1094055175781, 298.34722900390625 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Linear ", "origin": [ 459.72119140625, 297.18304443359375 ], "bbox": [ 459.72119140625, 292.3870849609375, 475.1094055175781, 298.34722900390625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 459.72119140625, 292.3870849609375, 475.1094055175781, 298.34722900390625 ] } ] }, { "number": 10, "type": 0, "bbox": [ 454.3236999511719, 298.57635498046875, 479.13580322265625, 304.5364990234375 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Embedding", "origin": [ 454.3236999511719, 303.372314453125 ], "bbox": [ 454.3236999511719, 298.57635498046875, 479.13580322265625, 304.5364990234375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 454.3236999511719, 298.57635498046875, 479.13580322265625, 304.5364990234375 ] } ] }, { "number": 11, "type": 0, "bbox": [ 459.72119140625, 262.53955078125, 475.1094055175781, 268.49969482421875 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Linear ", "origin": [ 459.72119140625, 267.33551025390625 ], "bbox": [ 459.72119140625, 262.53955078125, 475.1094055175781, 268.49969482421875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 459.72119140625, 262.53955078125, 475.1094055175781, 268.49969482421875 ] } ] }, { "number": 12, "type": 0, "bbox": [ 454.3236999511719, 268.7288513183594, 479.13580322265625, 274.6889953613281 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Embedding", "origin": [ 454.3236999511719, 273.5248107910156 ], "bbox": [ 454.3236999511719, 268.7288513183594, 479.13580322265625, 274.6889953613281 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 454.3236999511719, 268.7288513183594, 479.13580322265625, 274.6889953613281 ] } ] }, { "number": 13, "type": 0, "bbox": [ 461.06146240234375, 238.60382080078125, 473.7705078125, 244.56396484375 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "CNN ", "origin": [ 461.06146240234375, 243.39979553222656 ], "bbox": [ 461.06146240234375, 238.60382080078125, 473.7705078125, 244.56396484375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 461.06146240234375, 238.60382080078125, 473.7705078125, 244.56396484375 ] } ] }, { "number": 14, "type": 0, "bbox": [ 455.9700927734375, 244.79312133789062, 477.48974609375, 250.75326538085938 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Backbone", "origin": [ 455.9700927734375, 249.58909606933594 ], "bbox": [ 455.9700927734375, 244.79312133789062, 477.48974609375, 250.75326538085938 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 455.9700927734375, 244.79312133789062, 477.48974609375, 250.75326538085938 ] } ] }, { "number": 15, "type": 0, "bbox": [ 423.95831298828125, 214.78736877441406, 436.6673583984375, 220.7475128173828 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "CNN ", "origin": [ 423.95831298828125, 219.58334350585938 ], "bbox": [ 423.95831298828125, 214.78736877441406, 436.6673583984375, 220.7475128173828 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 423.95831298828125, 214.78736877441406, 436.6673583984375, 220.7475128173828 ] } ] }, { "number": 16, "type": 0, "bbox": [ 418.866943359375, 220.97666931152344, 440.3865966796875, 226.9368133544922 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Backbone", "origin": [ 418.866943359375, 225.77264404296875 ], "bbox": [ 418.866943359375, 220.97666931152344, 440.3865966796875, 226.9368133544922 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 418.866943359375, 220.97666931152344, 440.3865966796875, 226.9368133544922 ] } ] }, { "number": 17, "type": 0, "bbox": [ 458.9696044921875, 214.73648071289062, 475.8615417480469, 220.69662475585938 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Region ", "origin": [ 458.9696044921875, 219.53245544433594 ], "bbox": [ 458.9696044921875, 214.73648071289062, 475.8615417480469, 220.69662475585938 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 458.9696044921875, 214.73648071289062, 475.8615417480469, 220.69662475585938 ] } ] }, { "number": 18, "type": 0, "bbox": [ 454.9228820800781, 220.92578125, 478.5367126464844, 226.88592529296875 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Operations", "origin": [ 454.9228820800781, 225.7217559814453 ], "bbox": [ 454.9228820800781, 220.92578125, 478.5367126464844, 226.88592529296875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 454.9228820800781, 220.92578125, 478.5367126464844, 226.88592529296875 ] } ] }, { "number": 19, "type": 0, "bbox": [ 388.87701416015625, 192.8269805908203, 476.27813720703125, 201.76719665527344 ], "lines": [ { "spans": [ { "size": 8.072999954223633, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Visual Embedding Schema", "origin": [ 388.87701416015625, 200.02093505859375 ], "bbox": [ 388.87701416015625, 192.8269805908203, 476.27813720703125, 201.76719665527344 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 388.87701416015625, 192.8269805908203, 476.27813720703125, 201.76719665527344 ] } ] }, { "number": 20, "type": 0, "bbox": [ 320.90106201171875, 342.736328125, 362.5631408691406, 350.1865234375 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "UNITER-Base ", "origin": [ 320.90106201171875, 348.7312927246094 ], "bbox": [ 320.90106201171875, 342.736328125, 362.5631408691406, 350.1865234375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 320.90106201171875, 342.736328125, 362.5631408691406, 350.1865234375 ] } ] }, { "number": 21, "type": 0, "bbox": [ 317.7691650390625, 350.3517761230469, 364.0132141113281, 357.2059631347656 ], "lines": [ { "spans": [ { "size": 6.189300060272217, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(75.8 / 85.9 / 72.5)", "origin": [ 317.7691650390625, 355.8671569824219 ], "bbox": [ 317.7691650390625, 350.3517761230469, 364.0132141113281, 357.2059631347656 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 317.7691650390625, 350.3517761230469, 364.0132141113281, 357.2059631347656 ] } ] }, { "number": 22, "type": 0, "bbox": [ 317.7691650390625, 363.93743896484375, 365.4202880859375, 378.4071044921875 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Pixel-BERT-R50 ", "origin": [ 318.043212890625, 369.9324035644531 ], "bbox": [ 318.043212890625, 363.93743896484375, 365.4202880859375, 371.38763427734375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 318.043212890625, 363.93743896484375, 365.4202880859375, 371.38763427734375 ] }, { "spans": [ { "size": 6.189300060272217, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(72.4 / 75.7 / 53.4)", "origin": [ 317.7691650390625, 377.06829833984375 ], "bbox": [ 317.7691650390625, 371.55291748046875, 364.0132141113281, 378.4071044921875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 317.7691650390625, 371.55291748046875, 364.0132141113281, 378.4071044921875 ] } ] }, { "number": 23, "type": 0, "bbox": [ 316.9542541503906, 385.1385192871094, 366.50970458984375, 399.6081848144531 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "ViLT-B/32 (Ours) ", "origin": [ 316.9542541503906, 391.13348388671875 ], "bbox": [ 316.9542541503906, 385.1385192871094, 366.50970458984375, 392.5887145996094 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 316.9542541503906, 385.1385192871094, 366.50970458984375, 392.5887145996094 ] }, { "spans": [ { "size": 6.189300060272217, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(76.1 / 83.5 / 64.4)", "origin": [ 317.7691650390625, 398.2693786621094 ], "bbox": [ 317.7691650390625, 392.7539978027344, 364.0132141113281, 399.6081848144531 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 317.7691650390625, 392.7539978027344, 364.0132141113281, 399.6081848144531 ] } ] }, { "number": 24, "type": 0, "bbox": [ 380.48431396484375, 345.4301452636719, 393.43341064453125, 354.4749450683594 ], "lines": [ { "spans": [ { "size": 4.036499977111816, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~75 ms ", "origin": [ 380.48431396484375, 349.0271301269531 ], "bbox": [ 380.48431396484375, 345.4301452636719, 393.43341064453125, 349.9002685546875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 380.48431396484375, 345.4301452636719, 393.43341064453125, 349.9002685546875 ] }, { "spans": [ { "size": 4.036499977111816, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(R101)", "origin": [ 380.736572265625, 353.601806640625 ], "bbox": [ 380.736572265625, 350.00482177734375, 392.1719970703125, 354.4749450683594 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 380.736572265625, 350.00482177734375, 392.1719970703125, 354.4749450683594 ] } ] }, { "number": 25, "type": 0, "bbox": [ 449.0875244140625, 345.69921875, 464.05487060546875, 350.1693420410156 ], "lines": [ { "spans": [ { "size": 4.036499977111816, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~810 ms ", "origin": [ 449.0875244140625, 349.29620361328125 ], "bbox": [ 449.0875244140625, 345.69921875, 464.05487060546875, 350.1693420410156 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 449.0875244140625, 345.69921875, 464.05487060546875, 350.1693420410156 ] } ] }, { "number": 26, "type": 0, "bbox": [ 429.29095458984375, 350.6640625, 482.8427429199219, 354.24017333984375 ], "lines": [ { "spans": [ { "size": 3.2292001247406006, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(RPNs, RoI Align, NMS, and RoI Heads)", "origin": [ 429.29095458984375, 353.5416564941406 ], "bbox": [ 429.29095458984375, 350.6640625, 482.8427429199219, 354.24017333984375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 429.29095458984375, 350.6640625, 482.8427429199219, 354.24017333984375 ] } ] }, { "number": 27, "type": 0, "bbox": [ 374.83319091796875, 367.0010681152344, 387.78228759765625, 376.0458679199219 ], "lines": [ { "spans": [ { "size": 4.036499977111816, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~45 ms ", "origin": [ 374.83319091796875, 370.5980529785156 ], "bbox": [ 374.83319091796875, 367.0010681152344, 387.78228759765625, 371.47119140625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 374.83319091796875, 367.0010681152344, 387.78228759765625, 371.47119140625 ] }, { "spans": [ { "size": 4.036499977111816, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(R50)", "origin": [ 376.0946044921875, 375.1727294921875 ], "bbox": [ 376.0946044921875, 371.57574462890625, 385.51177978515625, 376.0458679199219 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 376.0946044921875, 371.57574462890625, 385.51177978515625, 376.0458679199219 ] } ] }, { "number": 28, "type": 0, "bbox": [ 519.169677734375, 347.3037414550781, 534.98876953125, 352.3698425292969 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~900 ms", "origin": [ 519.169677734375, 351.38031005859375 ], "bbox": [ 519.169677734375, 347.3037414550781, 534.98876953125, 352.3698425292969 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 519.169677734375, 347.3037414550781, 534.98876953125, 352.3698425292969 ] } ] }, { "number": 29, "type": 0, "bbox": [ 397.5927429199219, 368.80059814453125, 411.1245422363281, 373.86669921875 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~60 ms", "origin": [ 397.5927429199219, 372.8771667480469 ], "bbox": [ 397.5927429199219, 368.80059814453125, 411.1245422363281, 373.86669921875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 397.5927429199219, 368.80059814453125, 411.1245422363281, 373.86669921875 ] } ] }, { "number": 30, "type": 0, "bbox": [ 382.01416015625, 390.0016784667969, 395.54595947265625, 395.0677795410156 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~15 ms", "origin": [ 382.01416015625, 394.0782470703125 ], "bbox": [ 382.01416015625, 390.0016784667969, 395.54595947265625, 395.0677795410156 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 382.01416015625, 390.0016784667969, 395.54595947265625, 395.0677795410156 ] } ] }, { "number": 31, "type": 0, "bbox": [ 399.98944091796875, 397.70892333984375, 415.8087463378906, 402.7750244140625 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~0.4 ms ", "origin": [ 399.98944091796875, 401.7854919433594 ], "bbox": [ 399.98944091796875, 397.70892333984375, 415.8087463378906, 402.7750244140625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 399.98944091796875, 397.70892333984375, 415.8087463378906, 402.7750244140625 ] } ] }, { "number": 32, "type": 0, "bbox": [ 388.718017578125, 403.0909118652344, 425.93505859375, 408.1570129394531 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(Linear Embedding)", "origin": [ 388.718017578125, 407.16748046875 ], "bbox": [ 388.718017578125, 403.0909118652344, 425.93505859375, 408.1570129394531 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 388.718017578125, 403.0909118652344, 425.93505859375, 408.1570129394531 ] } ] }, { "number": 33, "type": 0, "bbox": [ 507.81719970703125, 373.48095703125, 522.4928588867188, 378.54705810546875 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "~15 ms ", "origin": [ 507.81719970703125, 377.5575256347656 ], "bbox": [ 507.81719970703125, 373.48095703125, 522.4928588867188, 378.54705810546875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 507.81719970703125, 373.48095703125, 522.4928588867188, 378.54705810546875 ] } ] }, { "number": 34, "type": 0, "bbox": [ 498.5437316894531, 378.86297607421875, 530.6211547851562, 383.9290771484375 ], "lines": [ { "spans": [ { "size": 4.574700355529785, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(BERT-base-like)", "origin": [ 498.5437316894531, 382.9395446777344 ], "bbox": [ 498.5437316894531, 378.86297607421875, 530.6211547851562, 383.9290771484375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 498.5437316894531, 378.86297607421875, 530.6211547851562, 383.9290771484375 ] } ] }, { "number": 35, "type": 0, "bbox": [ 409.3610534667969, 318.82305908203125, 457.8135986328125, 327.7632751464844 ], "lines": [ { "spans": [ { "size": 8.072999954223633, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Running Time ", "origin": [ 409.3610534667969, 326.01702880859375 ], "bbox": [ 409.3610534667969, 318.82305908203125, 457.8135986328125, 327.7632751464844 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 409.3610534667969, 318.82305908203125, 457.8135986328125, 327.7632751464844 ] } ] }, { "number": 36, "type": 0, "bbox": [ 356.78668212890625, 328.10675048828125, 508.369140625, 334.06689453125 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(Performances : NLVR2 test-P Acc. / F30K TR R@1 / F30K IR R@1)", "origin": [ 356.78668212890625, 332.9027099609375 ], "bbox": [ 356.78668212890625, 328.10675048828125, 508.369140625, 334.06689453125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 356.78668212890625, 328.10675048828125, 508.369140625, 334.06689453125 ] } ] }, { "number": 37, "type": 0, "bbox": [ 320.24737548828125, 214.0364532470703, 363.2159423828125, 221.48663330078125 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Region Feature ", "origin": [ 320.24737548828125, 220.0314178466797 ], "bbox": [ 320.24737548828125, 214.0364532470703, 363.2159423828125, 221.48663330078125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 320.24737548828125, 214.0364532470703, 363.2159423828125, 221.48663330078125 ] } ] }, { "number": 38, "type": 0, "bbox": [ 313.8355407714844, 221.77296447753906, 367.9461975097656, 227.7331085205078 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(ViLBERT, UNITER, ...)", "origin": [ 313.8355407714844, 226.56893920898438 ], "bbox": [ 313.8355407714844, 221.77296447753906, 367.9461975097656, 227.7331085205078 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 313.8355407714844, 221.77296447753906, 367.9461975097656, 227.7331085205078 ] } ] }, { "number": 39, "type": 0, "bbox": [ 323.7983703613281, 237.6286163330078, 359.6653747558594, 251.32525634765625 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Grid Feature ", "origin": [ 323.7983703613281, 243.6235809326172 ], "bbox": [ 323.7983703613281, 237.6286163330078, 359.6653747558594, 245.07879638671875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 323.7983703613281, 237.6286163330078, 359.6653747558594, 245.07879638671875 ] }, { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(Pixel-BERT)", "origin": [ 325.95556640625, 250.1610870361328 ], "bbox": [ 325.95556640625, 245.3651123046875, 355.82623291015625, 251.32525634765625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 325.95556640625, 245.3651123046875, 355.82623291015625, 251.32525634765625 ] } ] }, { "number": 40, "type": 0, "bbox": [ 318.3692626953125, 261.5335388183594, 364.3302001953125, 268.9837341308594 ], "lines": [ { "spans": [ { "size": 6.7275004386901855, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Patch Projection ", "origin": [ 318.3692626953125, 267.52850341796875 ], "bbox": [ 318.3692626953125, 261.5335388183594, 364.3302001953125, 268.9837341308594 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 318.3692626953125, 261.5335388183594, 364.3302001953125, 268.9837341308594 ] } ] }, { "number": 41, "type": 0, "bbox": [ 332.43145751953125, 269.14898681640625, 348.58807373046875, 276.003173828125 ], "lines": [ { "spans": [ { "size": 6.189300060272217, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "(Ours)", "origin": [ 332.43145751953125, 274.66436767578125 ], "bbox": [ 332.43145751953125, 269.14898681640625, 348.58807373046875, 276.003173828125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 332.43145751953125, 269.14898681640625, 348.58807373046875, 276.003173828125 ] } ] }, { "number": 42, "type": 0, "bbox": [ 385.84881591796875, 218.06820678710938, 399.29547119140625, 224.02835083007812 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Image", "origin": [ 385.84881591796875, 222.8641815185547 ], "bbox": [ 385.84881591796875, 218.06820678710938, 399.29547119140625, 224.02835083007812 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 385.84881591796875, 218.06820678710938, 399.29547119140625, 224.02835083007812 ] } ] }, { "number": 43, "type": 0, "bbox": [ 385.84881591796875, 241.94732666015625, 399.29547119140625, 247.907470703125 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Image", "origin": [ 385.84881591796875, 246.74330139160156 ], "bbox": [ 385.84881591796875, 241.94732666015625, 399.29547119140625, 247.907470703125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 385.84881591796875, 241.94732666015625, 399.29547119140625, 247.907470703125 ] } ] }, { "number": 44, "type": 0, "bbox": [ 385.84881591796875, 265.88916015625, 399.29547119140625, 271.84930419921875 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Image", "origin": [ 385.84881591796875, 270.68511962890625 ], "bbox": [ 385.84881591796875, 265.88916015625, 399.29547119140625, 271.84930419921875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 385.84881591796875, 265.88916015625, 399.29547119140625, 271.84930419921875 ] } ] }, { "number": 45, "type": 0, "bbox": [ 387.8289794921875, 295.6824951171875, 397.3154602050781, 301.64263916015625 ], "lines": [ { "spans": [ { "size": 5.381999969482422, "flags": 4, "font": "TimesNewRomanPSMT", "color": 0, "ascender": 0.89111328125, "descender": -0.21630859375, "text": "Text", "origin": [ 387.8289794921875, 300.47845458984375 ], "bbox": [ 387.8289794921875, 295.6824951171875, 397.3154602050781, 301.64263916015625 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 387.8289794921875, 295.6824951171875, 397.3154602050781, 301.64263916015625 ] } ] }, { "number": 46, "type": 0, "bbox": [ 306.93798828125, 422.1160583496094, 542.9336547851562, 487.715576171875 ], "lines": [ { "spans": [ { "size": 8.966400146484375, "flags": 6, "font": "NimbusRomNo9L-Regu-Slant", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Figure 1.", "origin": [ 306.93798828125, 430.4010009765625 ], "bbox": [ 306.93798828125, 422.1160583496094, 339.3157043457031, 432.9205627441406 ] }, { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Visual comparison of conventional VLP architectures", "origin": [ 339.3157043457031, 430.4010009765625 ], "bbox": [ 339.3157043457031, 422.1160583496094, 541.4371337890625, 432.9205627441406 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 306.93798828125, 422.1160583496094, 541.4371337890625, 432.9205627441406 ] }, { "spans": [ { "size": 9.020037651062012, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "and our proposed ViLT. We have entirely removed convolutional", "origin": [ 307.44000244140625, 441.3599853515625 ], "bbox": [ 307.44000244140625, 433.0750427246094, 541.4402465820312, 443.8795471191406 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 433.0750427246094, 541.4402465820312, 443.8795471191406 ] }, { "spans": [ { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "neural networks from the VLP pipeline without hurting perfor-", "origin": [ 307.44000244140625, 452.3190002441406 ], "bbox": [ 307.44000244140625, 444.0340576171875, 542.9336547851562, 454.83856201171875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 444.0340576171875, 542.9336547851562, 454.83856201171875 ] }, { "spans": [ { "size": 8.876283645629883, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "mance on downstream tasks. ViLT is the \ufb01rst VLP model of which", "origin": [ 307.44000244140625, 463.27801513671875 ], "bbox": [ 307.44000244140625, 454.9930725097656, 541.4396362304688, 465.7975769042969 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 454.9930725097656, 541.4396362304688, 465.7975769042969 ] }, { "spans": [ { "size": 8.988788604736328, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "the modal-speci\ufb01c components require", "origin": [ 307.44000244140625, 474.23699951171875 ], "bbox": [ 307.44000244140625, 465.9520568847656, 447.3302917480469, 476.7565612792969 ] }, { "size": 8.988788604736328, "flags": 6, "font": "NimbusRomNo9L-ReguItal", "color": 0, "ascender": 0.9240000247955322, "descender": -0.27000001072883606, "text": " less", "origin": [ 447.3302917480469, 474.23699951171875 ], "bbox": [ 447.3302917480469, 465.9520568847656, 463.1018371582031, 476.6579284667969 ] }, { "size": 8.988788604736328, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " computation than the", "origin": [ 463.1018371582031, 474.23699951171875 ], "bbox": [ 463.1018371582031, 465.9520568847656, 541.44384765625, 476.7565612792969 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 465.9520568847656, 541.44384765625, 476.7565612792969 ] }, { "spans": [ { "size": 8.966400146484375, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "transformer component for multimodal interactions.", "origin": [ 307.44000244140625, 485.1960144042969 ], "bbox": [ 307.44000244140625, 476.91107177734375, 493.9679260253906, 487.715576171875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 476.91107177734375, 493.9679260253906, 487.715576171875 ] } ] }, { "number": 47, "type": 0, "bbox": [ 307.0820007324219, 515.8865966796875, 542.686767578125, 587.66748046875 ], "lines": [ { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "et al.", "origin": [ 307.44000244140625, 525.092041015625 ], "bbox": [ 307.44000244140625, 515.8865966796875, 328.06854248046875, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 328.06854248046875, 525.092041015625 ], "bbox": [ 328.06854248046875, 515.8865966796875, 330.6090087890625, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 330.6090087890625, 525.092041015625 ], "bbox": [ 330.6090087890625, 515.8865966796875, 354.3572692871094, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 354.3572692871094, 525.092041015625 ], "bbox": [ 354.3572692871094, 515.8865966796875, 357.1822509765625, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Li et al.", "origin": [ 357.1822509765625, 525.092041015625 ], "bbox": [ 357.1822509765625, 515.8865966796875, 393.67340087890625, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 393.67340087890625, 525.092041015625 ], "bbox": [ 393.67340087890625, 515.8865966796875, 396.2138671875, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020b", "origin": [ 396.2138671875, 525.092041015625 ], "bbox": [ 396.2138671875, 515.8865966796875, 425.0430603027344, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 425.0430603027344, 525.092041015625 ], "bbox": [ 425.0430603027344, 515.8865966796875, 427.8680419921875, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Gan et al.", "origin": [ 427.8680419921875, 525.092041015625 ], "bbox": [ 427.8680419921875, 515.8865966796875, 472.26513671875, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 472.26513671875, 525.092041015625 ], "bbox": [ 472.26513671875, 515.8865966796875, 474.80560302734375, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2020", "origin": [ 474.80560302734375, 525.092041015625 ], "bbox": [ 474.80560302734375, 515.8865966796875, 498.543701171875, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 498.543701171875, 525.092041015625 ], "bbox": [ 498.543701171875, 515.8865966796875, 501.3686828613281, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Yu et al.", "origin": [ 501.3686828613281, 525.092041015625 ], "bbox": [ 501.3686828613281, 515.8865966796875, 540.1463012695312, 527.8915405273438 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 540.1463012695312, 525.092041015625 ], "bbox": [ 540.1463012695312, 515.8865966796875, 542.686767578125, 527.8915405273438 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 515.8865966796875, 542.686767578125, 527.8915405273438 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2020", "origin": [ 307.44000244140625, 537.0469970703125 ], "bbox": [ 307.44000244140625, 527.841552734375, 327.76373291015625, 539.8464965820312 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ";", "origin": [ 327.76373291015625, 537.0469970703125 ], "bbox": [ 327.76373291015625, 527.841552734375, 330.5887145996094, 539.8464965820312 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Zhang et al.", "origin": [ 330.5887145996094, 537.0469970703125 ], "bbox": [ 330.5887145996094, 527.841552734375, 383.5625, 539.8464965820312 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 383.5625, 537.0469970703125 ], "bbox": [ 383.5625, 527.841552734375, 386.10296630859375, 539.8464965820312 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2021", "origin": [ 386.10296630859375, 537.0469970703125 ], "bbox": [ 386.10296630859375, 527.841552734375, 409.6988220214844, 539.8464965820312 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "). These models are pre-trained", "origin": [ 409.6988220214844, 537.0469970703125 ], "bbox": [ 409.6988220214844, 527.841552734375, 541.437255859375, 539.8464965820312 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 527.841552734375, 541.437255859375, 539.8464965820312 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "with image text matching and masked language modeling", "origin": [ 307.0820007324219, 549.0020141601562 ], "bbox": [ 307.0820007324219, 539.7965698242188, 541.4448852539062, 551.801513671875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.0820007324219, 539.7965698242188, 541.4448852539062, 551.801513671875 ] }, { "spans": [ { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "objectives", "origin": [ 307.44000244140625, 560.95703125 ], "bbox": [ 307.44000244140625, 551.7515869140625, 346.6300964355469, 563.7565307617188 ] }, { "size": 6.973800182342529, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "1", "origin": [ 346.6319885253906, 557.3419799804688 ], "bbox": [ 346.6319885253906, 550.898193359375, 350.118896484375, 559.3016357421875 ] }, { "size": 9.862470626831055, "flags": 5, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " ", "origin": [ 350.118896484375, 557.3419799804688 ], "bbox": [ 350.118896484375, 548.1365356445312, 352.7349853515625, 563.7565307617188 ] }, { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "on images and their aligned descriptions, and are", "origin": [ 352.7349853515625, 560.95703125 ], "bbox": [ 352.7349853515625, 551.7515869140625, 541.4409790039062, 563.7565307617188 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 548.1365356445312, 541.4409790039062, 563.7565307617188 ] }, { "spans": [ { "size": 9.89763069152832, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "\ufb01ne-tuned on vision-and-language downstream tasks where", "origin": [ 307.44000244140625, 572.9129638671875 ], "bbox": [ 307.44000244140625, 563.70751953125, 541.4378662109375, 575.7124633789062 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 563.70751953125, 541.4378662109375, 575.7124633789062 ] }, { "spans": [ { "size": 9.962599754333496, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "the inputs involve two modalities.", "origin": [ 307.44000244140625, 584.8679809570312 ], "bbox": [ 307.44000244140625, 575.6625366210938, 441.925048828125, 587.66748046875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 575.6625366210938, 441.925048828125, 587.66748046875 ] } ] }, { "number": 48, "type": 0, "bbox": [ 307.11199951171875, 593.5955810546875, 543.1831665039062, 689.2864990234375 ], "lines": [ { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "To be fed into VLP models, image pixels need to be ini-", "origin": [ 307.1310119628906, 602.801025390625 ], "bbox": [ 307.1310119628906, 593.5955810546875, 543.08935546875, 605.6005249023438 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.1310119628906, 593.5955810546875, 543.08935546875, 605.6005249023438 ] }, { "spans": [ { "size": 9.90765380859375, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "tially embedded in a dense form alongside language tokens.", "origin": [ 307.44000244140625, 614.7559814453125 ], "bbox": [ 307.44000244140625, 605.550537109375, 543.1831665039062, 617.5554809570312 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 605.550537109375, 543.1831665039062, 617.5554809570312 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Since the seminal work of", "origin": [ 307.44000244140625, 626.7109985351562 ], "bbox": [ 307.44000244140625, 617.5055541992188, 414.6678771972656, 629.510498046875 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Krizhevsky et al.", "origin": [ 414.6678771972656, 626.7109985351562 ], "bbox": [ 414.6678771972656, 617.5055541992188, 486.8475036621094, 629.510498046875 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " (", "origin": [ 486.8475036621094, 626.7109985351562 ], "bbox": [ 486.8475036621094, 617.5055541992188, 493.07672119140625, 629.510498046875 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2012", "origin": [ 493.07672119140625, 626.7109985351562 ], "bbox": [ 493.07672119140625, 617.5055541992188, 513.4004516601562, 629.510498046875 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "), deep", "origin": [ 513.4004516601562, 626.7109985351562 ], "bbox": [ 513.4004516601562, 617.5055541992188, 541.43701171875, 629.510498046875 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 617.5055541992188, 541.43701171875, 629.510498046875 ] }, { "spans": [ { "size": 9.962599754333496, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "convolutional networks have been regarded as essential for", "origin": [ 307.44000244140625, 638.666015625 ], "bbox": [ 307.44000244140625, 629.4605712890625, 541.6109008789062, 641.4655151367188 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 629.4605712890625, 541.6109008789062, 641.4655151367188 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "this visual embedding step. Most VLP models employ an", "origin": [ 307.44000244140625, 650.6209716796875 ], "bbox": [ 307.44000244140625, 641.41552734375, 541.43701171875, 653.4204711914062 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 641.41552734375, 541.43701171875, 653.4204711914062 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "object detector pre-trained on the Visual Genome dataset", "origin": [ 307.44000244140625, 662.5759887695312 ], "bbox": [ 307.44000244140625, 653.3705444335938, 541.43701171875, 665.37548828125 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 653.3705444335938, 541.43701171875, 665.37548828125 ] }, { "spans": [ { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "(", "origin": [ 307.11199951171875, 674.531982421875 ], "bbox": [ 307.11199951171875, 665.3265380859375, 310.49591064453125, 677.3314819335938 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "Krishna et al.", "origin": [ 310.49591064453125, 674.531982421875 ], "bbox": [ 310.49591064453125, 665.3265380859375, 365.3394775390625, 677.3314819335938 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ",", "origin": [ 365.3394775390625, 674.531982421875 ], "bbox": [ 365.3394775390625, 665.3265380859375, 367.87994384765625, 677.3314819335938 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " 2017", "origin": [ 367.87994384765625, 674.531982421875 ], "bbox": [ 367.87994384765625, 665.3265380859375, 390.92706298828125, 677.3314819335938 ] }, { "size": 10.061732292175293, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": ") annotated with 1,600 object classes", "origin": [ 390.92706298828125, 674.531982421875 ], "bbox": [ 390.92706298828125, 665.3265380859375, 541.4443969726562, 677.3314819335938 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.11199951171875, 665.3265380859375, 541.4443969726562, 677.3314819335938 ] }, { "spans": [ { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "and 400 attribute classes as in", "origin": [ 307.44000244140625, 686.4869995117188 ], "bbox": [ 307.44000244140625, 677.2815551757812, 423.8484802246094, 689.2864990234375 ] }, { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " Anderson et al.", "origin": [ 423.8484802246094, 686.4869995117188 ], "bbox": [ 423.8484802246094, 677.2815551757812, 485.84576416015625, 689.2864990234375 ] }, { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": " (", "origin": [ 485.84576416015625, 686.4869995117188 ], "bbox": [ 485.84576416015625, 677.2815551757812, 491.59637451171875, 689.2864990234375 ] }, { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 5234, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "2018", "origin": [ 491.59637451171875, 686.4869995117188 ], "bbox": [ 491.59637451171875, 677.2815551757812, 511.12310791015625, 689.2864990234375 ] }, { "size": 9.862470626831055, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "). Pixel-", "origin": [ 511.12310791015625, 686.4869995117188 ], "bbox": [ 511.12310791015625, 677.2815551757812, 543.09814453125, 689.2864990234375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 677.2815551757812, 543.09814453125, 689.2864990234375 ] } ] }, { "number": 49, "type": 0, "bbox": [ 307.44000244140625, 695.7656860351562, 541.440673828125, 717.5805053710938 ], "lines": [ { "spans": [ { "size": 5.97760009765625, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "1", "origin": [ 320.0929870605469, 701.2890014648438 ], "bbox": [ 320.0929870605469, 695.7656860351562, 323.081787109375, 702.9686889648438 ] }, { "size": 9.055620193481445, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "While some works employ additional objectives and data", "origin": [ 323.5799865722656, 705.0980224609375 ], "bbox": [ 323.5799865722656, 696.8130493164062, 541.440673828125, 707.6175537109375 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 320.0929870605469, 695.7656860351562, 541.440673828125, 707.6175537109375 ] }, { "spans": [ { "size": 8.903413772583008, "flags": 4, "font": "NimbusRomNo9L-Regu", "color": 0, "ascender": 0.9240000247955322, "descender": -0.2809999883174896, "text": "structures, these two objectives apply to almost every VLP model.", "origin": [ 307.44000244140625, 715.0609741210938 ], "bbox": [ 307.44000244140625, 706.7760009765625, 541.4136962890625, 717.5805053710938 ] } ], "wmode": 0, "dir": [ 1.0, 0.0 ], "bbox": [ 307.44000244140625, 706.7760009765625, 541.4136962890625, 717.5805053710938 ] } ] }, { "number": 50, "type": 0, "bbox": [ 10.940000534057617, 210.01995849609375, 37.619998931884766, 570.0 ], "lines": [ { "spans": [ { "size": 20.0, "flags": 4, "font": "Times-Roman", "color": 9539985, "ascender": 1.0529999732971191, "descender": -0.2809999883174896, "text": "arXiv:2102.03334v2 [stat.ML] 10 Jun 2021", "origin": [ 32.0, 570.0 ], "bbox": [ 10.940000534057617, 210.01995849609375, 37.619998931884766, 570.0 ] } ], "wmode": 0, "dir": [ 0.0, -1.0 ], "bbox": [ 10.940000534057617, 210.01995849609375, 37.619998931884766, 570.0 ] } ] } ]