diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6a98a38b72ef9a59a2fdab266697661cdd1fa136 --- /dev/null +++ b/.gitignore @@ -0,0 +1,88 @@ +# Virtualenv +/.venv/ +/venv/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +.ipynb_checkpoints/ +*.py[cod] + +# C extensions +*.so + +# json file +*.json + +# log file +*.log + +# Distribution / packaging +/bin/ +/build/ +/develop-eggs/ +/dist/ +/eggs/ +/lib/ +/lib64/ +/output/ +/inference_model/ +/output_inference/ +/parts/ +/sdist/ +/var/ +/*.egg-info/ +/.installed.cfg +/*.egg +/.eggs + +# AUTHORS and ChangeLog will be generated while packaging +/AUTHORS +/ChangeLog + +# BCloud / BuildSubmitter +/build_submitter.* +/logger_client_log + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +.coverage +.cache +.pytest_cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Sphinx documentation +/docs/_build/ + +*.tar +*.pyc + +.idea/ + +dataset/coco/annotations +dataset/coco/train2017 +dataset/coco/val2017 +dataset/voc/VOCdevkit +dataset/fruit/fruit-detection/ +dataset/voc/test.txt +dataset/voc/trainval.txt +dataset/wider_face/WIDER_test +dataset/wider_face/WIDER_train +dataset/wider_face/WIDER_val +dataset/wider_face/wider_face_split + +ppdet/version.py + +# NPU meta folder +kernel_meta/ + +# MAC +*.DS_Store + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..099148ac4ed123b68803486f7d30d157005b617d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,44 @@ +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 + hooks: + - id: yapf + files: \.py$ +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: a11d9314b22d8f8c7556443875b731ef05965464 + hooks: + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*paddle)^.*$ + - id: end-of-file-fixer + files: \.(md|yml)$ + - id: trailing-whitespace + files: \.(md|yml)$ +- repo: https://github.com/Lucas-C/pre-commit-hooks + sha: v1.0.1 + hooks: + - id: forbid-crlf + files: \.(md|yml)$ + - id: remove-crlf + files: \.(md|yml)$ + - id: forbid-tabs + files: \.(md|yml)$ + - id: remove-tabs + files: \.(md|yml)$ +- repo: local + hooks: + - id: clang-format-with-version-check + name: clang-format + description: Format files with ClangFormat. + entry: bash ./.travis/codestyle/clang_format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ + +- repo: local + hooks: + - id: cpplint-cpp-source + name: cpplint + description: Check C++ code style using cpplint.py. 
+ entry: bash ./.travis/codestyle/cpplint_pre_commit.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000000000000000000000000000000000000..4741fb4f3bbc6681088cf9e960321e7b857a93a8 --- /dev/null +++ b/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +column_limit = 80 diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..b8eff51456d9723695cd037543e73f921ad4d009 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,35 @@ +language: cpp +cache: ccache +sudo: required +dist: trusty +services: + - docker +os: + - linux +env: + - JOB=PRE_COMMIT + +addons: + apt: + packages: + - git + - python + - python-pip + - python2.7-dev + ssh_known_hosts: 13.229.163.131 +before_install: + - sudo pip install -U virtualenv pre-commit pip -i https://pypi.tuna.tsinghua.edu.cn/simple + - docker pull paddlepaddle/paddle:latest + - git pull https://github.com/PaddlePaddle/PaddleDetection develop + +script: + - exit_code=0 + - .travis/precommit.sh || exit_code=$(( exit_code | $? )) + # - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c + # 'cd /py_unittest; sh .travis/unittest.sh' || exit_code=$(( exit_code | $? )) + - if [ $exit_code -eq 0 ]; then true; else exit 1; fi; + +notifications: + email: + on_success: change + on_failure: always diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. 
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/README.md b/README.md index e4b8020625a9dfff55080873c1218a74632a7551..c3b74736ae560a5be418e50469a90ba3241eb6cc 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,146 @@ -# Yolov5_paddle +# yolov5_paddle +## 论文 + +无论文 + +## 模型结构 + +YOLOv5 是一种目标检测算法,采用单阶段(one-stage)的方法,基于轻量级的卷积神经网络结构,通过引入不同尺度的特征融合和特征金字塔结构来实现高效准确的目标检测。 + +![Backbone.png](asserts%2FBackbone.png) + +## 算法原理 + +YOLOv5 是一种基于单阶段目标检测算法,通过将图像划分为不同大小的网格,预测每个网格中的目标类别和边界框,利用特征金字塔结构和自适应的模型缩放来实现高效准确的实时目标检测。 + +![Algorithm_principle.png](asserts%2FAlgorithm_principle.png) + +## 环境配置 + +### Docker (方法一) + +``` +docker pull image.sourcefind.cn:5000/dcu/admin/base/paddlepaddle:2.4.2-centos7.6-dtk-23.04-py38-latest + +docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash + +cd /path/workspace/ +pip3 install -r requirements.txt +``` + +### Dockerfile (方法二) + +``` +cd ./docker +docker build --no-cache -t yolov5_paddle:last . 
+docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash +``` + +### Anaconda (方法三) + +1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装: https://developer.hpccube.com/tool/ + +``` +DTK软件栈:dtk23.04 +python:python3.8 +paddle:2.4.2 +``` + +Tips:以上dtk软件栈、python、paddle等DCU相关工具版本需要严格一一对应 + +2、其他非特殊库直接按照requirements.txt安装 + +``` +pip3 install -r requirements.txt +``` + + + +## 数据集 + +COCO2017(在网络良好的情况下,如果没有下载数据集,程序会默认在线下载数据集) + +[训练数据](http://images.cocodataset.org/zips/train2017.zip) + +[验证数据](http://images.cocodataset.org/zips/val2017.zip) + +[测试数据](http://images.cocodataset.org/zips/test2017.zip) + +[标签数据](https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels.zip) + +数据集的目录结构如下: + +``` +├── images +│ ├── train2017 +│ ├── val2017 +│ ├── test2017 +├── labels +│ ├── train2017 +│ ├── val2017 +├── annotations +│ ├── instances_val2017.json +├── LICENSE +├── README.txt +├── test-dev2017.txt +├── train2017.txt +├── val2017.txt + +``` + +## 训练 + +### 单机单卡 + +``` +export HIP_VISIBLE_DEVICES=0 +export USE_MIOPEN_BATCHNORM=1 + + +python3 train.py --batch 32 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' --project 'run/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --epochs 1000 2>&1 | tee yolov5m.log +``` + +### 单机多卡 + +``` +#以单机四卡为例子 +export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export USE_MIOPEN_BATCHNORM=1 + +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov5/yolov5_s_300e_coco.yml --amp --eval +``` + +## 推理 + +#### 单卡推理 + +``` +HIP_VISIBLE_DEVICES=0 python tools/infer.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams --infer_img=demo/000000014439_640x640.jpg +``` +## result +此处以yolov5s模型进行推理测试 +![result_000000014439_640x640.jpg](asserts%2Fresult_000000014439_640x640.jpg) +## 精度 + +| 模型 | 数据类型 | map0.5:0.95 | map0.5 | +|:-------:|:----:|:-----------:|:------:| +| yolov5s | 单精 | 37.0 | 56.4 | +| yolov5s | 混精 | 37.2 | 56.4 | + + +## 应用场景 +### 算法分类 +目标检测 + +### 热点应用行业 +金融,交通,教育 + +## 源码仓库及问题反馈 + +https://developer.hpccube.com/codes/modelzoo/yolov5_pytorch + +## 参考 + +[GitHub - ultralytics/yolov5 at v6.0](https://github.com/ultralytics/yolov5/tree/v6.0) diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..8542d1377e22c5a4fa00aaa23f6f2e77c9af4478 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,441 @@ +简体中文 | [English](README_en.md) + +## 简介 + +**PaddleYOLO**是基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的YOLO系列模型库,**只包含YOLO系列模型的相关代码**,支持`YOLOv3`、`PP-YOLO`、`PP-YOLOv2`、`PP-YOLOE`、**`PP-YOLOE+`**、**`RT-DETR`**、`YOLOX`、`YOLOv5`、`YOLOv6`、`YOLOv7`、`YOLOv8`、`YOLOv5u`、`YOLOv7u`、`YOLOv6Lite`、`RTMDet`等模型,COCO数据集模型库请参照 [ModelZoo](docs/MODEL_ZOO_cn.md) 和 [configs](configs/)。 + +
+ + +
+ +**注意:** + + - **PaddleYOLO** 代码库协议为 **[GPL 3.0](LICENSE)**,[YOLOv5](configs/yolov5)、[YOLOv6](configs/yolov6)、[YOLOv7](configs/yolov7)和[YOLOv8](configs/yolov8)这几类模型代码不合入[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection),其余YOLO模型推荐在[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)中使用,**会最先发布PP-YOLO系列特色检测模型的最新进展**; + - **PaddleYOLO**代码库**推荐使用paddlepaddle-2.4.2以上的版本**,请参考[官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载对应适合版本,**Windows平台请安装paddle develop版本**; + - **PaddleYOLO 的[Roadmap](https://github.com/PaddlePaddle/PaddleYOLO/issues/44)** issue用于收集用户的需求,欢迎提出您的建议和需求; + +## 教程 + +
+安装
+
+Clone 代码库并安装 [requirements.txt](./requirements.txt),要求 [**Python>=3.7.0**](https://www.python.org/) 环境,并已安装 [**PaddlePaddle>=2.4.2**](https://www.paddlepaddle.org.cn/install/)。
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleYOLO # clone
+cd PaddleYOLO
+pip install -r requirements.txt # install
+```
+
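+
+克隆并安装依赖后,可以先用下面这段自检脚本确认 PaddlePaddle 是否安装成功(示意脚本,非本仓库自带):
+
+```python
+# 安装自检示意:打印版本号并运行 Paddle 官方自带的环境检查
+import paddle
+
+print(paddle.__version__)   # 期望为 2.4.2 及以上
+paddle.utils.run_check()    # 检查基础算子与计算设备是否可用
+```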
+ +
+训练/验证/预测/ +将以下命令写在一个脚本文件里如```run.sh```,一键运行命令为:```sh run.sh```,也可命令行一句句去运行。 + +```bash +model_name=ppyoloe # 可修改,如 yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # 可修改,如 yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 1.训练(单卡/多卡),加 --eval 表示边训边评估,加 --amp 表示混合精度训练 +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp +python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp + +# 2.评估,加 --classwise 表示输出每一类mAP +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise + +# 3.预测 (单张图/图片文件夹) +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5 +# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5 +``` + +
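+
+上述 tools/train.py、tools/eval.py 等脚本封装的是 ppdet 的 Python 接口;若想在自己的脚本里完成一次评估,大致流程如下(示意代码,具体以本仓库 tools/eval.py 的实现为准):
+
+```python
+# 示意:用 ppdet 接口在 Python 中评估已训练的模型,流程与 tools/eval.py 类似
+from ppdet.core.workspace import load_config
+from ppdet.engine import Trainer
+
+cfg = load_config("configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml")
+trainer = Trainer(cfg, mode="eval")                                               # 构建评估器
+trainer.load_weights("output/ppyoloe_plus_crn_s_80e_coco/model_final.pdparams")   # 加载训练得到的权重
+trainer.evaluate()                                                                # 输出 COCO mAP 等指标
+```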
+ +
+部署/测速 + +将以下命令写在一个脚本文件里如```run.sh```,一键运行命令为:```sh run.sh```,也可命令行一句句去运行。 + +```bash +model_name=ppyoloe # 可修改,如 yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # 可修改,如 yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 4.导出模型,以下3种模式选一种 +## 普通导出,加trt表示用于trt加速,对NMS和silu激活函数提速明显 +CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True + +## exclude_post_process去除后处理导出,返回和YOLOv5导出ONNX时相同格式的concat后的1个Tensor,是未缩放回原图的坐标+分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True + +## exclude_nms去除NMS导出,返回2个Tensor,是缩放回原图后的坐标和分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True + +# 5.部署预测,注意不能使用 去除后处理 或 去除NMS 导出后的模型去预测 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU + +# 6.部署测速,加 “--run_mode=trt_fp16” 表示在TensorRT FP16模式下测速,注意如需用到 trt_fp16 则必须为加 trt=True 导出的模型 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16 + +# 7.onnx导出,一般结合 exclude_post_process去除后处理导出的模型 +paddle2onnx --model_dir output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx + +# 8.onnx trt测速 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32 +``` + +- 如果想切换模型,只要修改开头两行即可,如: + ``` + model_name=yolov7 + job_name=yolov7_tiny_300e_coco + ``` +- 导出**onnx**,首先安装[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX),`pip install paddle2onnx`; +- **统计FLOPs(G)和Params(M)**,首先安装[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim),`pip install paddleslim`,然后设置[runtime.yml](configs/runtime.yml)里`print_flops: True`和`print_params: True`,并且注意确保是**单尺度**下如640x640,**打印的是MACs,FLOPs=2*MACs**。 + +
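+
+上面第 5 步的 deploy/python/infer.py 已封装完整的预处理与后处理;如果只想了解导出模型是如何被加载的,可以参考下面的最小示意(路径与模型名仅为示例):
+
+```python
+# 示意:用 Paddle Inference Python API 加载第 4 步导出的推理模型
+from paddle.inference import Config, create_predictor
+
+model_dir = "output_inference/ppyoloe_plus_crn_s_80e_coco"
+config = Config(model_dir + "/model.pdmodel", model_dir + "/model.pdiparams")
+config.enable_use_gpu(200, 0)      # 初始显存 200MB、使用 0 号卡;CPU 环境可改为 config.disable_gpu()
+predictor = create_predictor(config)
+print("inputs:", predictor.get_input_names())    # 实际推理时按输入名喂入预处理后的图像
+print("outputs:", predictor.get_output_names())
+```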
+ + +
+ [训练自定义数据集](https://github.com/PaddlePaddle/PaddleYOLO/issues/43) + +- 请参照[文档](docs/MODEL_ZOO_cn.md#自定义数据集)和[issue](https://github.com/PaddlePaddle/PaddleYOLO/issues/43); +- PaddleDetection团队提供了**基于PP-YOLOE的各种垂类检测模型**的配置文件和权重,用户也可以作为参考去使用自定义数据集。请参考 [PP-YOLOE application](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/ppyoloe/application)、[pphuman](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/pphuman)、[ppvehicle](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/ppvehicle)、[visdrone](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/visdrone) 和 [smalldet](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/smalldet)。 +- PaddleDetection团队也提供了**VOC数据集的各种YOLO模型**的配置文件和权重,用户也可以作为参考去使用自定义数据集。请参考 [voc](configs/voc)。 +- 训练自定义数据集之前请先**确保加载了对应COCO权重作为预训练**,将配置文件中的`pretrain_weights: `设置为对应COCO模型训好的权重,一般会提示head分类层卷积的通道数没对应上,属于正常现象,是由于自定义数据集一般和COCO数据集种类数不一致; +- YOLO检测模型建议**总`batch_size`至少大于`64`**去训练,如果资源不够请**换小模型**或**减小模型的输入尺度**,为了保障较高检测精度,**尽量不要尝试单卡训和总`batch_size`小于`64`训**; + +
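+
+上面提到自定义数据集的类别数一般与 COCO 不一致,下面这段示意脚本可以快速统计 COCO 格式标注中的类别数,便于与配置文件里的 `num_classes` 核对(标注路径仅为示例):
+
+```python
+# 示意:统计 COCO 格式标注文件中的类别数
+import json
+
+ann_file = "dataset/coco/annotations/instances_train2017.json"  # 换成自己的标注文件路径
+with open(ann_file, "r", encoding="utf-8") as f:
+    categories = json.load(f)["categories"]
+print("num_classes =", len(categories))
+```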
+ + +## 更新日志 + +* 【2023/05/21】支持[RT-DETR](configs/rtdetr)、[YOLOv8](configs/yolov8)、[YOLOv5u](configs/yolov5/yolov5u)和[YOLOv7u](configs/yolov7/yolov7u)训练全流程,支持[YOLOv6Lite](configs/yolov6/yolov6lite)预测和部署; +* 【2023/03/13】支持[YOLOv5u](configs/yolov5/yolov5u)和[YOLOv7u](configs/yolov7/yolov7u)预测和部署; +* 【2023/01/10】支持[YOLOv8](configs/yolov8)预测和部署; +* 【2022/09/29】支持[RTMDet](configs/rtmdet)预测和部署; +* 【2022/09/26】发布[PaddleYOLO](https://github.com/PaddlePaddle/PaddleYOLO)模型套件,请参照[ModelZoo](docs/MODEL_ZOO_cn.md); +* 【2022/09/19】支持[YOLOv6](configs/yolov6)新版,包括n/t/s/m/l模型; +* 【2022/08/23】发布`YOLOSeries`代码库: 支持`YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`等YOLO模型,支持`ConvNeXt`骨干网络高精度版`PP-YOLOE`,`YOLOX`和`YOLOv5`等模型,支持PaddleSlim无损加速量化训练`PP-YOLOE`,`YOLOv5`,`YOLOv6`和`YOLOv7`等模型,详情可阅读[此文章](https://mp.weixin.qq.com/s/Hki01Zs2lQgvLSLWS0btrA); + + +## 产品动态 + +- 🔥 **2023.3.14:PaddleYOLO发布[release/2.6版本](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6)** + - 💡 模型套件: + - 支持`YOLOv8`,`YOLOv5u`,`YOLOv7u`等YOLO模型预测和部署; + - 支持`Swin-Transformer`、`ViT`、`FocalNet`骨干网络高精度版`PP-YOLOE+`等模型; + - 支持`YOLOv8`在[FastDeploy](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection)中多硬件快速部署; + +- 🔥 **2022.9.26:PaddleYOLO发布[release/2.5版本](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.5)** + - 💡 模型套件: + - 发布[PaddleYOLO](https://github.com/PaddlePaddle/PaddleYOLO)模型套件: 支持`YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`等YOLO模型,支持`ConvNeXt`骨干网络高精度版`PP-YOLOE`,`YOLOX`和`YOLOv5`等模型,支持PaddleSlim无损加速量化训练`PP-YOLOE`,`YOLOv5`,`YOLOv6`和`YOLOv7`等模型; + +- 🔥 **2022.8.26:PaddleDetection发布[release/2.5版本](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5)** + - 🗳 特色模型: + - 发布[PP-YOLOE+](configs/ppyoloe),最高精度提升2.4% mAP,达到54.9% mAP,模型训练收敛速度提升3.75倍,端到端预测速度最高提升2.3倍;多个下游任务泛化性提升 + - 发布[PicoDet-NPU](configs/picodet)模型,支持模型全量化部署;新增[PicoDet](configs/picodet)版面分析模型 + - 发布[PP-TinyPose升级版](./configs/keypoint/tiny_pose/)增强版,在健身、舞蹈等场景精度提升9.1% AP,支持侧身、卧躺、跳跃、高抬腿等非常规动作 + - 🔮 场景能力: + - 发布行人分析工具[PP-Human v2](./deploy/pipeline),新增打架、打电话、抽烟、闯入四大行为识别,底层算法性能升级,覆盖行人检测、跟踪、属性三类核心算法能力,提供保姆级全流程开发及模型优化策略,支持在线视频流输入 + - 首次发布[PP-Vehicle](./deploy/pipeline),提供车牌识别、车辆属性分析(颜色、车型)、车流量统计以及违章检测四大功能,兼容图片、在线视频流、视频输入,提供完善的二次开发文档教程 + - 💡 前沿算法: + - 全面覆盖的[YOLO家族](https://github.com/PaddlePaddle/PaddleYOLO)经典与最新模型: 包括YOLOv3,百度飞桨自研的实时高精度目标检测检测模型PP-YOLOE,以及前沿检测算法YOLOv4、YOLOv5、YOLOX,YOLOv6及YOLOv7 + - 新增基于[ViT](configs/vitdet)骨干网络高精度检测模型,COCO数据集精度达到55.7% mAP;新增[OC-SORT](configs/mot/ocsort)多目标跟踪模型;新增[ConvNeXt](configs/convnext)骨干网络 + - 📋 产业范例:新增[智能健身](https://aistudio.baidu.com/aistudio/projectdetail/4385813)、[打架识别](https://aistudio.baidu.com/aistudio/projectdetail/4086987?channelType=0&channel=0)、[来客分析](https://aistudio.baidu.com/aistudio/projectdetail/4230123?channelType=0&channel=0)、车辆结构化范例 + +- 2022.3.24:PaddleDetection发布[release/2.4版本](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4) + - 发布高精度云边一体SOTA目标检测模型[PP-YOLOE](configs/ppyoloe),提供s/m/l/x版本,l版本COCO test2017数据集精度51.6%,V100预测速度78.1 FPS,支持混合精度训练,训练较PP-YOLOv2加速33%,全系列多尺度模型,满足不同硬件算力需求,可适配服务器、边缘端GPU及其他服务器端AI加速卡。 + - 发布边缘端和CPU端超轻量SOTA目标检测模型[PP-PicoDet增强版](configs/picodet),精度提升2%左右,CPU预测速度提升63%,新增参数量0.7M的PicoDet-XS模型,提供模型稀疏化和量化功能,便于模型加速,各类硬件无需单独开发后处理模块,降低部署门槛。 + - 发布实时行人分析工具[PP-Human](deploy/pipeline),支持行人跟踪、人流量统计、人体属性识别与摔倒检测四大能力,基于真实场景数据特殊优化,精准识别各类摔倒姿势,适应不同环境背景、光线及摄像角度。 + - 新增[YOLOX](configs/yolox)目标检测模型,支持nano/tiny/s/m/l/x版本,x版本COCO val2017数据集精度51.8%。 + +- [更多版本发布](https://github.com/PaddlePaddle/PaddleDetection/releases) 
+ +## 简介 + +**PaddleDetection**为基于飞桨PaddlePaddle的端到端目标检测套件,内置**30+模型算法**及**250+预训练模型**,覆盖**目标检测、实例分割、跟踪、关键点检测**等方向,其中包括**服务器端和移动端高精度、轻量级**产业级SOTA模型、冠军方案和学术前沿算法,并提供配置化的网络模块组件、十余种数据增强策略和损失函数等高阶优化支持和多种部署方案,在打通数据处理、模型开发、训练、压缩、部署全流程的基础上,提供丰富的案例及教程,加速算法产业落地应用。 + +
+ +
+
+## 特性
+
+- **模型丰富**: 包含**目标检测**、**实例分割**、**人脸检测**、**关键点检测**、**多目标跟踪**等**250+个预训练模型**,涵盖多种**全球竞赛冠军**方案。
+- **使用简洁**:模块化设计,解耦各个网络组件,开发者轻松搭建、试用各种检测模型及优化策略,快速得到高性能、定制化的算法。
+- **端到端打通**: 从数据增强、组网、训练、压缩、部署端到端打通,并完备支持**云端**/**边缘端**多架构、多设备部署。
+- **高性能**: 基于飞桨的高性能内核,模型训练速度及显存占用优势明显。支持FP16训练,支持多机训练。
+
+ +
+ +## 技术交流 + +- 如果你发现任何PaddleDetection存在的问题或者是建议, 欢迎通过[GitHub Issues](https://github.com/PaddlePaddle/PaddleDetection/issues)给我们提issues。 + +- **欢迎加入PaddleDetection 微信用户群(扫码填写问卷即可入群)** + - **入群福利 💎:获取PaddleDetection团队整理的重磅学习大礼包🎁** + - 📊 福利一:获取飞桨联合业界企业整理的开源数据集 + - 👨‍🏫 福利二:获取PaddleDetection历次发版直播视频与最新直播咨询 + - 🗳 福利三:获取垂类场景预训练模型集合,包括工业、安防、交通等5+行业场景 + - 🗂 福利四:获取10+全流程产业实操范例,覆盖火灾烟雾检测、人流量计数等产业高频场景 +
+ +
+
+## 套件结构概览
+
+<table align="center">
+  <tbody>
+    <tr align="center" valign="bottom">
+      <td><b>Architectures</b></td>
+      <td><b>Backbones</b></td>
+      <td><b>Components</b></td>
+      <td><b>Data Augmentation</b></td>
+    </tr>
+    <tr valign="top">
+      <td>
+        <b>Object Detection</b>
+        <ul>
+          <li>YOLOv3</li>
+          <li>YOLOv5</li>
+          <li>YOLOv6</li>
+          <li>YOLOv7</li>
+          <li>YOLOv8</li>
+          <li>PP-YOLOv1/v2</li>
+          <li>PP-YOLO-Tiny</li>
+          <li>PP-YOLOE</li>
+          <li>PP-YOLOE+</li>
+          <li>YOLOX</li>
+          <li>RTMDet</li>
+        </ul>
+      </td>
+      <td>
+        <ul>
+          <li>ResNet(&vd)</li>
+          <li>CSPResNet</li>
+          <li>DarkNet</li>
+          <li>CSPDarkNet</li>
+          <li>ConvNeXt</li>
+          <li>EfficientRep</li>
+          <li>CSPBepBackbone</li>
+          <li>ELANNet</li>
+          <li>CSPNeXt</li>
+        </ul>
+      </td>
+      <td>
+        <b>Common</b>
+        <ul>
+          <li>Sync-BN</li>
+          <li>Group Norm</li>
+          <li>DCNv2</li>
+          <li>EMA</li>
+        </ul>
+        <b>FPN</b>
+        <ul>
+          <li>YOLOv3FPN</li>
+          <li>PPYOLOFPN</li>
+          <li>PPYOLOTinyFPN</li>
+          <li>PPYOLOPAN</li>
+          <li>YOLOCSPPAN</li>
+          <li>Custom-PAN</li>
+          <li>RepPAN</li>
+          <li>CSPRepPAN</li>
+          <li>ELANFPN</li>
+          <li>ELANFPNP6</li>
+          <li>CSPNeXtPAFPN</li>
+        </ul>
+        <b>Loss</b>
+        <ul>
+          <li>Smooth-L1</li>
+          <li>GIoU/DIoU/CIoU</li>
+          <li>IoUAware</li>
+          <li>Focal Loss</li>
+          <li>VariFocal Loss</li>
+        </ul>
+        <b>Post-processing</b>
+        <ul>
+          <li>SoftNMS</li>
+          <li>MatrixNMS</li>
+        </ul>
+        <b>Speed</b>
+        <ul>
+          <li>FP16 training</li>
+          <li>Multi-machine training</li>
+        </ul>
+      </td>
+      <td>
+        <ul>
+          <li>Resize</li>
+          <li>Lighting</li>
+          <li>Flipping</li>
+          <li>Expand</li>
+          <li>Crop</li>
+          <li>Color Distort</li>
+          <li>Random Erasing</li>
+          <li>Mixup</li>
+          <li>AugmentHSV</li>
+          <li>Mosaic</li>
+          <li>Cutmix</li>
+          <li>Grid Mask</li>
+          <li>Auto Augment</li>
+          <li>Random Perspective</li>
+        </ul>
+      </td>
+    </tr>
+  </tbody>
+</table>
+ +## 模型性能概览 + +
+ 云端模型性能对比 + +各模型结构和骨干网络的代表模型在COCO数据集上精度mAP和单卡Tesla V100上预测速度(FPS)对比图。 + +
+ +
+
+**说明:**
+
+- `PP-YOLOE`是对`PP-YOLO v2`模型的进一步优化,在COCO数据集精度51.6%,Tesla V100预测速度78.1FPS
+- `PP-YOLOE+`是对`PP-YOLOE`模型的进一步优化,在COCO数据集精度53.3%,Tesla V100预测速度78.1FPS
+- 图中模型均可在[模型库](#模型库)中获取
+
+ +
+ 移动端模型性能对比 + +各移动端模型在COCO数据集上精度mAP和高通骁龙865处理器上预测速度(FPS)对比图。 + +
+ +
+ +**说明:** + +- 测试数据均使用高通骁龙865(4\*A77 + 4\*A55)处理器batch size为1, 开启4线程测试,测试使用NCNN预测库,测试脚本见[MobileDetBenchmark](https://github.com/JiweiMaster/MobileDetBenchmark) +- [PP-PicoDet](configs/picodet)及[PP-YOLO-Tiny](configs/ppyolo)为PaddleDetection自研模型,其余模型PaddleDetection暂未提供 + +
+ +## 模型库 + +
+ 1. 通用检测
+
+#### [PP-YOLOE+](./configs/ppyoloe)系列 推荐场景:Nvidia V100, T4等云端GPU和Jetson系列等边缘端设备
+
+| 模型名称 | COCO精度(mAP) | V100 TensorRT FP16速度(FPS) | 配置文件 | 模型下载 |
+|:------------|:-----------:|:-------------------------:|:------:|:--------:|
+| PP-YOLOE+_s | 43.9 | 333.3 | [链接](configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams) |
+| PP-YOLOE+_m | 50.0 | 208.3 | [链接](configs/ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams) |
+| PP-YOLOE+_l | 53.3 | 149.2 | [链接](configs/ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_l_80e_coco.pdparams) |
+| PP-YOLOE+_x | 54.9 | 95.2 | [链接](configs/ppyoloe/ppyoloe_plus_crn_x_80e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_x_80e_coco.pdparams) |
+
+#### 前沿检测算法
+
+| 模型名称 | COCO精度(mAP) | V100 TensorRT FP16速度(FPS) | 配置文件 | 模型下载 |
+|:------------|:-----------:|:-------------------------:|:------:|:--------:|
+| [YOLOX-l](configs/yolox) | 50.1 | 107.5 | [链接](configs/yolox/yolox_l_300e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/yolox_l_300e_coco.pdparams) |
+| [YOLOv5-l](configs/yolov5) | 48.6 | 136.0 | [链接](configs/yolov5/yolov5_l_300e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams) |
+| [YOLOv7-l](configs/yolov7) | 51.0 | 135.0 | [链接](configs/yolov7/yolov7_l_300e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams) |
+
+ + +## 文档教程 + +### 入门教程 + +- [安装说明](docs/tutorials/INSTALL_cn.md) +- [快速体验](docs/tutorials/QUICK_STARTED_cn.md) +- [数据准备](docs/tutorials/data/README.md) +- [PaddleDetection全流程使用](docs/tutorials/GETTING_STARTED_cn.md) +- [FAQ/常见问题汇总](docs/tutorials/FAQ) + +### 进阶教程 + +- 参数配置 + + - [PP-YOLO参数说明](docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation.md) + +- 模型压缩(基于[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)) + + - [剪裁/量化/蒸馏教程](configs/slim) + +- [推理部署](deploy/README.md) + + - [模型导出教程](deploy/EXPORT_MODEL.md) + - [Paddle Inference部署](deploy/README.md) + - [Python端推理部署](deploy/python) + - [C++端推理部署](deploy/cpp) + - [Paddle-Lite部署](deploy/lite) + - [Paddle Serving部署](deploy/serving) + - [ONNX模型导出](deploy/EXPORT_ONNX_MODEL.md) + - [推理benchmark](deploy/BENCHMARK_INFER.md) + +- 进阶开发 + + - [数据处理模块](docs/advanced_tutorials/READER.md) + - [新增检测模型](docs/advanced_tutorials/MODEL_TECHNICAL.md) + - 二次开发教程 + - [目标检测](docs/advanced_tutorials/customization/detection.md) + + +## 版本更新 + +版本更新内容请参考[版本更新文档](docs/CHANGELOG.md) + + +## 许可证书 + +本项目的发布受[GPL-3.0 license](LICENSE)许可认证。 + + +## 引用 + +``` +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +``` diff --git a/README_en.md b/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..947c8fe33fb37db0f49ba8ed9c0be98cebea2907 --- /dev/null +++ b/README_en.md @@ -0,0 +1,446 @@ +[简体中文](README_cn.md) | English + +## Introduction + +**PaddleYOLO** is a YOLO series toolbox based on [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection), **only relevant codes of YOLO series models are included**. It supports `YOLOv3`,`PP-YOLO`,`PP-YOLOv2`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`,`YOLOv8`,`YOLOv5u`,`YOLOv7u`,`RTMDet` and so on, see COCO dataset ModelZoo in [ModelZoo](docs/MODEL_ZOO_en.md) and [configs](configs/). + +
+ + +
+ +**Notes:** + - The license of **PaddleYOLO** is **[GPL 3.0](LICENSE)**, and the code of [YOLOv5](configs/yolov5), [YOLOv6](configs/yolov6), [YOLOv7](configs/yolov7) and [YOLOv8](configs/yolov8) will not be merged into [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). Except for these models, the other YOLO models are recommended to be used through [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection), **which will be the first to release the latest progress of the PP-YOLO series detection models**; + - To use **PaddleYOLO**, **PaddlePaddle-2.3.2 or above is recommended**; please refer to the [official website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) to download the appropriate version. **For Windows platforms, please install the paddle develop version**; + - **PaddleYOLO's [Roadmap](https://github.com/PaddlePaddle/PaddleYOLO/issues/44)** issue collects feature requests from users; any opinions and suggestions are welcome. + +## Tutorials + +
+Install + +Clone repo and install [requirements.txt](./requirements.txt) in a +[**Python>=3.7.0**](https://www.python.org/) environment, including +[**PaddlePaddle>=2.3.2**](https://www.paddlepaddle.org.cn/install/). + +```bash +git clone https://github.com/PaddlePaddle/PaddleYOLO # clone +cd PaddleYOLO +pip install -r requirements.txt # install +``` + +
+ + +
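After installing, a short sanity check of the environment can save a failed first training run. The snippet below is a minimal sketch and not part of the original instructions; it assumes a PaddlePaddle build that matches your hardware.

```bash
# Optional sanity check: print the installed Paddle version and verify the device setup
# (paddle.utils.run_check() is Paddle's built-in installation check)
python -c "import paddle; print(paddle.__version__); paddle.utils.run_check()"
```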
+Training/Evaluation/Inference + +Write the following commands in a script file, such as ```run.sh```, and run as:```sh run.sh```. You can also run the command line sentence by sentence. + +```bash +model_name=ppyoloe # yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 1.training(single GPU / multi GPU) +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp +python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp + +# 2.eval +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise + +# 3.infer +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5 +# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5 +``` + +
+ + +
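If you only want to evaluate a released checkpoint without training first, the commented `weights=` URL pattern in the script above can be used directly. This is a hedged sketch: the model name is simply the one used in that script, and if the exact file is not hosted you can point `weights=` at a local `.pdparams` instead.

```bash
# Evaluate a released checkpoint straight from its URL
# (URL pattern copied from the commented "weights=" line in the script above)
CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml \
    -o weights=https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_s_80e_coco.pdparams --classwise
```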
+Deployment/Speed + +Write the following commands in a script file, such as ```run.sh```, and run as:```sh run.sh```. You can also run the command line sentence by sentence. + +```bash +model_name=ppyoloe # yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 4.export +CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True + +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True + +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True + +# 5.deploy infer +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU + +# 6.deploy speed, add '--run_mode=trt_fp16' to test in TensorRT FP16 mode +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16 + +# 7.export onnx +paddle2onnx --model_dir output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx + +# 8.onnx speed +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32 +``` + +**Note:** +- If you want to switch models, just modify the first two lines, such as: + ``` + model_name=yolov7 + job_name=yolov7_tiny_300e_coco + ``` +- For **exporting onnx**, you should install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) by `pip install paddle2onnx` at first. +- For **FLOPs(G) and Params(M)**, you should install [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) by `pip install paddleslim` at first, then set `print_flops: True` and `print_params: True` in [runtime.yml](configs/runtime.yml). Make sure **single scale** like 640x640, **MACs are printed,FLOPs=2*MACs**. + +
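To make the optional dependencies mentioned in the note above explicit, here is a small sketch. The `sed` edits are only a convenience and assume the default `configs/runtime.yml` shipped in this patch (which contains `print_flops: false` and `print_params: false`); you can of course edit the file by hand instead.

```bash
# Optional extras: ONNX export and FLOPs/Params printing
pip install paddle2onnx paddleslim

# Turn on FLOPs/Params printing in configs/runtime.yml
# (use a single-scale input such as 640x640; the printed values are MACs, FLOPs = 2 * MACs)
sed -i 's/print_flops: false/print_flops: true/; s/print_params: false/print_params: true/' configs/runtime.yml
```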
+ + +
+ [Training Custom dataset](https://github.com/PaddlePaddle/PaddleYOLO/issues/43) + +- Please refer to the [doc](docs/MODEL_ZOO_en.md#CustomDataset) and this [issue](https://github.com/PaddlePaddle/PaddleYOLO/issues/43). +- The PaddleDetection team provides various **feature detection models based on PP-YOLOE**, which can also be used as references when adapting to your custom dataset. Please refer to [PP-YOLOE application](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/ppyoloe/application), [pphuman](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/pphuman), [ppvehicle](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/ppvehicle), [visdrone](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/visdrone) and [smalldet](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/smalldet). +- PaddleDetection also provides **various YOLO models for the VOC dataset**, which can likewise serve as references for your custom dataset. Please refer to [voc](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc). +- Please **make sure the corresponding COCO-trained weights are loaded as pre-training weights** first. Set `pretrain_weights:` to the corresponding COCO-trained weights in the config file; a warning that the channel number of the head classification layer does not match is normal, because the number of classes in a custom dataset generally differs from that of COCO. +- We recommend training YOLO detection models **with a total `batch_size` of at least `64`**. If resources are insufficient, please **use a smaller model** or **reduce the input size of the model**. To ensure high detection accuracy, **avoid training with a single GPU or a total `batch_size` less than `64`**; a minimal command sketch is given right after this list. + +
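As a concrete illustration of the points above, here is a minimal fine-tuning sketch. The config path, weights URL, and GPU list are placeholders taken from the examples elsewhere in this README; substitute your customized config and the matching COCO-trained weights.

```bash
# Fine-tune on a custom dataset, loading the matching COCO-trained weights as pretrain_weights
# (config path, weights URL and GPU list are placeholders; keep the total batch size >= 64 if possible)
config=configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml   # replace with your customized config
pretrain=https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams

python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c ${config} \
    -o pretrain_weights=${pretrain} --eval --amp
```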
+ + +## Updates + +* 【2023/03/13】Support [YOLOv5u](configs/yolov5/yolov5u) and [YOLOv7u](configs/yolov7/yolov7u) inference and deploy; +* 【2023/01/10】Support [YOLOv8](configs/yolov8) inference and deploy; +* 【2022/09/29】Support [RTMDet](configs/rtmdet) inference and deploy; +* 【2022/09/26】Release [PaddleYOLO](https://github.com/PaddlePaddle/PaddleYOLO), see [ModelZoo](docs/MODEL_ZOO_en.md); +* 【2022/09/19】Support the new version of [YOLOv6](configs/yolov6), including n/t/s/m/l model; +* 【2022/08/23】Release `YOLOSeries` codebase: support `YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6` and `YOLOv7`; support using `ConvNeXt` backbone to get high-precision version of `PP-YOLOE`,`YOLOX` and `YOLOv5`; support PaddleSlim accelerated quantitative training `PP-YOLOE`,`YOLOv5`,`YOLOv6` and `YOLOv7`. For details, please read this [article](https://mp.weixin.qq.com/s/Hki01Zs2lQgvLSLWS0btrA); + + +## Product Update + +- 🔥 **2023.3.14:Release PaddleYOLO [release/2.6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6)** + - 💡 Model kit: + - Support `YOLOv8`,`YOLOv5u`,`YOLOv7u` inference and deploy. + - Support `Swin-Transformer`、`ViT`、`FocalNet` backbone to get high-precision version of `PP-YOLOE+`. + - Support `YOLOv8` in [FastDeploy](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection). + +- 🔥 **2022.9.26:Release PaddleYOLO [release/2.5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.5)** + - 💡 Model kit: + - Release [PaddleYOLO](https://github.com/PaddlePaddle/PaddleYOLO): support `YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6` and `YOLOv7`; support using `ConvNeXt` backbone to get high-precision version of `PP-YOLOE`,`YOLOX` and `YOLOv5`; support PaddleSlim accelerated quantitative training `PP-YOLOE`,`YOLOv5`,`YOLOv6` and `YOLOv7`. + +- 🔥 **2022.8.26:PaddleDetection [release/2.5 version](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5)** + + - 🗳 Model features: + + - Release [PP-YOLOE+](configs/ppyoloe): Increased accuracy by a maximum of 2.4% mAP to 54.9% mAP, 3.75 times faster model training convergence rate, and up to 2.3 times faster end-to-end inference speed; improved generalization for multiple downstream tasks + - Release [PicoDet-NPU](configs/picodet) model which supports full quantization deployment of models; add [PicoDet](configs/picodet) layout analysis model + - Release [PP-TinyPose Plus](./configs/keypoint/tiny_pose/). With 9.1% AP accuracy improvement in physical exercise, dance, and other scenarios, our PP-TinyPose Plus supports unconventional movements such as turning to one side, lying down, jumping, and high lifts + + - 🔮 Functions in different scenarios + + - Release the pedestrian analysis tool [PP-Human v2](./deploy/pipeline). It introduces four new behavior recognition: fighting, telephoning, smoking, and trespassing. The underlying algorithm performance is optimized, covering three core algorithm capabilities: detection, tracking, and attributes of pedestrians. Our model provides end-to-end development and model optimization strategies for beginners and supports online video streaming input. + - First release [PP-Vehicle](./deploy/pipeline), which has four major functions: license plate recognition, vehicle attribute analysis (color, model), traffic flow statistics, and violation detection. It is compatible with input formats, including pictures, online video streaming, and video. And we also offer our users a comprehensive set of tutorials for customization. 
+ + - 💡 Cutting-edge algorithms: + + - Covers [YOLO family](https://github.com/PaddlePaddle/PaddleYOLO) classic and latest models: YOLOv3, PP-YOLOE (a real-time high-precision object detection model developed by Baidu PaddlePaddle), and cutting-edge detection algorithms such as YOLOv4, YOLOv5, YOLOX, YOLOv6, and YOLOv7 + - Newly added a high-precision detection model based on the [ViT](configs/vitdet) backbone network, with 55.7% mAP accuracy on the COCO dataset; newly added the multi-object tracking model [OC-SORT](configs/mot/ocsort); newly added the [ConvNeXt](configs/convnext) backbone network. + + - 📋 Industrial applications: Newly added [Smart Fitness](https://aistudio.baidu.com/aistudio/projectdetail/4385813), [Fighting recognition](https://aistudio.baidu.com/aistudio/projectdetail/4086987?channelType=0&channel=0), and [Visitor Analysis](https://aistudio.baidu.com/aistudio/projectdetail/4230123?channelType=0&channel=0). + +- 2022.3.24: PaddleDetection released [release/2.4 version](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4) + - Release of the high-performance SOTA object detection model [PP-YOLOE](configs/ppyoloe). It integrates cloud and edge devices and provides S/M/L/X versions. In particular, version L reaches 51.4% accuracy on the COCO test 2017 dataset and an inference speed of 78.1 FPS on a single Tesla V100. It supports mixed precision training and is 33% faster to train than PP-YOLOv2. Its full range of multi-sized models can meet different hardware arithmetic requirements, and it is adaptable to server and edge-device GPUs and other AI accelerator cards on servers. + - Release of the ultra-lightweight SOTA object detection model [PP-PicoDet Plus](configs/picodet) with a 2% improvement in accuracy and a 63% improvement in CPU inference speed. Added the PicoDet-XS model with 0.7M parameters, providing model sparsification and quantization functions for model acceleration. No specific post-processing module is required on any hardware, simplifying deployment. + - Release of the real-time pedestrian analysis tool [PP-Human](deploy/pphuman). It has four major functions: pedestrian tracking, visitor flow statistics, human attribute recognition and falling detection. Falling detection is optimized based on real-life data, with accurate recognition of various types of falling posture, and it adapts to different environmental backgrounds, lighting and camera angles. + - Added the [YOLOX](configs/yolox) object detection model with nano/tiny/S/M/L/X versions. The X version reaches 51.8% accuracy on the COCO Val2017 dataset. + +- [More releases](https://github.com/PaddlePaddle/PaddleDetection/releases) + +## Brief Introduction + +**PaddleDetection** is an end-to-end object detection development kit based on PaddlePaddle. Providing **over 30 model algorithms** and **over 250 pre-trained models**, it covers object detection, instance segmentation, keypoint detection and multi-object tracking. In particular, PaddleDetection offers **high-performance & light-weight** industrial SOTA models on **server and mobile** devices, champion solutions and cutting-edge algorithms. PaddleDetection provides various data augmentation methods, configurable network components, loss functions and other advanced optimization & deployment schemes. In addition to running through the whole process of data processing, model development, training, compression and deployment, PaddlePaddle also provides rich cases and tutorials to accelerate the industrial application of the algorithms. +
+ +
+ +## Features + +- **Rich model library**: PaddleDetection provides over 250 pre-trained models covering **object detection, instance segmentation, face recognition and multi-object tracking**, including a variety of **global competition champion** schemes. +- **Simple to use**: Modular design decouples each network component, making it easy for developers to build and try various detection models and optimization strategies and to quickly obtain high-performance, customized algorithms. +- **Getting Through End to End**: PaddlePaddle covers the whole pipeline from data augmentation, model construction, training and compression to deployment, and supports multi-architecture, multi-device deployment for **cloud and edge** devices. +- **High Performance**: Thanks to its high-performance core, PaddlePaddle has clear advantages in training speed and memory occupation. It also supports FP16 training and multi-machine training. +
+ +
+ +## Exchanges + +- If you have any questions or suggestions, please give us your valuable input via [GitHub Issues](https://github.com/PaddlePaddle/PaddleDetection/issues) + + You are welcome to join the PaddleDetection user groups on WeChat (scan the QR code, add the assistant and reply "D") + +
+ +
+ +## Kit Structure

**Architectures (Object Detection):** YOLOv3, YOLOv5, YOLOv6, YOLOv7, YOLOv8, PP-YOLOv1/v2, PP-YOLO-Tiny, PP-YOLOE, PP-YOLOE+, YOLOX, RTMDet

**Backbones:** ResNet(&vd), CSPResNet, DarkNet, CSPDarkNet, ConvNeXt, EfficientRep, CSPBepBackbone, ELANNet, CSPNeXt

**Components:**
- Common: Sync-BN, Group Norm, DCNv2, EMA
- FPN: YOLOv3FPN, PPYOLOFPN, PPYOLOTinyFPN, PPYOLOPAN, YOLOCSPPAN, Custom-PAN, RepPAN, CSPRepPAN, ELANFPN, ELANFPNP6, CSPNeXtPAFPN
- Loss: Smooth-L1, GIoU/DIoU/CIoU, IoUAware, Focal Loss, VariFocal Loss
- Post-processing: SoftNMS, MatrixNMS
- Speed: FP16 training, Multi-machine training

**Data Augmentation:** Resize, Lighting, Flipping, Expand, Crop, Color Distort, Random Erasing, Mixup, AugmentHSV, Mosaic, Cutmix, Grid Mask, Auto Augment, Random Perspective
+ +## Model Performance + +
+ Performance comparison of Cloud models + +Comparison of COCO mAP and FPS on a single Tesla V100 for representative models of each architecture and backbone. + +
+ +
+ +**Clarification:** + +- `PP-YOLOE` is an optimized version of `PP-YOLO v2`. It reaches 51.4% accuracy on the COCO dataset with an inference speed of 78.1 FPS on Tesla V100 +- `PP-YOLOE+` is an optimized version of `PP-YOLOE`. It reaches 53.3% accuracy on the COCO dataset with an inference speed of 78.1 FPS on Tesla V100 +- The models in the figure are available in the [model library](#模型库) +
+ +
+ Performance comparison on mobile devices + +Comparison of COCO mAP and FPS on the Qualcomm Snapdragon 865 processor for mobile models. + +
+ +
+ +**Clarification:** + +- Tests were conducted on a Qualcomm Snapdragon 865 (4\*A77 + 4\*A55) with batch_size=1 and 4 threads, using the NCNN inference library; the test script is available at [MobileDetBenchmark](https://github.com/JiweiMaster/MobileDetBenchmark) +- [PP-PicoDet](configs/picodet) and [PP-YOLO-Tiny](configs/ppyolo) are models developed by PaddleDetection; the other models have not been tested by PaddleDetection yet. +
+ +## Model libraries + +
+ 1. General detection + +#### PP-YOLOE series Recommended scenarios: Cloud GPU such as Nvidia V100, T4 and edge devices such as Jetson series + +| Model | COCO Accuracy(mAP) | V100 TensorRT FP16 Speed(FPS) | Configuration | Download | +|:---------- |:------------------:|:-----------------------------:|:-------------------------------------------------------:|:----------------------------------------------------------------------------------------:| +| PP-YOLOE+_s | 43.9 | 333.3 | [link](configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams) | +| PP-YOLOE+_m | 50.0 | 208.3 | [link](configs/ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams) | +| PP-YOLOE+_l | 53.3 | 149.2 | [link](configs/ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams) | +| PP-YOLOE+_x | 54.9 | 95.2 | [link](configs/ppyoloe/ppyoloe_plus_crn_x_80e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_x_80e_coco.pdparams) | + +#### Frontier detection algorithm + +| Model | COCO Accuracy(mAP) | V100 TensorRT FP16 speed(FPS) | Configuration | Download | +|:-------- |:------------------:|:-----------------------------:|:--------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------:| +| [YOLOX-l](configs/yolox) | 50.1 | 107.5 | [Link](configs/yolox/yolox_l_300e_coco.yml) | [Download](https://paddledet.bj.bcebos.com/models/yolox_l_300e_coco.pdparams) | +| [YOLOv5-l](configs/yolov5) | 48.6 | 136.0 | [Link](configs/yolov5/yolov5_l_300e_coco.yml) | [Download](https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams) | +| [YOLOv7-l](configs/yolov7) | 51.0 | 135.0 | [链接](configs/yolov7/yolov7_l_300e_coco.yml) | [下载地址](https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams) | + +
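To try one of the models listed above without training, the config and download columns can be plugged straight into the inference command from the Tutorials section. A minimal sketch using the PP-YOLOE+_s row (paths and URL copied from that row):

```bash
# Visualize detections with the PP-YOLOE+_s entry from the table above
CUDA_VISIBLE_DEVICES=0 python tools/infer.py \
    -c configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml \
    -o weights=https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams \
    --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5
```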
+ +## Document tutorials + +### Introductory tutorials + +- [Installation](docs/tutorials/INSTALL_cn.md) +- [Quick start](docs/tutorials/QUICK_STARTED_cn.md) +- [Data preparation](docs/tutorials/data/README.md) +- [Geting Started on PaddleDetection](docs/tutorials/GETTING_STARTED_cn.md) +- [FAQ]((docs/tutorials/FAQ) + +### Advanced tutorials + +- Configuration + + - [PP-YOLO Configuration](docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation.md) + +- Compression based on [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) + + - [Pruning/Quantization/Distillation Tutorial](configs/slim) + +- [Inference deployment](deploy/README.md) + + - [Export model for inference](deploy/EXPORT_MODEL.md) + + - [Paddle Inference deployment](deploy/README.md) + + - [Inference deployment with Python](deploy/python) + - [Inference deployment with C++](deploy/cpp) + + - [Paddle-Lite deployment](deploy/lite) + + - [Paddle Serving deployment](deploy/serving) + + - [ONNX model export](deploy/EXPORT_ONNX_MODEL.md) + + - [Inference benchmark](deploy/BENCHMARK_INFER.md) + +- Advanced development + + - [Data processing module](docs/advanced_tutorials/READER.md) + - [New object detection models](docs/advanced_tutorials/MODEL_TECHNICAL.md) + - Custumization + - [Object detection](docs/advanced_tutorials/customization/detection.md) + + +## Version updates + +Please refer to the[ Release note ](https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-2.3.0-Release-Note-EN)for more details about the updates + +## License + +PaddleYOLO is provided under the [GPL-3.0 license](LICENSE) + +## Quote + +``` +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +``` diff --git a/asserts/Algorithm_principle.png b/asserts/Algorithm_principle.png new file mode 100644 index 0000000000000000000000000000000000000000..f89b410451c34db2ef69520311beec9f8b47e2ac Binary files /dev/null and b/asserts/Algorithm_principle.png differ diff --git a/asserts/Backbone.png b/asserts/Backbone.png new file mode 100644 index 0000000000000000000000000000000000000000..ebf87bbfa7004f447b8bab8a2f0b41f3b2bc7d9d Binary files /dev/null and b/asserts/Backbone.png differ diff --git a/asserts/model.properties b/asserts/model.properties new file mode 100644 index 0000000000000000000000000000000000000000..7d058ca71c47ff11dd8cb085b62cce3a7670e1db --- /dev/null +++ b/asserts/model.properties @@ -0,0 +1,10 @@ +# 模型唯一标识 +modelCode = 469 +# 模型名称 +modelName=yolov5_paddle +# 模型描述 +modelDescription=yolov5是一种基于深度学习的目标检测算法,可以广泛应用于各种计算机视觉和人工智能领域的应用中 +# 应用场景 +appScenario=推理,训练,金融,交通,教育 +# 框架类型 +frameType=paddle \ No newline at end of file diff --git a/asserts/result_000000014439_640x640.jpg b/asserts/result_000000014439_640x640.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d1f46e9a256c3b4c714203a15706946c2043f877 Binary files /dev/null and b/asserts/result_000000014439_640x640.jpg differ diff --git a/configs/datasets/coco_detection.yml b/configs/datasets/coco_detection.yml new file mode 100644 index 0000000000000000000000000000000000000000..df50dab1bc6f3f72a186bcb7bcecf4b7134c14ef --- /dev/null +++ b/configs/datasets/coco_detection.yml @@ -0,0 +1,20 @@ +metric: COCO +num_classes: 80 + +TrainDataset: + name: COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + data_fields: ['image', 
'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + name: COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + +TestDataset: + name: ImageFolder + anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) + dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' diff --git a/configs/datasets/objects365_detection.yml b/configs/datasets/objects365_detection.yml new file mode 100644 index 0000000000000000000000000000000000000000..735ebf96dcea828459428016bad764c8461e8ee8 --- /dev/null +++ b/configs/datasets/objects365_detection.yml @@ -0,0 +1,21 @@ +metric: COCO +num_classes: 365 + +TrainDataset: + !COCODataSet + image_dir: train + anno_path: annotations/zhiyuan_objv2_train.json + dataset_dir: dataset/objects365 + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: val + anno_path: annotations/zhiyuan_objv2_val.json + dataset_dir: dataset/objects365 + allow_empty: true + +TestDataset: + !ImageFolder + anno_path: annotations/zhiyuan_objv2_val.json + dataset_dir: dataset/objects365/ diff --git a/configs/datasets/roadsign_voc.yml b/configs/datasets/roadsign_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..9a081611aa8dafef5d5c6f1af1476cc038db5702 --- /dev/null +++ b/configs/datasets/roadsign_voc.yml @@ -0,0 +1,21 @@ +metric: VOC +map_type: integral +num_classes: 4 + +TrainDataset: + name: VOCDataSet + dataset_dir: dataset/roadsign_voc + anno_path: train.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + name: VOCDataSet + dataset_dir: dataset/roadsign_voc + anno_path: valid.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + name: ImageFolder + anno_path: dataset/roadsign_voc/label_list.txt diff --git a/configs/datasets/visdrone_detection.yml b/configs/datasets/visdrone_detection.yml new file mode 100644 index 0000000000000000000000000000000000000000..37feb6e2618ff9d83ce2842a9e581dcfd31efc78 --- /dev/null +++ b/configs/datasets/visdrone_detection.yml @@ -0,0 +1,22 @@ +metric: COCO +num_classes: 10 + +TrainDataset: + !COCODataSet + image_dir: VisDrone2019-DET-train + anno_path: train.json + dataset_dir: dataset/visdrone + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: VisDrone2019-DET-val + anno_path: val.json + # image_dir: test_dev + # anno_path: test_dev.json + dataset_dir: dataset/visdrone + +TestDataset: + !ImageFolder + anno_path: val.json + dataset_dir: dataset/visdrone diff --git a/configs/datasets/voc.yml b/configs/datasets/voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..72182bed9d17ca076c94a1872613ce7ad29d36d9 --- /dev/null +++ b/configs/datasets/voc.yml @@ -0,0 +1,21 @@ +metric: VOC +map_type: 11point +num_classes: 20 + +TrainDataset: + name: VOCDataSet + dataset_dir: dataset/voc + anno_path: trainval.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + name: VOCDataSet + dataset_dir: dataset/voc + anno_path: test.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + name: ImageFolder + anno_path: dataset/voc/label_list.txt diff --git a/configs/runtime.yml b/configs/runtime.yml new file mode 100644 index 
0000000000000000000000000000000000000000..a58b171ce774e045f4db2e0894a6781a25e0ec03 --- /dev/null +++ b/configs/runtime.yml @@ -0,0 +1,16 @@ +use_gpu: true +use_xpu: false +use_mlu: false +use_npu: false +log_iter: 20 +save_dir: output +snapshot_epoch: 1 +print_flops: false +print_params: false + +# Exporting the model +export: + post_process: True # Whether post-processing is included in the network when export model. + nms: True # Whether NMS is included in the network when export model. + benchmark: False # It is used to testing model performance, if set `True`, post-process and NMS will not be exported. + fuse_conv_bn: False diff --git a/configs/yolov5/README.md b/configs/yolov5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a002d8ac76df8a29cc6ad722445a4df033c1f95 --- /dev/null +++ b/configs/yolov5/README.md @@ -0,0 +1,245 @@ +# YOLOv5 + +## 内容 +- [模型库](#模型库) +- [使用说明](#使用说明) +- [速度测试](#速度测试) + +## 模型库 + +### 基础模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | 模型推理耗时(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5-n | 640 | 16 | 300e | 1.5 | 28.0 | 45.7 | 1.87 | 4.52 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_n_300e_coco.pdparams) | [配置文件](./yolov5_n_300e_coco.yml) | +| YOLOv5-s | 640 | 16 | 300e | 2.6 | 37.6 | 56.7 | 7.24 | 16.54 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams) | [配置文件](./yolov5_s_300e_coco.yml) | +| YOLOv5-m | 640 | 16 | 300e | 5.2 | 45.4 | 64.1 | 21.19 | 49.08 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_m_300e_coco.pdparams) | [配置文件](./yolov5_m_300e_coco.yml) | +| YOLOv5-l | 640 | 16 | 300e | 7.9 | 48.9 | 67.1 | 46.56 | 109.32 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams) | [配置文件](./yolov5_l_300e_coco.yml) | +| YOLOv5-x | 640 | 16 | 300e | 13.7 | 50.6 | 68.7 | 86.75 | 205.92 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_x_300e_coco.pdparams) | [配置文件](./yolov5_x_300e_coco.yml) | +| YOLOv5-s ConvNeXt| 640 | 8 | 36e | - | 42.4 | 65.3 | 34.54 | 17.96 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_convnext_s_36e_coco.pdparams) | [配置文件](../convnext/yolov5_convnext_s_36e_coco.yml) | + +### SSOD预训练模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | 模型推理耗时(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5-s | 640 | 16 | 80e | 2.6 | 38.8 | 58.2 | 7.24 | 16.54 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_s_80e_ssod_finetune_coco.pdparams) | [配置文件](./yolov5_s_80e_ssod_finetune_coco.yml) | + +### Objects-365预训练模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | 模型推理耗时(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5-l | 640 | 16 | 30e | 7.9 | 49.3 | 67.8 | 46.56 | 109.32 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_l_30e_obj365_finetune_coco.pdparams) | [配置文件](./yolov5_l_30e_obj365_finetune_coco.yml) | + +### P6大尺度模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | 模型推理耗时(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5p6-n | 1280 | 16 | 300e | - | 35.9 | 54.2 | 3.25 | 9.23 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5p6_n_300e_coco.pdparams) | [配置文件](./yolov5p6_n_300e_coco.yml) | +| YOLOv5p6-s | 1280 | 16 | 300e | - | 44.5 | 63.3 | 12.63 | 33.81 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5p6_s_300e_coco.pdparams) | [配置文件](./yolov5p6_s_300e_coco.yml) | +| YOLOv5p6-m | 1280 | 16 | 300e | - | 51.1 | 69.0 | 35.73 | 100.21 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5p6_m_300e_coco.pdparams) | [配置文件](./yolov5p6_m_300e_coco.yml) | +| YOLOv5p6-l | 1280 | 8 | 300e | - | 53.4 | 71.0 | 76.77 | 223.09 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5p6_l_300e_coco.pdparams) | [配置文件](./yolov5p6_l_300e_coco.yml) | +| YOLOv5p6-x | 1280 | 8 | 300e | - | 54.7 | 72.4 | 140.80 | 420.03 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5p6_x_300e_coco.pdparams) | [配置文件](./yolov5p6_x_300e_coco.yml) | + +### [YOLOv5u](../yolov5u) + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | 模型推理耗时(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5u-n | 640 | 16 | 300e | 1.61 | 34.5 | 49.7 | 2.65 | 7.79 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5u_n_300e_coco.pdparams) | [配置文件](./yolov5u/yolov5u_n_300e_coco.yml) | +| YOLOv5u-s | 640 | 16 | 300e | 2.66 | 43.0 | 59.7 | 9.15 | 24.12 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5u_s_300e_coco.pdparams) | [配置文件](./yolov5u/yolov5u_s_300e_coco.yml) | +| YOLOv5u-m | 640 | 16 | 300e | 5.50 | 49.0 | 65.7 | 25.11 | 64.42 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5u_m_300e_coco.pdparams) | [配置文件](./yolov5u/yolov5u_m_300e_coco.yml) | +| YOLOv5u-l | 640 | 16 | 300e | 8.73 | 52.2 | 69.0 | 53.23 | 135.34 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5u_l_300e_coco.pdparams) | [配置文件](./yolov5u/yolov5u_l_300e_coco.yml) | +| YOLOv5u-x | 640 | 16 | 300e | 15.49 | 53.1 | 69.9 | 97.28 | 246.89 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5u_x_300e_coco.pdparams) | [配置文件](./yolov5u/yolov5u_x_300e_coco.yml) | + + +**注意:** + - YOLOv5模型训练使用COCO train2017作为训练集,Box AP为在COCO val2017上的`mAP(IoU=0.5:0.95)`结果; + - YOLOv5u 模型表示YOLOv5结构使用YOLOv8的head和loss,是Anchor Free的检测方案,具体可参照[YOLOv5u](../yolov5u); + - YOLOv5模型训练过程中默认使用8 GPUs进行混合精度训练,默认lr为0.01为8卡总batch_size的设置,如果**GPU卡数**或者每卡**batch size**发生改动,也不需要改动学习率,但为了保证高精度最好使用**总batch size大于64**的配置去训练; + - 模型推理耗时(ms)为TensorRT-FP16下测试的耗时,不包含数据预处理和模型输出后处理(NMS)的耗时。测试采用单卡Tesla T4 GPU,batch size=1,测试环境为**paddlepaddle-2.3.2**, **CUDA 11.2**, **CUDNN 8.2**, **GCC-8.2**, **TensorRT 8.0.3.4**,具体请参考[速度测试](#速度测试)。 + - 如果你设置了`--run_benchmark=True`, 你首先需要安装以下依赖`pip install pynvml psutil GPUtil`。 + +### 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOv5-n | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.onnx) | +| YOLOv5-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.onnx) | +| YOLOv5-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.onnx) | +| YOLOv5-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.onnx) | [( w/o 
nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.onnx) | +| YOLOv5-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.onnx) | + +## 使用教程 + +### 0. **一键运行全流程** + +将以下命令写在一个脚本文件里如```run.sh```,一键运行命令为:```sh run.sh```,也可命令行一句句去运行。 + +```bash +model_name=yolov5 # 可修改,如 ppyoloe +job_name=yolov5_s_300e_coco # 可修改,如 ppyoloe_plus_crn_s_80e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 1.训练(单卡/多卡),加 --eval 表示边训边评估,加 --amp 表示混合精度训练 +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp +python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp + +# 2.评估,加 --classwise 表示输出每一类mAP +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise + +# 3.预测 (单张图/图片文件夹) +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5 +# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5 + +# 4.导出模型,以下3种模式选一种 +## 普通导出,加trt表示用于trt加速,对NMS和silu激活函数提速明显 +CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True + +## exclude_post_process去除后处理导出,返回和YOLOv5导出ONNX时相同格式的concat后的1个Tensor,是未缩放回原图的坐标+分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True + +## exclude_nms去除NMS导出,返回2个Tensor,是缩放回原图后的坐标和分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True + +# 5.部署预测,注意不能使用 去除后处理 或 去除NMS 导出后的模型去预测 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU + +# 6.部署测速,加 “--run_mode=trt_fp16” 表示在TensorRT FP16模式下测速,注意如需用到 trt_fp16 则必须为加 trt=True 导出的模型 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16 + +# 7.onnx导出,一般结合 exclude_post_process去除后处理导出的模型 +paddle2onnx --model_dir output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx + +# 8.onnx trt测速 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32 +``` + +### 1. 训练 +执行以下指令使用混合精度训练YOLOv5 +```bash +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov5/yolov5_s_300e_coco.yml --amp --eval +``` +**注意:** +- `--amp`表示开启混合精度训练以避免显存溢出,`--eval`表示边训边验证。 + +### 2. 
评估 +执行以下命令在单个GPU上评估COCO val2017数据集 +```bash +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams +``` + +### 3. 推理 +使用以下命令在单张GPU上预测图片,使用`--infer_img`推理单张图片以及使用`--infer_dir`推理文件中的所有图片。 +```bash +# 推理单张图片 +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams --infer_img=demo/000000014439_640x640.jpg + +# 推理文件中的所有图片 +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams --infer_dir=demo +``` + +### 4.导出模型 +YOLOv5在GPU上推理部署或benchmark测速等需要通过`tools/export_model.py`导出模型。 + +当你**使用Paddle Inference但不使用TensorRT**时,运行以下的命令导出模型 + +```bash +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams +``` + +当你**使用Paddle Inference且使用TensorRT**时,需要指定`-o trt=True`来导出模型。 + +```bash +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams trt=True +``` + +如果你想将YOLOv5模型导出为**ONNX格式**,参考 +[PaddleDetection模型导出为ONNX格式教程](../../deploy/EXPORT_ONNX_MODEL.md),运行以下命令: + +```bash + +# 导出推理模型 +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml --output_dir=output_inference -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams + +# 安装paddle2onnx +pip install paddle2onnx + +# 转换成onnx格式 +paddle2onnx --model_dir output_inference/yolov5_s_300e_coco --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 11 --save_file yolov5_s_300e_coco.onnx +``` + +**注意:** ONNX模型目前只支持batch_size=1 + + +### 5.推理部署 +YOLOv5可以使用以下方式进行部署: + - Paddle Inference [Python](../../deploy/python) & [C++](../../deploy/cpp) + - [Paddle-TensorRT](../../deploy/TENSOR_RT.md) + - [PaddleServing](https://github.com/PaddlePaddle/Serving) + - [PaddleSlim模型量化](../slim) + +运行以下命令导出模型 + +```bash +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams trt=True +``` + +**注意:** +- trt=True表示**使用Paddle Inference且使用TensorRT**进行测速,速度会更快,默认不加即为False,表示**使用Paddle Inference但不使用TensorRT**进行测速。 +- 如果是使用Paddle Inference在TensorRT FP16模式下部署,需要参考[Paddle Inference文档](https://www.paddlepaddle.org.cn/inference/master/user_guides/download_lib.html#python),下载并安装与你的CUDA, CUDNN和TensorRT相应的wheel包。 + +#### 5.1.Python部署 +`deploy/python/infer.py`使用上述导出后的Paddle Inference模型用于推理和benchnark测速,如果设置了`--run_benchmark=True`, 首先需要安装以下依赖`pip install pynvml psutil GPUtil`。 + +```bash +# Python部署推理单张图片 +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_file=demo/000000014439_640x640.jpg --device=gpu + +# 推理文件夹下的所有图片 +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_dir=demo/ --device=gpu +``` + +#### 5.2. 
C++部署 +`deploy/cpp/build/main`使用上述导出后的Paddle Inference模型用于C++推理部署, 首先按照[docs](../../deploy/cpp/docs)编译安装环境。 +```bash +# C++部署推理单张图片 +./deploy/cpp/build/main --model_dir=output_inference/yolov5_s_300e_coco/ --image_file=demo/000000014439_640x640.jpg --run_mode=paddle --device=GPU --threshold=0.5 --output_dir=cpp_infer_output/yolov5_s_300e_coco +``` + + +## 速度测试 + +为了公平起见,在[模型库](#模型库)中的速度测试结果均为不包含数据预处理和模型输出后处理(NMS)的数据(与[YOLOv4(AlexyAB)](https://github.com/AlexeyAB/darknet)测试方法一致),需要在导出模型时指定`-o exclude_nms=True`。测速需设置`--run_benchmark=True`, 首先需要安装以下依赖`pip install pynvml psutil GPUtil`。 + +**使用Paddle Inference但不使用TensorRT**进行测速,执行以下命令: + +```bash +# 导出模型 +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams exclude_nms=True + +# 速度测试,使用run_benchmark=True +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_file=demo/000000014439_640x640.jpg --run_mode=paddle --device=gpu --run_benchmark=True +``` + +**使用Paddle Inference且使用TensorRT**进行测速,执行以下命令: + +```bash +# 导出模型,使用trt=True +python tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams exclude_nms=True trt=True + +# 速度测试,使用run_benchmark=True +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_file=demo/000000014439_640x640.jpg --device=gpu --run_benchmark=True + +# tensorRT-FP32测速 +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_file=demo/000000014439_640x640.jpg --device=gpu --run_benchmark=True --run_mode=trt_fp32 + +# tensorRT-FP16测速 +python deploy/python/infer.py --model_dir=output_inference/yolov5_s_300e_coco --image_file=demo/000000014439_640x640.jpg --device=gpu --run_benchmark=True --run_mode=trt_fp16 +``` +**注意:** +- 导出模型时指定`-o exclude_nms=True`仅作为测速时用,这样导出的模型其推理部署预测的结果不是最终检出框的结果。 +- [模型库](#模型库)中的速度测试结果为tensorRT-FP16测速后的最快速度,为不包含数据预处理和模型输出后处理(NMS)的耗时。 diff --git a/configs/yolov5/_base_/optimizer_300e.yml b/configs/yolov5/_base_/optimizer_300e.yml new file mode 100644 index 0000000000000000000000000000000000000000..d5ba562e9a9ab943e9a469a2c7f883c40bfc2d3e --- /dev/null +++ b/configs/yolov5/_base_/optimizer_300e.yml @@ -0,0 +1,19 @@ +epoch: 300 + +LearningRate: + base_lr: 0.01 + schedulers: + - !YOLOv5LRDecay + max_epochs: 300 + min_lr_ratio: 0.01 + - !ExpWarmup + epochs: 3 + +OptimizerBuilder: + optimizer: + type: Momentum + momentum: 0.937 + use_nesterov: True + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov5/_base_/optimizer_300e_high.yml b/configs/yolov5/_base_/optimizer_300e_high.yml new file mode 100644 index 0000000000000000000000000000000000000000..2d5b7c0b81e5469db3d70d69655dc3031b22f834 --- /dev/null +++ b/configs/yolov5/_base_/optimizer_300e_high.yml @@ -0,0 +1,19 @@ +epoch: 300 + +LearningRate: + base_lr: 0.01 + schedulers: + - !YOLOv5LRDecay + max_epochs: 300 + min_lr_ratio: 0.1 # + - !ExpWarmup + epochs: 3 + +OptimizerBuilder: + optimizer: + type: Momentum + momentum: 0.937 + use_nesterov: True + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov5/_base_/yolov5_cspdarknet.yml b/configs/yolov5/_base_/yolov5_cspdarknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..5fa1e496ed1e251c53aa3e3dccb96b899139187a --- /dev/null +++ b/configs/yolov5/_base_/yolov5_cspdarknet.yml @@ -0,0 +1,46 @@ +architecture: YOLOv5 +norm_type: sync_bn +use_ema: True +ema_decay: 0.9999 +ema_decay_type: 
"exponential" +act: silu +find_unused_parameters: True + +depth_mult: 1.0 +width_mult: 1.0 + + +YOLOv5: + backbone: CSPDarkNet + neck: YOLOCSPPAN + yolo_head: YOLOv5Head + post_process: ~ + +CSPDarkNet: + arch: "P5" + return_idx: [2, 3, 4] + depthwise: false + +YOLOCSPPAN: + depthwise: false + +YOLOv5Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + stride: [8, 16, 32] + loss: YOLOv5Loss + nms: + name: MultiClassNMS + nms_top_k: 3000 + keep_top_k: 300 + score_threshold: 0.001 + nms_threshold: 0.65 + +YOLOv5Loss: + downsample_ratios: [8, 16, 32] + balance: [4.0, 1.0, 0.4] + box_weight: 0.05 + obj_weight: 1.0 + cls_weght: 0.5 diff --git a/configs/yolov5/_base_/yolov5_cspresnet.yml b/configs/yolov5/_base_/yolov5_cspresnet.yml new file mode 100644 index 0000000000000000000000000000000000000000..ae1394f9456cb0d992f16d9061038edf2e9ad8ec --- /dev/null +++ b/configs/yolov5/_base_/yolov5_cspresnet.yml @@ -0,0 +1,22 @@ +_BASE_: [ + 'yolov5_cspdarknet.yml', +] + +YOLOv5: + backbone: CSPResNet + neck: CustomCSPPAN + yolo_head: YOLOv5Head + post_process: ~ + +CSPResNet: + layers: [3, 6, 6, 3] + channels: [64, 128, 256, 512, 1024] + return_idx: [1, 2, 3] + use_large_stem: True + +CustomCSPPAN: + out_channels: [768, 384, 192] + stage_num: 1 + block_num: 3 + act: 'swish' + spp: true diff --git a/configs/yolov5/_base_/yolov5_reader.yml b/configs/yolov5/_base_/yolov5_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..c99e4d83888b57587a11d55ceb067f9cfa621bcd --- /dev/null +++ b/configs/yolov5/_base_/yolov5_reader.yml @@ -0,0 +1,46 @@ +input_height: &input_height 640 +input_width: &input_width 640 +input_size: &input_size [*input_height, *input_width] +mosaic_epoch: &mosaic_epoch 300 + +worker_num: 4 +TrainReader: + sample_transforms: + - DecodeNormResize: {target_size: *input_size, mosaic: True} + - MosaicPerspective: {mosaic_prob: 1.0, target_size: *input_size} + - RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4} + - RandomFlip: {} + - BboxXYXY2XYWH: {} + - NormalizeBox: {} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - RGBReverse: {} # bgr->rgb + - Permute: {} + batch_size: 8 + shuffle: True + drop_last: False + use_shared_memory: False + collate_batch: False + mosaic_epoch: *mosaic_epoch + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + + +TestReader: + inputs_def: + image_shape: [3, 640, 640] + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + fuse_normalize: False diff --git a/configs/yolov5/_base_/yolov5_reader_high_aug.yml b/configs/yolov5/_base_/yolov5_reader_high_aug.yml new file mode 100644 index 0000000000000000000000000000000000000000..f44882dddc970f632b3c30b65da8a547ac3d6186 --- /dev/null +++ b/configs/yolov5/_base_/yolov5_reader_high_aug.yml @@ -0,0 +1,46 @@ +input_height: &input_height 640 +input_width: &input_width 640 +input_size: &input_size [*input_height, *input_width] +mosaic_epoch: &mosaic_epoch 300 + +worker_num: 4 +TrainReader: + 
sample_transforms: + - DecodeNormResize: {target_size: *input_size, mosaic: True} + - MosaicPerspective: {mosaic_prob: 1.0, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1} + - RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4} + - RandomFlip: {} + - BboxXYXY2XYWH: {} + - NormalizeBox: {} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - RGBReverse: {} # bgr->rgb + - Permute: {} + batch_size: 8 + shuffle: True + drop_last: False + use_shared_memory: False + collate_batch: False + mosaic_epoch: *mosaic_epoch + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + + +TestReader: + inputs_def: + image_shape: [3, 640, 640] + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + fuse_normalize: False diff --git a/configs/yolov5/_base_/yolov5p2_cspdarknet.yml b/configs/yolov5/_base_/yolov5p2_cspdarknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..99103dc4ff8e1111359392b8ae410fa09ed34e96 --- /dev/null +++ b/configs/yolov5/_base_/yolov5p2_cspdarknet.yml @@ -0,0 +1,19 @@ +_BASE_: [ + 'yolov5_cspdarknet.yml', +] + +CSPDarkNet: + arch: "P5" + return_idx: [1, 2, 3, 4] + +YOLOv5Head: + anchors: [[5, 6], [8, 14], [15, 11], + [10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]] + stride: [4, 8, 16, 32] + +YOLOv5Loss: + downsample_ratios: [4, 8, 16, 32] + balance: [4.0, 1.0, 0.25, 0.06] diff --git a/configs/yolov5/_base_/yolov5p6_cspdarknet.yml b/configs/yolov5/_base_/yolov5p6_cspdarknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..e1043e0bb7faffeda10292c249ddf930de2f9e13 --- /dev/null +++ b/configs/yolov5/_base_/yolov5p6_cspdarknet.yml @@ -0,0 +1,22 @@ +_BASE_: [ + 'yolov5_cspdarknet.yml', +] + +CSPDarkNet: + arch: "P6" + return_idx: [2, 3, 4, 5] + +YOLOv5Head: + anchors: [[19, 27], [44, 40], [38, 94], + [96, 68], [86, 152], [180, 137], + [140, 301], [303, 264], [238, 542], + [436, 615], [739, 380], [925, 792]] + anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]] + stride: [8, 16, 32, 64] + +YOLOv5Loss: + downsample_ratios: [8, 16, 32, 64] + balance: [4.0, 1.0, 0.25, 0.06] + box_weight: 0.05 + obj_weight: 0.7 + cls_weght: 0.3 diff --git a/configs/yolov5/_base_/yolov5p6_reader.yml b/configs/yolov5/_base_/yolov5p6_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..cacd30d268bbb5fe5fd5035c76f17a26e36aa3c2 --- /dev/null +++ b/configs/yolov5/_base_/yolov5p6_reader.yml @@ -0,0 +1,45 @@ +input_height: &input_height 1280 +input_width: &input_width 1280 +input_size: &input_size [*input_height, *input_width] +mosaic_epoch: &mosaic_epoch 300 + +worker_num: 4 +TrainReader: + sample_transforms: + - DecodeNormResize: {target_size: *input_size, mosaic: True} + - MosaicPerspective: {mosaic_prob: 1.0, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1} + - RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4} + - RandomFlip: {} + - BboxXYXY2XYWH: {} + - NormalizeBox: {} + - NormalizeImage: 
{mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 8 + shuffle: True + drop_last: False + use_shared_memory: False + collate_batch: False + mosaic_epoch: *mosaic_epoch + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 2 + + +TestReader: + inputs_def: + image_shape: [3, 1280, 1280] + sample_transforms: + - Decode: {} + - Resize: {target_size: *input_size, keep_ratio: True, interp: 1} + - Pad: {size: *input_size, fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + fuse_normalize: False diff --git a/configs/yolov5/yolov5_l_300e_coco.yml b/configs/yolov5/yolov5_l_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..2a4179b47d823cdb352a2f10d1ada8ed3e17bd1e --- /dev/null +++ b/configs/yolov5/yolov5_l_300e_coco.yml @@ -0,0 +1,22 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader_high_aug.yml', +] +depth_mult: 1.0 +width_mult: 1.0 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_l_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 + + +YOLOv5Loss: + obj_weight: 0.7 + cls_weght: 0.3 diff --git a/configs/yolov5/yolov5_l_30e_obj365_finetune_coco.yml b/configs/yolov5/yolov5_l_30e_obj365_finetune_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..b802e6f011debac07edc9301d824655c95115a66 --- /dev/null +++ b/configs/yolov5/yolov5_l_30e_obj365_finetune_coco.yml @@ -0,0 +1,43 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader_high_aug.yml', +] +depth_mult: 1.0 +width_mult: 1.0 + +log_iter: 100 +snapshot_epoch: 5 +weights: output/yolov5_l_30e_obj365_finetune_coco/model_final +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/yolov5_l_300e_obj365.pdparams + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 + + +YOLOv5Loss: + obj_weight: 0.7 + cls_weght: 0.3 + + +epoch: 30 +LearningRate: + base_lr: 0.001 + schedulers: + - !YOLOv5LRDecay + max_epochs: 30 + min_lr_ratio: 0.1 + - !ExpWarmup + epochs: 3 + +OptimizerBuilder: + optimizer: + type: Momentum + momentum: 0.937 + use_nesterov: True + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov5/yolov5_m_300e_coco.yml b/configs/yolov5/yolov5_m_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..0b82c764f1f1a693747249abbb01255d478f6dfb --- /dev/null +++ b/configs/yolov5/yolov5_m_300e_coco.yml @@ -0,0 +1,22 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader_high_aug.yml', +] +depth_mult: 0.67 +width_mult: 0.75 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_m_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 + + +YOLOv5Loss: + obj_weight: 0.7 + cls_weght: 0.3 diff --git a/configs/yolov5/yolov5_n_300e_coco.yml b/configs/yolov5/yolov5_n_300e_coco.yml new file mode 100644 index 
0000000000000000000000000000000000000000..aa4d21693ad6e2e1200dc072582456fbf6f2c76d --- /dev/null +++ b/configs/yolov5/yolov5_n_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader.yml', +] +depth_mult: 0.33 +width_mult: 0.25 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_n_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 diff --git a/configs/yolov5/yolov5_s_300e_coco.yml b/configs/yolov5/yolov5_s_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b4b8edf03e3518dfb2789e639079340750b67e9 --- /dev/null +++ b/configs/yolov5/yolov5_s_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader.yml', +] +depth_mult: 0.33 +width_mult: 0.50 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_s_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 diff --git a/configs/yolov5/yolov5_s_80e_ssod_finetune_coco.yml b/configs/yolov5/yolov5_s_80e_ssod_finetune_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..24b4b6725ac8e457216bc2a262d0c3d71ca3771d --- /dev/null +++ b/configs/yolov5/yolov5_s_80e_ssod_finetune_coco.yml @@ -0,0 +1,38 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader.yml', +] +depth_mult: 0.33 +width_mult: 0.50 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_s_80e_ssod_finetune_coco/model_final +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/yolov5_s_300e_ssod_coco.pdparams + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 + + +epoch: 80 +LearningRate: + base_lr: 0.001 + schedulers: + - !YOLOv5LRDecay + max_epochs: 80 + min_lr_ratio: 0.01 + - !ExpWarmup + epochs: 3 + +OptimizerBuilder: + optimizer: + type: Momentum + momentum: 0.937 + use_nesterov: True + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov5/yolov5_x_300e_coco.yml b/configs/yolov5/yolov5_x_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..7817960fecd634d61cd3f845a08ea9e87782d098 --- /dev/null +++ b/configs/yolov5/yolov5_x_300e_coco.yml @@ -0,0 +1,22 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5_cspdarknet.yml', + '_base_/yolov5_reader_high_aug.yml', +] +depth_mult: 1.33 +width_mult: 1.25 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5_x_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 + + +YOLOv5Loss: + obj_weight: 0.7 + cls_weght: 0.3 diff --git a/configs/yolov5/yolov5p6_l_300e_coco.yml b/configs/yolov5/yolov5p6_l_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..aec6ecdc63f656cfcca7b7b870280c226db7bde4 --- /dev/null +++ b/configs/yolov5/yolov5p6_l_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5p6_cspdarknet.yml', + '_base_/yolov5p6_reader.yml', +] +depth_mult: 1.0 +width_mult: 1.0 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5p6_l_300e_coco/model_final + + +TrainReader: + batch_size: 8 # default 8 gpus, total bs = 64 diff --git 
a/configs/yolov5/yolov5p6_m_300e_coco.yml b/configs/yolov5/yolov5p6_m_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..d46634b453d45229cf1b9da8414040b4e2d89141 --- /dev/null +++ b/configs/yolov5/yolov5p6_m_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5p6_cspdarknet.yml', + '_base_/yolov5p6_reader.yml', +] +depth_mult: 0.67 +width_mult: 0.75 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5p6_m_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 diff --git a/configs/yolov5/yolov5p6_n_300e_coco.yml b/configs/yolov5/yolov5p6_n_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..6522d8d124027567dae7e71d96e9f54a9d5840a6 --- /dev/null +++ b/configs/yolov5/yolov5p6_n_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5p6_cspdarknet.yml', + '_base_/yolov5p6_reader.yml', +] +depth_mult: 0.33 +width_mult: 0.25 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5p6_n_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 diff --git a/configs/yolov5/yolov5p6_s_300e_coco.yml b/configs/yolov5/yolov5p6_s_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..e086784e477cbbe7ad2a19246e78f59c8b76f31d --- /dev/null +++ b/configs/yolov5/yolov5p6_s_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5p6_cspdarknet.yml', + '_base_/yolov5p6_reader.yml', +] +depth_mult: 0.33 +width_mult: 0.50 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5p6_s_300e_coco/model_final + + +TrainReader: + batch_size: 16 # default 8 gpus, total bs = 128 diff --git a/configs/yolov5/yolov5p6_x_300e_coco.yml b/configs/yolov5/yolov5p6_x_300e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..628b5e5c67f5cd8039d636a1b95ed90750419b4c --- /dev/null +++ b/configs/yolov5/yolov5p6_x_300e_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_300e_high.yml', + '_base_/yolov5p6_cspdarknet.yml', + '_base_/yolov5p6_reader.yml', +] +depth_mult: 1.33 +width_mult: 1.25 + +log_iter: 100 +snapshot_epoch: 10 +weights: output/yolov5p6_x_300e_coco/model_final + + +TrainReader: + batch_size: 8 # default 8 gpus, total bs = 64 diff --git a/dataset/coco/download_coco.py b/dataset/coco/download_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..47659fa76dd2c1183404667efac3a48de9b099c2 --- /dev/null +++ b/dataset/coco/download_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
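+#
+# Downloads the COCO dataset into the directory containing this script
+# (dataset/coco) using ppdet.utils.download.download_dataset.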
+ +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'coco') diff --git a/dataset/roadsign_voc/download_roadsign_voc.py b/dataset/roadsign_voc/download_roadsign_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..3cb517d3cf362e3ad2ec7b4ebf3bff54acb244d4 --- /dev/null +++ b/dataset/roadsign_voc/download_roadsign_voc.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'roadsign_voc') diff --git a/dataset/roadsign_voc/label_list.txt b/dataset/roadsign_voc/label_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..1be460f457a2fdbec91d3a69377c232ae4a6beb0 --- /dev/null +++ b/dataset/roadsign_voc/label_list.txt @@ -0,0 +1,4 @@ +speedlimit +crosswalk +trafficlight +stop \ No newline at end of file diff --git a/dataset/voc/create_list.py b/dataset/voc/create_list.py new file mode 100644 index 0000000000000000000000000000000000000000..5ab804222aa14cce988fd0aa083a460372afb484 --- /dev/null +++ b/dataset/voc/create_list.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
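+#
+# Generates the Pascal VOC image list files (trainval.txt / test.txt) for this
+# directory using ppdet.utils.download.create_voc_list.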
+ +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import create_voc_list + +logging.basicConfig(level=logging.INFO) + +voc_path = osp.split(osp.realpath(sys.argv[0]))[0] +create_voc_list(voc_path) diff --git a/dataset/voc/download_voc.py b/dataset/voc/download_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..e4c449ce207f27f07379a1e93cebcb27ace0b8fe --- /dev/null +++ b/dataset/voc/download_voc.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'voc') diff --git a/dataset/voc/label_list.txt b/dataset/voc/label_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..8420ab35ede7400974f25836a6bb543024686a0e --- /dev/null +++ b/dataset/voc/label_list.txt @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/demo/000000014439.jpg b/demo/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0abbdab06eb5950b93908cc91adfa640e8a3ac78 Binary files /dev/null and b/demo/000000014439.jpg differ diff --git a/demo/000000014439_640x640.jpg b/demo/000000014439_640x640.jpg new file mode 100644 index 0000000000000000000000000000000000000000..58e9d3e228af43c9b55d8d0cb385ce82ebb8b996 Binary files /dev/null and b/demo/000000014439_640x640.jpg differ diff --git a/demo/000000087038.jpg b/demo/000000087038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f77f5d5f057b6f92dc096da704ecb8dee99bdf5 Binary files /dev/null and b/demo/000000087038.jpg differ diff --git a/demo/000000570688.jpg b/demo/000000570688.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb304bd56c4010c08611a30dcca58ea9140cea54 Binary files /dev/null and b/demo/000000570688.jpg differ diff --git a/demo/road554.png b/demo/road554.png new file mode 100644 index 0000000000000000000000000000000000000000..7733e57f922b0fee893775da4f698c202804966f Binary files /dev/null and b/demo/road554.png differ diff --git a/demo/visdrone_0000315_01601_d_0000509.jpg b/demo/visdrone_0000315_01601_d_0000509.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cc7a3602c1c015213ca1f7e27b0d006e827ee935 Binary files /dev/null and b/demo/visdrone_0000315_01601_d_0000509.jpg differ diff --git 
a/deploy/BENCHMARK_INFER.md b/deploy/BENCHMARK_INFER.md new file mode 100644 index 0000000000000000000000000000000000000000..988cf30f6c672195d4b3833fe9a186b497a11c2e --- /dev/null +++ b/deploy/BENCHMARK_INFER.md @@ -0,0 +1,60 @@ +# 推理Benchmark + +## 一、环境准备 +- 1、测试环境: + - CUDA 10.1 + - CUDNN 7.6 + - TensorRT-6.0.1 + - PaddlePaddle v2.0.1 + - GPU分别为: Tesla V100和GTX 1080Ti和Jetson AGX Xavier +- 2、测试方式: + - 为了方便比较不同模型的推理速度,输入采用同样大小的图片,为 3x640x640,采用 `demo/000000014439_640x640.jpg` 图片。 + - Batch Size=1 + - 去掉前100轮warmup时间,测试100轮的平均时间,单位ms/image,包括网络计算时间、数据拷贝至CPU的时间。 + - 采用Fluid C++预测引擎: 包含Fluid C++预测、Fluid-TensorRT预测,下面同时测试了Float32 (FP32) 和Float16 (FP16)的推理速度。 + +**注意:** TensorRT中固定尺寸和动态尺寸区别请参考文档[TENSOR教程](TENSOR_RT.md)。由于固定尺寸下对两阶段模型支持不完善,所以faster rcnn模型采用动态尺寸测试。固定尺寸和动态尺寸支持融合的OP不完全一样,因此同一个模型在固定尺寸和动态尺寸下测试的性能可能会有一点差异。 + +## 二、推理速度 + +### 1、Linux系统 +#### (1)Tesla V100 + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 27.99 | 26.15 | 21.92 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 32.49 | 25.54 | 21.70 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 9.74 | 8.61 | 6.28 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 17.84 | 15.43 | 9.86 | +| PPYOLO | ResNet50 | 是 | 608x608 | 20.77 | 18.40 | 13.53 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 5.17 | 4.43 | 4.29 | +| TTFNet | Darknet53 | 是 | 512x512 | 10.14 | 8.71 | 5.55 | +| FCOS | ResNet50 | 是 | 640x640 | 35.47 | 35.02 | 34.24 | + + +#### (2)Jetson AGX Xavier + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 169.45 | 158.92 | 119.25 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 228.07 | 156.39 | 117.03 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 48.76 | 43.83 | 18.41 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 121.61 | 110.30 | 42.38 | +| PPYOLO | ResNet50 | 是 | 608x608 | 111.80 | 99.40 | 48.05 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 10.52 | 8.84 | 8.77 | +| TTFNet | Darknet53 | 是 | 512x512 | 73.77 | 64.03 | 31.46 | +| FCOS | ResNet50 | 是 | 640x640 | 217.11 | 214.38 | 205.78 | + +### 2、Windows系统 +#### (1)GTX 1080Ti + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 50.74 | 57.17 | 62.08 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 50.31 | 57.61 | 62.05 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 14.51 | 11.23 | 11.13 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 30.26 | 23.92 | 24.02 | +| PPYOLO | ResNet50 | 是 | 608x608 | 38.06 | 31.40 | 31.94 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 16.47 | 13.87 | 13.76 | +| TTFNet | Darknet53 | 是 | 512x512 | 21.83 | 17.14 | 17.09 | +| FCOS | ResNet50 | 是 | 640x640 | 71.88 | 69.93 | 69.52 | diff --git a/deploy/BENCHMARK_INFER_en.md b/deploy/BENCHMARK_INFER_en.md new file mode 100644 index 0000000000000000000000000000000000000000..b0b92b6cc142bb6b07a703ccadc5a017f8080956 --- /dev/null +++ b/deploy/BENCHMARK_INFER_en.md @@ -0,0 +1,61 @@ +# Inference Benchmark + +## 一、Prepare the Environment +- 1、Test Environment: + - CUDA 10.1 + - CUDNN 7.6 + - TensorRT-6.0.1 + - PaddlePaddle v2.0.1 + - The GPUS are Tesla V100 and GTX 1080 Ti and Jetson AGX Xavier +- 2、Test Method: + - In order to 
compare the inference speed of different models, the input shape is 3x640x640, use `demo/000000014439_640x640.jpg`. + - Batch_size=1 + - Delete the warmup time of the first 100 rounds and test the average time of 100 rounds in ms/image, including network calculation time and data copy time to CPU. + - Using Fluid C++ prediction engine: including Fluid C++ prediction, Fluid TensorRT prediction, the following test Float32 (FP32) and Float16 (FP16) inference speed. + +**Attention:** For TensorRT, please refer to the [TENSOR tutorial](TENSOR_RT.md) for the difference between fixed and dynamic dimensions. Due to the imperfect support for the two-stage model under fixed size, dynamic size test was adopted for the Faster RCNN model. Fixed size and dynamic size do not support exactly the same OP for fusion, so the performance of the same model tested at fixed size and dynamic size may differ slightly. + + +## 二、Inferring Speed + +### 1、Linux System +#### (1)Tesla V100 + +| Model | backbone | Fixed size or not | The net size | paddle_inference | trt_fp32 | trt_fp16 | +| --------------- | ------------- | ----------------- | ------------ | ---------------- | -------- | -------- | +| Faster RCNN FPN | ResNet50 | no | 640x640 | 27.99 | 26.15 | 21.92 | +| Faster RCNN FPN | ResNet50 | no | 800x1312 | 32.49 | 25.54 | 21.70 | +| YOLOv3 | Mobilenet\_v1 | yes | 608x608 | 9.74 | 8.61 | 6.28 | +| YOLOv3 | Darknet53 | yes | 608x608 | 17.84 | 15.43 | 9.86 | +| PPYOLO | ResNet50 | yes | 608x608 | 20.77 | 18.40 | 13.53 | +| SSD | Mobilenet\_v1 | yes | 300x300 | 5.17 | 4.43 | 4.29 | +| TTFNet | Darknet53 | yes | 512x512 | 10.14 | 8.71 | 5.55 | +| FCOS | ResNet50 | yes | 640x640 | 35.47 | 35.02 | 34.24 | + + +#### (2)Jetson AGX Xavier + +| Model | backbone | Fixed size or not | The net size | paddle_inference | trt_fp32 | trt_fp16 | +| --------------- | ------------- | ----------------- | ------------ | ---------------- | -------- | -------- | +| Faster RCNN FPN | ResNet50 | no | 640x640 | 169.45 | 158.92 | 119.25 | +| Faster RCNN FPN | ResNet50 | no | 800x1312 | 228.07 | 156.39 | 117.03 | +| YOLOv3 | Mobilenet\_v1 | yes | 608x608 | 48.76 | 43.83 | 18.41 | +| YOLOv3 | Darknet53 | yes | 608x608 | 121.61 | 110.30 | 42.38 | +| PPYOLO | ResNet50 | yes | 608x608 | 111.80 | 99.40 | 48.05 | +| SSD | Mobilenet\_v1 | yes | 300x300 | 10.52 | 8.84 | 8.77 | +| TTFNet | Darknet53 | yes | 512x512 | 73.77 | 64.03 | 31.46 | +| FCOS | ResNet50 | yes | 640x640 | 217.11 | 214.38 | 205.78 | + +### 2、Windows System +#### (1)GTX 1080Ti + +| Model | backbone | Fixed size or not | The net size | paddle_inference | trt_fp32 | trt_fp16 | +| --------------- | ------------- | ----------------- | ------------ | ---------------- | -------- | -------- | +| Faster RCNN FPN | ResNet50 | no | 640x640 | 50.74 | 57.17 | 62.08 | +| Faster RCNN FPN | ResNet50 | no | 800x1312 | 50.31 | 57.61 | 62.05 | +| YOLOv3 | Mobilenet\_v1 | yes | 608x608 | 14.51 | 11.23 | 11.13 | +| YOLOv3 | Darknet53 | yes | 608x608 | 30.26 | 23.92 | 24.02 | +| PPYOLO | ResNet50 | yes | 608x608 | 38.06 | 31.40 | 31.94 | +| SSD | Mobilenet\_v1 | yes | 300x300 | 16.47 | 13.87 | 13.76 | +| TTFNet | Darknet53 | yes | 512x512 | 21.83 | 17.14 | 17.09 | +| FCOS | ResNet50 | yes | 640x640 | 71.88 | 69.93 | 69.52 | diff --git a/deploy/EXPORT_MODEL.md b/deploy/EXPORT_MODEL.md new file mode 100644 index 0000000000000000000000000000000000000000..91f34b5860d6384baf773e71a39ffa4ec773dee6 --- /dev/null +++ b/deploy/EXPORT_MODEL.md @@ -0,0 +1,54 @@ +# PaddleDetection模型导出教程 + +## 一、模型导出 
+本章节介绍如何使用`tools/export_model.py`脚本导出模型。 +### 1、导出模输入输出说明 +- 输入变量以及输入形状如下: + + | 输入名称 | 输入形状 | 表示含义 | + | :---------: | ----------- | ---------- | + | image | [None, 3, H, W] | 输入网络的图像,None表示batch维度,如果输入图像大小为变长,则H,W为None | + | im_shape | [None, 2] | 图像经过resize后的大小,表示为H,W, None表示batch维度 | + | scale_factor | [None, 2] | 输入图像大小比真实图像大小,表示为scale_y, scale_x | + +**注意**具体预处理方式可参考配置文件中TestReader部分。 + + +- PaddleDetection中动转静导出模型输出统一为: + + - bbox, NMS的输出,形状为[N, 6], 其中N为预测框的个数,6为[class_id, score, x1, y1, x2, y2]。 + - bbox\_num, 每张图片对应预测框的个数,例如batch_size为2,输出为[N1, N2], 表示第一张图包含N1个预测框,第二张图包含N2个预测框,并且预测框的总个数和NMS输出的第一维N相同 + - mask,如果网络中包含mask,则会输出mask分支 + +**注意**模型动转静导出不支持模型结构中包含numpy相关操作的情况。 + + +### 2、启动参数说明 + +| FLAG | 用途 | 默认值 | 备注 | +|:--------------:|:--------------:|:------------:|:-----------------------------------------:| +| -c | 指定配置文件 | None | | +| --output_dir | 模型保存路径 | `./output_inference` | 模型默认保存在`output/配置文件名/`路径下 | + +### 3、使用示例 + +使用训练得到的模型进行试用,脚本如下 + +```bash +# 导出YOLOv3模型 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams +``` + +预测模型会导出到`inference_model/yolov3_darknet53_270e_coco`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`。 + + +### 4、设置导出模型的输入大小 + +使用Fluid-TensorRT进行预测时,由于<=TensorRT 5.1的版本仅支持定长输入,保存模型的`data`层的图片大小需要和实际输入图片大小一致。而Fluid C++预测引擎没有此限制。设置TestReader中的`image_shape`可以修改保存模型中的输入图片大小。示例如下: + +```bash +# 导出YOLOv3模型,输入是3x640x640 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams TestReader.inputs_def.image_shape=[3,640,640] +``` diff --git a/deploy/EXPORT_MODEL_en.md b/deploy/EXPORT_MODEL_en.md new file mode 100644 index 0000000000000000000000000000000000000000..d2828edeb8388b4633b7d8489923a059ef96321c --- /dev/null +++ b/deploy/EXPORT_MODEL_en.md @@ -0,0 +1,53 @@ +# PaddleDetection Model Export Tutorial + +## 一、Model Export +This section describes how to use the `tools/export_model.py` script to export models. +### Export model input and output description +- Input variables and input shapes are as follows: + + | Input Name | Input Shape | Meaning | + | :----------: | --------------- | ------------------------------------------------------------------------------------------------------------------------- | + | image | [None, 3, H, W] | Enter the network image. None indicates the Batch dimension. If the input image size is variable length, H and W are None | + | im_shape | [None, 2] | The size of the image after resize is expressed as H,W, and None represents the Batch dimension | + | scale_factor | [None, 2] | The input image size is larger than the real image size, denoted byscale_y, scale_x | + +**Attention**For details about the preprocessing method, see the Test Reader section in the configuration file. + + +-The output of the dynamic and static derived model in Paddle Detection is unified as follows: + + - bbox, the output of NMS, in the shape of [N, 6], where N is the number of prediction boxes, and 6 is [class_id, score, x1, y1, x2, y2]. + - bbox\_num, Each picture corresponds to the number of prediction boxes. 
For example, batch size is 2 and the output is [N1, N2], indicating that the first picture contains N1 prediction boxes and the second picture contains N2 prediction boxes, and the total number of prediction boxes is the same as the first dimension N output by NMS + - mask, If the network contains a mask, the mask branch is printed + +**Attention**The model-to-static export does not support cases where numpy operations are included in the model structure. + + +### 2、Start Parameters + +| FLAG | USE | DEFAULT | NOTE | +| :----------: | :-----------------------------: | :------------------: | :-------------------------------------------------------------------: | +| -c | Specifying a configuration file | None | | +| --output_dir | Model save path | `./output_inference` | The model is saved in the `output/default_file_name/` path by default | + +### 3、Example + +Using the trained model for trial use, the script is as follows: + +```bash +# The YOLOv3 model is exported +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams +``` +The prediction model will be exported to the `inference_model/yolov3_darknet53_270e_coco` directory. `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel` respectively. + + +### 4、Sets the input size of the export model +When using Fluid TensorRT for prediction, since <= TensorRT 5.1 only supports fixed-length input, the image size of the `data` layer of the saved model needs to be the same as the actual input image size. Fluid C++ prediction engine does not have this limitation. Setting `image_shape` in Test Reader changes the size of the input image in the saved model. The following is an example: + + +```bash +#Export the YOLOv3 model with the input 3x640x640 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams TestReader.inputs_def.image_shape=[3,640,640] +``` diff --git a/deploy/EXPORT_ONNX_MODEL.md b/deploy/EXPORT_ONNX_MODEL.md new file mode 100644 index 0000000000000000000000000000000000000000..e1f4027833973a9c37fb9f144e77beeead3acb41 --- /dev/null +++ b/deploy/EXPORT_ONNX_MODEL.md @@ -0,0 +1,112 @@ +# PaddleDetection模型导出为ONNX格式教程 + +PaddleDetection模型支持保存为ONNX格式,目前测试支持的列表如下 +| 模型 | OP版本 | 备注 | +| :---- | :----- | :--- | +| YOLOv3 | 11 | 仅支持batch=1推理;模型导出需固定shape | +| PP-YOLO | 11 | 仅支持batch=1推理;MatrixNMS将被转换NMS,精度略有变化;模型导出需固定shape | +| PP-YOLOv2 | 11 | 仅支持batch=1推理;MatrixNMS将被转换NMS,精度略有变化;模型导出需固定shape | +| PP-YOLO Tiny | 11 | 仅支持batch=1推理;模型导出需固定shape | +| PP-YOLOE | 11 | 仅支持batch=1推理;模型导出需固定shape | +| PP-PicoDet | 11 | 仅支持batch=1推理;模型导出需固定shape | +| FCOS | 11 |仅支持batch=1推理 | +| PAFNet | 11 |- | +| TTFNet | 11 |-| +| SSD | 11 |仅支持batch=1推理 | +| PP-TinyPose | 11 | - | +| Faster RCNN | 16 | 仅支持batch=1推理, 依赖0.9.7及以上版本| +| Mask RCNN | 16 | 仅支持batch=1推理, 依赖0.9.7及以上版本| +| Cascade RCNN | 16 | 仅支持batch=1推理, 依赖0.9.7及以上版本| +| Cascade Mask RCNN | 16 | 仅支持batch=1推理, 依赖0.9.7及以上版本| + +保存ONNX的功能由[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX)提供,如在转换中有相关问题反馈,可在Paddle2ONNX的Github项目中通过[ISSUE](https://github.com/PaddlePaddle/Paddle2ONNX/issues)与工程师交流。 + +## 导出教程 + +### 步骤一、导出PaddlePaddle部署模型 + + +导出步骤参考文档[PaddleDetection部署模型导出教程](./EXPORT_MODEL.md), 导出示例如下 + +- 非RCNN系列模型, 以YOLOv3为例 +``` +cd PaddleDetection +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml \ + -o 
weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams \ + TestReader.inputs_def.image_shape=[3,608,608] \ + --output_dir inference_model +``` +导出后的模型保存在`inference_model/yolov3_darknet53_270e_coco/`目录中,结构如下 +``` +yolov3_darknet + ├── infer_cfg.yml # 模型配置文件信息 + ├── model.pdiparams # 静态图模型参数 + ├── model.pdiparams.info # 参数额外信息,一般无需关注 + └── model.pdmodel # 静态图模型文件 +``` +> 注意导出时的参数`TestReader.inputs_def.image_shape`,对于YOLO系列模型注意导出时指定该参数,否则无法转换成功 + +- RCNN系列模型,以Faster RCNN为例 + +RCNN系列模型导出ONNX模型时,需要去除模型中的控制流,因此需要额外添加`export_onnx=True` 字段 +``` +cd PaddleDetection +python tools/export_model.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams \ + export_onnx=True \ + --output_dir inference_model +``` + +导出的模型保存在`inference_model/faster_rcnn_r50_fpn_1x_coco/`目录中,结构如下 +``` +faster_rcnn_r50_fpn_1x_coco + ├── infer_cfg.yml # 模型配置文件信息 + ├── model.pdiparams # 静态图模型参数 + ├── model.pdiparams.info # 参数额外信息,一般无需关注 + └── model.pdmodel # 静态图模型文件 +``` + +### 步骤二、将部署模型转为ONNX格式 +安装Paddle2ONNX(高于或等于0.9.7版本) +``` +pip install paddle2onnx +``` +使用如下命令转换 +``` +# YOLOv3 +paddle2onnx --model_dir inference_model/yolov3_darknet53_270e_coco \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 11 \ + --save_file yolov3.onnx + +# Faster RCNN +paddle2onnx --model_dir inference_model/faster_rcnn_r50_fpn_1x_coco \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 16 \ + --save_file faster_rcnn.onnx +``` +转换后的模型即为在当前路径下的`yolov3.onnx`和`faster_rcnn.onnx` + +### 步骤三、使用onnxruntime进行推理 +安装onnxruntime +``` +pip install onnxruntime +``` +推理代码示例在[deploy/third_engine/onnx](./third_engine/onnx)下 + +使用如下命令进行推理: +``` +# YOLOv3 +python deploy/third_engine/onnx/infer.py + --infer_cfg inference_model/yolov3_darknet53_270e_coco/infer_cfg.yml \ + --onnx_file yolov3.onnx \ + --image_file demo/000000014439.jpg + +# Faster RCNN +python deploy/third_engine/onnx/infer.py + --infer_cfg inference_model/faster_rcnn_r50_fpn_1x_coco/infer_cfg.yml \ + --onnx_file faster_rcnn.onnx \ + --image_file demo/000000014439.jpg +``` diff --git a/deploy/EXPORT_ONNX_MODEL_en.md b/deploy/EXPORT_ONNX_MODEL_en.md new file mode 100644 index 0000000000000000000000000000000000000000..750959062dc20cc68600bbd89e9264468c11e4d6 --- /dev/null +++ b/deploy/EXPORT_ONNX_MODEL_en.md @@ -0,0 +1,110 @@ +# PaddleDetection Model Export as ONNX Format Tutorial + +PaddleDetection Model support is saved in ONNX format and the list of current test support is as follows +| Model | OP Version | NOTE | +| :---- | :----- | :--- | +| YOLOv3 | 11 | Only batch=1 inferring is supported. Model export needs fixed shape | +| PP-YOLO | 11 | Only batch=1 inferring is supported. A MatrixNMS will be converted to an NMS with slightly different precision; Model export needs fixed shape | +| PP-YOLOv2 | 11 | Only batch=1 inferring is supported. MatrixNMS will be converted to NMS with slightly different precision; Model export needs fixed shape | +| PP-YOLO Tiny | 11 | Only batch=1 inferring is supported. Model export needs fixed shape | +| PP-YOLOE | 11 | Only batch=1 inferring is supported. Model export needs fixed shape | +| PP-PicoDet | 11 | Only batch=1 inferring is supported. 
Model export needs fixed shape | +| FCOS | 11 |Only batch=1 inferring is supported | +| PAFNet | 11 |- | +| TTFNet | 11 |-| +| SSD | 11 |Only batch=1 inferring is supported | +| PP-TinyPose | 11 | - | +| Faster RCNN | 16 | Only batch=1 inferring is supported, require paddle2onnx>=0.9.7| +| Mask RCNN | 16 | Only batch=1 inferring is supported, require paddle2onnx>=0.9.7| +| Cascade RCNN | 16 | Only batch=1 inferring is supported, require paddle2onnx>=0.9.7| +| Cascade Mask RCNN | 16 | Only batch=1 inferring is supported, require paddle2onnx>=0.9.7| + + +The function of saving ONNX is provided by [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX). If there is feedback on related problems during conversion, Communicate with engineers in Paddle2ONNX's Github project via [ISSUE](https://github.com/PaddlePaddle/Paddle2ONNX/issues). + +## Export Tutorial + +### Step 1. Export the Paddle deployment model +Export procedure reference document[Tutorial on PaddleDetection deployment model export](./EXPORT_MODEL_en.md), for example: + +- Models except RCNN series, take YOLOv3 as example +``` +cd PaddleDetection +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams \ + TestReader.inputs_def.image_shape=[3,608,608] \ + --output_dir inference_model +``` +The derived models were saved in `inference_model/yolov3_darknet53_270e_coco/`, with the structure as follows +``` +yolov3_darknet + ├── infer_cfg.yml # Model configuration file information + ├── model.pdiparams # Static diagram model parameters + ├── model.pdiparams.info # Parameter Information is not required + └── model.pdmodel # Static diagram model file +``` +> check`TestReader.inputs_def.image_shape`, For YOLO series models, specify this parameter when exporting; otherwise, the conversion fails + +- RCNN series models, take Faster RCNN as example + +The conditional block needs to be removed in RCNN series when export ONNX model. Add `export_onnx=True` in command line +``` +cd PaddleDetection +python tools/export_model.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams \ + export_onnx=True \ + --output_dir inference_model +``` +The derived models were saved in `inference_model/faster_rcnn_r50_fpn_1x_coco/`, with the structure as follows +``` +faster_rcnn_r50_fpn_1x_coco + ├── infer_cfg.yml # Model configuration file information + ├── model.pdiparams # Static diagram model parameters + ├── model.pdiparams.info # Parameter Information is not required + └── model.pdmodel # Static diagram model file +``` + +### Step 2. Convert the deployment model to ONNX format +Install Paddle2ONNX (version 0.9.7 or higher) +``` +pip install paddle2onnx +``` +Use the following command to convert +``` +# YOLOv3 +paddle2onnx --model_dir inference_model/yolov3_darknet53_270e_coco \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 11 \ + --save_file yolov3.onnx + +# Faster RCNN +paddle2onnx --model_dir inference_model/faster_rcnn_r50_fpn_1x_coco \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 16 \ + --save_file faster_rcnn.onnx +``` +The transformed model is under the current path`yolov3.onnx` and `faster_rcnn.onnx` + +### Step 3. 
Inference with onnxruntime +Install onnxruntime +``` +pip install onnxruntime +``` +Inference code examples are in [deploy/third_engine/onnx](./third_engine/onnx) + +Use the following commands for inference: +``` +# YOLOv3 +python deploy/third_engine/onnx/infer.py + --infer_cfg inference_model/yolov3_darknet53_270e_coco/infer_cfg.yml \ + --onnx_file yolov3.onnx \ + --image_file demo/000000014439.jpg + +# Faster RCNN +python deploy/third_engine/onnx/infer.py + --infer_cfg inference_model/faster_rcnn_r50_fpn_1x_coco/infer_cfg.yml \ + --onnx_file faster_rcnn.onnx \ + --image_file demo/000000014439.jpg +``` diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac1ba72f61c760d04376a510af55ed6bd4ac75b7 --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,84 @@ +# PaddleDetection 预测部署 + +PaddleDetection提供了Paddle Inference、Paddle Serving、Paddle-Lite多种部署形式,支持服务端、移动端、嵌入式等多种平台,提供了完善的Python和C++部署方案。 + +## PaddleDetection支持的部署形式说明 +|形式|语言|教程|设备/平台| +|-|-|-|-| +|Paddle Inference|Python|已完善|Linux(ARM\X86)、Windows +|Paddle Inference|C++|已完善|Linux(ARM\X86)、Windows| +|Paddle Serving|Python|已完善|Linux(ARM\X86)、Windows| +|Paddle-Lite|C++|已完善|Android、IOS、FPGA、RK... + + +## 1.Paddle Inference部署 + +### 1.1 导出模型 + +使用`tools/export_model.py`脚本导出模型以及部署时使用的配置文件,配置文件名字为`infer_cfg.yml`。模型导出脚本如下: +```bash +# 导出YOLOv3模型 +python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o weights=output/yolov3_mobilenet_v1_roadsign/best_model.pdparams +``` +预测模型会导出到`output_inference/yolov3_mobilenet_v1_roadsign`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`。 +模型导出具体请参考文档[PaddleDetection模型导出教程](EXPORT_MODEL.md)。 + +### 1.2 使用PaddleInference进行预测 +* Python部署 支持`CPU`、`GPU`和`XPU`环境,支持,windows、linux系统,支持NV Jetson嵌入式设备上部署。参考文档[python部署](python/README.md) +* C++部署 支持`CPU`、`GPU`和`XPU`环境,支持,windows、linux系统,支持NV Jetson嵌入式设备上部署。参考文档[C++部署](cpp/README.md) +* PaddleDetection支持TensorRT加速,相关文档请参考[TensorRT预测部署教程](TENSOR_RT.md) + +**注意:** Paddle预测库版本需要>=2.1,batch_size>1仅支持YOLOv3和PP-YOLO。 + +## 2.PaddleServing部署 +### 2.1 导出模型 + +如果需要导出`PaddleServing`格式的模型,需要设置`export_serving_model=True`: +```buildoutcfg +python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o weights=output/yolov3_mobilenet_v1_roadsign/best_model.pdparams --export_serving_model=True +``` +预测模型会导出到`output_inference/yolov3_darknet53_270e_coco`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`, `serving_client/`文件夹, `serving_server/`文件夹。 + +模型导出具体请参考文档[PaddleDetection模型导出教程](EXPORT_MODEL.md)。 + +### 2.2 使用PaddleServing进行预测 +* [安装PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README.md#installation) +* [使用PaddleServing](./serving/README.md) + + +## 3.PaddleLite部署 +- [使用PaddleLite部署PaddleDetection模型](./lite/README.md) +- 详细案例请参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)部署。更多内容,请参考[Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) + + +## 4.第三方部署(MNN、NCNN、Openvino) +- 第三方部署提供PicoDet、TinyPose案例,其他模型请参考修改 +- TinyPose部署推荐工具:Intel CPU端推荐使用Openvino,GPU端推荐使用PaddleInference,ARM/ANDROID端推荐使用PaddleLite或者MNN + +| Third_Engine | MNN | NCNN | OPENVINO | +| ------------ | ---- | ----- | ---------- | +| PicoDet | [PicoDet_MNN](./third_engine/demo_mnn/README.md) | [PicoDet_NCNN](./third_engine/demo_ncnn/README.md) | [PicoDet_OPENVINO](./third_engine/demo_openvino/README.md) | +| TinyPose | [TinyPose_MNN](./third_engine/demo_mnn_kpts/README.md) | - | 
[TinyPose_OPENVINO](./third_engine/demo_openvino_kpts/README.md) | + + + +## 5.Benchmark测试 +- 使用导出的模型,运行Benchmark批量测试脚本: +```shell +sh deploy/benchmark/benchmark.sh {model_dir} {model_name} +``` +**注意** 如果是量化模型,请使用`deploy/benchmark/benchmark_quant.sh`脚本。 +- 将测试结果log导出至Excel中: +``` +python deploy/benchmark/log_parser_excel.py --log_path=./output_pipeline --output_name=benchmark_excel.xlsx +``` + +## 6.常见问题QA +- 1、`Paddle 1.8.4`训练的模型,可以用`Paddle2.0`部署吗? + Paddle 2.0是兼容Paddle 1.8.4的,因此是可以的。但是部分模型(如SOLOv2)使用到了Paddle 2.0中新增OP,这类模型不可以。 + +- 2、Windows编译时,预测库是VS2015编译的,选择VS2017或VS2019会有问题吗? + 关于VS兼容性问题请参考:[C++Visual Studio 2015、2017和2019之间的二进制兼容性](https://docs.microsoft.com/zh-cn/cpp/porting/binary-compat-2015-2017?view=msvc-160) + +- 3、cuDNN 8.0.4连续预测会发生内存泄漏吗? + 经QA测试,发现cuDNN 8系列连续预测时都有内存泄漏问题,且cuDNN 8性能差于cuDNN 7,推荐使用CUDA + cuDNN7.6.4的方式进行部署。 diff --git a/deploy/README_en.md b/deploy/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f587b56b99e7a6b7c7ed31c5ae6307ade6e18126 --- /dev/null +++ b/deploy/README_en.md @@ -0,0 +1,83 @@ +# PaddleDetection Predict deployment + +PaddleDetection provides multiple deployment forms of Paddle Inference, Paddle Serving and Paddle-Lite, supports multiple platforms such as server, mobile and embedded, and provides a complete Python and C++ deployment solution + +## PaddleDetection This section describes the supported deployment modes +| formalization | language | Tutorial | Equipment/Platform | +| ---------------- | -------- | ----------- | ------------------------- | +| Paddle Inference | Python | Has perfect | Linux(ARM\X86)、Windows | +| Paddle Inference | C++ | Has perfect | Linux(ARM\X86)、Windows | +| Paddle Serving | Python | Has perfect | Linux(ARM\X86)、Windows | +| Paddle-Lite | C++ | Has perfect | Android、IOS、FPGA、RK... | + + +## 1.Paddle Inference Deployment + +### 1.1 The export model + +Use the `tools/export_model.py` script to export the model and the configuration file used during deployment. The configuration file name is `infer_cfg.yml`. The model export script is as follows + +```bash +# The YOLOv3 model is derived +python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o weights=output/yolov3_mobilenet_v1_roadsign/best_model.pdparams +``` +The prediction model will be exported to the `output_inference/yolov3_mobilenet_v1_roadsign` directory `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`. For details on model export, please refer to the documentation [Tutorial on Paddle Detection MODEL EXPORT](./EXPORT_MODEL_en.md). + +### 1.2 Use Paddle Inference to Make Predictions +* Python deployment supports `CPU`, `GPU` and `XPU` environments, Windows, Linux, and NV Jetson embedded devices. Reference Documentation [Python Deployment](python/README.md) +* C++ deployment supports `CPU`, `GPU` and `XPU` environments, Windows and Linux systems, and NV Jetson embedded devices. Reference documentation [C++ deployment](cpp/README.md) +* PaddleDetection supports TensorRT acceleration. Please refer to the documentation for [TensorRT Predictive Deployment Tutorial](TENSOR_RT.md) + +**Attention:** Paddle prediction library version requires >=2.1, and batch_size>1 only supports YOLOv3 and PP-YOLO. 
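+
+The Python and C++ deployment tools above are wrappers around the Paddle Inference API. As a rough illustration only, the sketch below loads an exported model directly with the Python API and feeds it the `image`, `im_shape` and `scale_factor` inputs documented in the export tutorial; the model path, the 640x640 shape and the random dummy image are placeholders, and real preprocessing should follow the `TestReader` section of the exported `infer_cfg.yml`.
+
+```python
+# Minimal sketch (not the packaged deploy/python/infer.py tool): run an exported
+# PaddleDetection model with the Paddle Inference Python API.
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+model_dir = "output_inference/yolov3_mobilenet_v1_roadsign"  # placeholder path
+config = Config(model_dir + "/model.pdmodel", model_dir + "/model.pdiparams")
+config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory on GPU id 0
+predictor = create_predictor(config)
+
+# Dummy 640x640 batch; a real pipeline must resize/normalize per infer_cfg.yml.
+h = w = 640
+feeds = {
+    "image": np.random.rand(1, 3, h, w).astype("float32"),
+    "im_shape": np.array([[h, w]], dtype="float32"),
+    "scale_factor": np.array([[1.0, 1.0]], dtype="float32"),
+}
+for name in predictor.get_input_names():  # exported YOLOv3 declares exactly these inputs
+    predictor.get_input_handle(name).copy_from_cpu(feeds[name])
+
+predictor.run()
+
+# Outputs documented above: bbox with shape [N, 6] as [class_id, score, x1, y1, x2, y2]
+# and bbox_num with the number of boxes per image (output order assumed).
+out_names = predictor.get_output_names()
+bbox = predictor.get_output_handle(out_names[0]).copy_to_cpu()
+bbox_num = predictor.get_output_handle(out_names[1]).copy_to_cpu()
+print(bbox.shape, bbox_num)
+```
+For production use, prefer the packaged `deploy/python/infer.py`, which already implements the preprocessing, TensorRT and benchmark switches described in the linked documents.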
+
+## 2. PaddleServing Deployment
+### 2.1 Export the model
+
+If you need to export the model in `PaddleServing` format, set `export_serving_model=True`:
+```bash
+python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o weights=output/yolov3_mobilenet_v1_roadsign/best_model.pdparams --export_serving_model=True
+```
+The prediction model will be exported to the `output_inference/yolov3_mobilenet_v1_roadsign` directory as `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`, plus the `serving_client/` and `serving_server/` folders.
+
+For details on model export, please refer to the [PaddleDetection model export tutorial](./EXPORT_MODEL_en.md).
+
+### 2.2 Use Paddle Serving for prediction
+* [Install PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README.md#installation)
+* [Use PaddleServing](./serving/README.md)
+
+
+## 3. PaddleLite Deployment
+- [Deploy the PaddleDetection model using PaddleLite](./lite/README.md)
+- For a detailed example, please refer to [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo). For more information, please refer to [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite)
+
+
+## 4. Third-Engine Deployment (MNN, NCNN, OpenVINO)
+- The third-engine deployment takes PicoDet and TinyPose as examples; other models can be adapted in the same way
+- Suggestions for TinyPose: OpenVINO is recommended on Intel CPUs, Paddle Inference on NVIDIA GPUs, and Paddle-Lite or MNN on ARM/Android devices
+
+| Third_Engine | MNN | NCNN | OPENVINO |
+| ------------ | ------------------------------------------------------ | -------------------------------------------------- | ------------------------------------------------------------ |
+| PicoDet | [PicoDet_MNN](./third_engine/demo_mnn/README.md) | [PicoDet_NCNN](./third_engine/demo_ncnn/README.md) | [PicoDet_OPENVINO](./third_engine/demo_openvino/README.md) |
+| TinyPose | [TinyPose_MNN](./third_engine/demo_mnn_kpts/README.md) | - | [TinyPose_OPENVINO](./third_engine/demo_openvino_kpts/README.md) |
+
+
+## 5. Benchmark Test
+- Using the exported model, run the benchmark batch test script:
+```shell
+sh deploy/benchmark/benchmark.sh {model_dir} {model_name}
+```
+**Attention:** For a quantized model, please use the `deploy/benchmark/benchmark_quant.sh` script instead.
+- Export the test result log to Excel:
+```shell
+python deploy/benchmark/log_parser_excel.py --log_path=./output_pipeline --output_name=benchmark_excel.xlsx
+```
+
+## 6. FAQ
+- 1. Can models trained with `Paddle 1.8.4` be deployed with `Paddle 2.0`?
+  Paddle 2.0 is compatible with Paddle 1.8.4, so in general yes. However, some models (such as SOLOv2) rely on operators newly added in Paddle 2.0 and cannot be deployed this way.
+
+- 2. When compiling on Windows, the prediction library was built with VS2015; is it a problem to use VS2017 or VS2019?
+  For VS compatibility issues, please refer to: [Binary compatibility between C++ Visual Studio 2015, 2017 and 2019](https://docs.microsoft.com/zh-cn/cpp/porting/binary-compat-2015-2017?view=msvc-160)
+
+- 3. Does continuous prediction with cuDNN 8.0.4 leak memory?
+  QA testing found that the cuDNN 8 series leaks memory during continuous prediction and performs worse than cuDNN 7, so deployment with CUDA + cuDNN 7.6.4 is recommended.
diff --git a/deploy/TENSOR_RT.md b/deploy/TENSOR_RT.md new file mode 100644 index 0000000000000000000000000000000000000000..b1dd29789540746cce5f7ea3ce0a783e2178438d --- /dev/null +++ b/deploy/TENSOR_RT.md @@ -0,0 +1,98 @@ +# TensorRT预测部署教程 +TensorRT是NVIDIA提出的用于统一模型部署的加速库,可以应用于V100、JETSON Xavier等硬件,它可以极大提高预测速度。Paddle TensorRT教程请参考文档[使用Paddle-TensorRT库预测](https://www.paddlepaddle.org.cn/inference/optimize/paddle_trt.html) + +## 1. 安装PaddleInference预测库 +- Python安装包,请从[这里](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#python) 下载带有tensorrt的安装包进行安装 + +- CPP预测库,请从[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 下载带有TensorRT编译的预测库 + +- 如果Python和CPP官网没有提供已编译好的安装包或预测库,请参考[源码安装](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html) 自行编译 + +**注意:** +- 您的机器上TensorRT的版本需要跟您使用的预测库中TensorRT版本保持一致。 +- PaddleDetection中部署预测要求TensorRT版本 > 6.0。 + +## 2. 导出模型 +模型导出具体请参考文档[PaddleDetection模型导出教程](./EXPORT_MODEL.md)。 + +## 3. 开启TensorRT加速 +### 3.1 配置TensorRT +在使用Paddle预测库构建预测器配置config时,打开TensorRT引擎就可以了: + +``` +config->EnableUseGpu(100, 0); // 初始化100M显存,使用GPU ID为0 +config->GpuDeviceId(); // 返回正在使用的GPU ID +// 开启TensorRT预测,可提升GPU预测性能,需要使用带TensorRT的预测库 +config->EnableTensorRtEngine(1 << 20 /*workspace_size*/, + batch_size /*max_batch_size*/, + 3 /*min_subgraph_size*/, + AnalysisConfig::Precision::kFloat32 /*precision*/, + false /*use_static*/, + false /*use_calib_mode*/); + +``` +**注意:** + --run_benchmark如果设置为True,则需要安装依赖`pip install pynvml psutil GPUtil`。 + +### 3.2 TensorRT固定尺寸预测 + +例如在模型Reader配置文件中设置: +```yaml +TestReader: + inputs_def: + image_shape: [3,608,608] + ... +``` +或者在导出模型时设置`-o TestReader.inputs_def.image_shape=[3,608,608]`,模型将会进行固定尺寸预测,具体请参考[PaddleDetection模型导出教程](./EXPORT_MODEL.md) 。 + +可以通过[visualdl](https://www.paddlepaddle.org.cn/paddle/visualdl/demo/graph) 打开`model.pdmodel`文件,查看输入的第一个Tensor尺寸是否是固定的,如果不指定,尺寸会用`?`表示,如下图所示: +![img](../docs/images/input_shape.png) + + +注意:由于TesnorRT不支持在batch维度进行slice操作,Faster RCNN 和 Mask RCNN不能使用固定尺寸输入预测,所以不能设置`TestReader.inputs_def.image_shape`字段。 + +以`YOLOv3`为例,使用固定尺寸输入预测: +``` +python python/infer.py --model_dir=./output_inference/yolov3_darknet53_270e_coco/ --image_file=./demo/000000014439.jpg --device=GPU --run_mode=trt_fp32 --run_benchmark=True +``` + +### 3.3 TensorRT动态尺寸预测 + +TensorRT版本>=6时,使用TensorRT预测时,可以支持动态尺寸输入。如果模型Reader配置文件中没有设置例如`TestReader.inputs_def.image_shape=[3,608,608]`的字段,或者`image_shape=[3.-1,-1]`,导出模型将以动态尺寸进行预测。一般RCNN系列模型使用动态图尺寸预测。 +Paddle预测库关于动态尺寸输入请查看[Paddle CPP预测](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html) 的`SetTRTDynamicShapeInfo`函数说明。 + +`python/infer.py`设置动态尺寸输入参数说明: + +- trt_min_shape 用于设定TensorRT的输入图像height、width中的最小尺寸,默认值:1 + +- trt_max_shape 用于设定TensorRT的输入图像height、width中的最大尺寸,默认值:1280 + +- trt_opt_shape 用于设定TensorRT的输入图像height、width中的最优尺寸,默认值:640 + +**注意:`TensorRT`中动态尺寸设置是4维的,这里只设置输入图像的尺寸。** + +以`Faster RCNN`为例,使用动态尺寸输入预测: +``` +python python/infer.py --model_dir=./output_inference/faster_rcnn_r50_fpn_1x_coco/ --image_file=./demo/000000014439.jpg --device=GPU --run_mode=trt_fp16 --run_benchmark=True --trt_max_shape=1280 --trt_min_shape=800 --trt_opt_shape=960 +``` + +## 4、常见问题QA +**Q:** 提示没有`tensorrt_op`
+**A:** 请检查是否使用带有TensorRT的Paddle Python包或预测库。 + +**Q:** 提示`op out of memory`
+**A:** 请检查GPU是否被其他进程占用,尝试使用空闲的GPU。 + +**Q:** 提示`some trt inputs dynamic shape info not set`
+**A:** 这是由于`TensorRT`会把网络结构划分成多个子图,我们只设置了输入数据的动态尺寸,划分出的其他子图的输入并未设置动态尺寸。有两个解决方法: + +- 方法一:通过增大`min_subgraph_size`,跳过对这些子图的优化。根据提示,将`min_subgraph_size`设置为大于未设置动态尺寸输入的子图中OP个数即可。 +`min_subgraph_size`的含义是:加载TensorRT引擎时,只有OP个数大于`min_subgraph_size`、且这些OP连续并可被TensorRT优化的子图才会被优化。 + +- 方法二:找到这些子图的输入,按照上面的方式同样为它们设置动态尺寸。 + +**Q:** 如何打开日志
+**A:** 日志默认被`config.disable_glog_info()`关闭,注释掉这一行即可打开日志。 + +**Q:** 开启TensorRT,预测时提示Slice on batch axis is not supported in TensorRT
+**A:** 请尝试使用动态尺寸输入 diff --git a/deploy/auto_compression/README.md b/deploy/auto_compression/README.md new file mode 100644 index 0000000000000000000000000000000000000000..11e50138663e7e41788301638e479912029d7b6a --- /dev/null +++ b/deploy/auto_compression/README.md @@ -0,0 +1,186 @@ +# 自动化压缩 + +目录: +- [1.简介](#1简介) +- [2.Benchmark](#2Benchmark) +- [3.开始自动压缩](#自动压缩流程) + - [3.1 环境准备](#31-准备环境) + - [3.2 准备数据集](#32-准备数据集) + - [3.3 准备预测模型](#33-准备预测模型) + - [3.4 测试模型精度](#34-测试模型精度) + - [3.5 自动压缩并产出模型](#35-自动压缩并产出模型) +- [4.预测部署](#4预测部署) + +## 1. 简介 +本示例使用PaddleDetection中Inference部署模型进行自动化压缩,使用的自动化压缩策略为量化蒸馏。 + + +## 2.Benchmark + +### PP-YOLOE+ + +| 模型 | Base mAP | 离线量化mAP | ACT量化mAP | TRT-FP32 | TRT-FP16 | TRT-INT8 | 配置文件 | 量化模型 | +| :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :----------------------: | :---------------------: | +| PP-YOLOE+_s | 43.7 | - | 42.9 | - | - | - | [config](./configs/ppyoloe_plus_s_qat_dis.yaml) | [Quant Model](https://bj.bcebos.com/v1/paddledet/deploy/Inference/ppyoloe_plus_s_qat_dis.tar) | +| PP-YOLOE+_m | 49.8 | - | 49.3 | - | - | - | [config](./configs/ppyoloe_plus_m_qat_dis.yaml) | [Quant Model](https://bj.bcebos.com/v1/paddledet/deploy/Inference/ppyoloe_plus_m_qat_dis.tar) | +| PP-YOLOE+_l | 52.9 | - | 52.6 | - | - | - | [config](./configs/ppyoloe_plus_l_qat_dis.yaml) | [Quant Model](https://bj.bcebos.com/v1/paddledet/deploy/Inference/ppyoloe_plus_l_qat_dis.tar) | +| PP-YOLOE+_x | 54.7 | - | 54.4 | - | - | - | [config](./configs/ppyoloe_plus_x_qat_dis.yaml) | [Quant Model](https://bj.bcebos.com/v1/paddledet/deploy/Inference/ppyoloe_plus_x_qat_dis.tar) | + +- mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 + +### YOLOv8 + +| 模型 | Base mAP | 离线量化mAP | ACT量化mAP | TRT-FP32 | TRT-FP16 | TRT-INT8 | 配置文件 | 量化模型 | +| :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :----------------------: | :---------------------: | +| YOLOv8-s | 44.9 | 43.9 | 44.3 | 9.27ms | 4.65ms | **3.78ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/yolov8_s_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov8_s_500e_coco_trt_nms_quant.tar) | + +**注意:** +- 表格中YOLOv8模型均为带NMS的模型,可直接在TRT中部署,如果需要对齐测试标准,需要测试不带NMS的模型。 +- mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 +- 表格中的性能在Tesla T4的GPU环境下测试,并且开启TensorRT,batch_size=1。 + +### PP-YOLOE + +| 模型 | Base mAP | 离线量化mAP | ACT量化mAP | TRT-FP32 | TRT-FP16 | TRT-INT8 | 配置文件 | 量化模型 | +| :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :----------------------: | :---------------------: | +| PP-YOLOE-l | 50.9 | - | 50.6 | 11.2ms | 7.7ms | **6.7ms** | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/deploy/auto_compression/configs/ppyoloe_l_qat_dis.yaml) | [Quant Model](https://bj.bcebos.com/v1/paddle-slim-models/act/ppyoloe_crn_l_300e_coco_quant.tar) | +| PP-YOLOE-SOD | 38.5 | - | 37.6 | - | - | - | [config](./configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_qat.yml) | [Quant Model](https://bj.bcebos.com/v1/paddle-slim-models/act/ppyoloe_sod_visdrone.tar) | + +git +- PP-YOLOE-l mAP的指标在COCO val2017数据集中评测得到,IoU=0.5:0.95。 +- PP-YOLOE-l模型在Tesla V100的GPU环境下测试,并且开启TensorRT,batch_size=1,包含NMS,测试脚本是[benchmark demo](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy/python)。 +- PP-YOLOE-SOD 
的指标在VisDrone-DET数据集切图后的COCO格式[数据集](https://bj.bcebos.com/v1/paddledet/data/smalldet/visdrone_sliced.zip)中评测得到,IoU=0.5:0.95。定义文件[ppyoloe_crn_l_80e_sliced_visdrone_640_025.yml](../../configs/smalldet/ppyoloe_crn_l_80e_sliced_visdrone_640_025.yml) + +### PP-PicoDet + +| 模型 | 策略 | mAP | FP32 | FP16 | INT8 | 配置文件 | 模型 | +| :-------- |:-------- |:--------: | :----------------: | :----------------: | :---------------: | :----------------------: | :---------------------: | +| PicoDet-S-NPU | Baseline | 30.1 | - | - | - | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet/picodet_s_416_coco_npu.yml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/picodet_s_416_coco_npu.tar) | +| PicoDet-S-NPU | 量化训练 | 29.7 | - | - | - | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/full_quantization/detection/configs/picodet_s_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/picodet_s_npu_quant.tar) | + +- mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 + +### RT-DETR + +| 模型 | Base mAP | ACT量化mAP | TRT-FP32 | TRT-FP16 | TRT-INT8 | 配置文件 | 量化模型 | +| :---------------- | :------- | :--------: | :------: | :------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| RT-DETR-R50 | 53.1 | 53.0 | 32.05ms | 9.12ms | **6.96ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_r50vd_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_r50vd_6x_coco_quant.tar) | +| RT-DETR-R101 | 54.3 | 54.1 | 54.13ms | 12.68ms | **9.20ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_r101vd_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_r101vd_6x_coco_quant.tar) | +| RT-DETR-HGNetv2-L | 53.0 | 52.9 | 26.16ms | 8.54ms | **6.65ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_hgnetv2_l_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_hgnetv2_l_6x_coco_quant.tar) | +| RT-DETR-HGNetv2-X | 54.8 | 54.6 | 49.22ms | 12.50ms | **9.24ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_hgnetv2_x_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_hgnetv2_x_6x_coco_quant.tar) | + +- 上表测试环境:Tesla T4,TensorRT 8.6.0,CUDA 11.7,batch_size=1。 + +| 模型 | Base mAP | ACT量化mAP | TRT-FP32 | TRT-FP16 | TRT-INT8 | 配置文件 | 量化模型 | +| :---------------- | :------- | :--------: | :------: | :------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| RT-DETR-R50 | 53.1 | 53.0 | 9.64ms | 5.00ms | **3.99ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_r50vd_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_r50vd_6x_coco_quant.tar) | +| RT-DETR-R101 | 54.3 | 54.1 | 14.93ms | 7.15ms | **5.12ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_r101vd_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_r101vd_6x_coco_quant.tar) | +| RT-DETR-HGNetv2-L | 53.0 | 52.9 | 8.17ms | 4.77ms | **4.00ms** | 
[config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_hgnetv2_l_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_hgnetv2_l_6x_coco_quant.tar) | +| RT-DETR-HGNetv2-X | 54.8 | 54.6 | 12.81ms | 6.97ms | **5.32ms** | [config](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/detection/configs/rtdetr_hgnetv2_x_qat_dis.yaml) | [Model](https://bj.bcebos.com/v1/paddle-slim-models/act/rtdetr_hgnetv2_x_6x_coco_quant.tar) | + +- 上表测试环境:A10,TensorRT 8.6.0,CUDA 11.6,batch_size=1。 +- mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 + +## 3. 自动压缩流程 + +#### 3.1 准备环境 +- PaddlePaddle >= 2.4 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) +- PaddleSlim >= 2.4.1 +- PaddleDet >= 2.5 +- opencv-python + +安装paddlepaddle: +```shell +# CPU +pip install paddlepaddle +# GPU +pip install paddlepaddle-gpu +``` + +安装paddleslim: +```shell +pip install paddleslim +``` + +安装paddledet: +```shell +pip install paddledet +``` + +**注意:** YOLOv8模型的自动化压缩需要依赖安装最新[Develop Paddle](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)和[Develop PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim#%E5%AE%89%E8%A3%85)版本。 + +#### 3.2 准备数据集 + +本案例默认以COCO数据进行自动压缩实验,如果自定义COCO数据,或者其他格式数据,请参考[数据准备文档](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/docs/tutorials/data/PrepareDataSet.md) 来准备数据。 + +如果数据集为非COCO格式数据,请修改[configs](./configs)中reader配置文件中的Dataset字段。 + +以PP-YOLOE模型为例,如果已经准备好数据集,请直接修改[./configs/yolo_reader.yml]中`EvalDataset`的`dataset_dir`字段为自己数据集路径即可。 + +#### 3.3 准备预测模型 + +预测模型的格式为:`model.pdmodel` 和 `model.pdiparams`两个,带`pdmodel`的是模型文件,带`pdiparams`后缀的是权重文件。 + + +根据[PaddleDetection文档](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/docs/tutorials/GETTING_STARTED_cn.md#8-%E6%A8%A1%E5%9E%8B%E5%AF%BC%E5%87%BA) 导出Inference模型,具体可参考下方PP-YOLOE模型的导出示例: +- 下载代码 +``` +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` +- 导出预测模型 + +PPYOLOE-l模型,包含NMS:如快速体验,可直接下载[PP-YOLOE-l导出模型](https://bj.bcebos.com/v1/paddle-slim-models/act/ppyoloe_crn_l_300e_coco.tar) +```shell +python tools/export_model.py \ + -c configs/ppyoloe/ppyoloe_crn_l_300e_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams \ + trt=True \ +``` + +YOLOv8-s模型,包含NMS,具体可参考[YOLOv8模型文档](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.5/configs/yolov8), 然后执行: +```shell +python tools/export_model.py \ + -c configs/yolov8/yolov8_s_500e_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams \ + trt=True +``` + +如快速体验,可直接下载[YOLOv8-s导出模型](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov8_s_500e_coco_trt_nms.tar) + +#### 3.4 自动压缩并产出模型 + +蒸馏量化自动压缩示例通过run.py脚本启动,会使用接口```paddleslim.auto_compression.AutoCompression```对模型进行自动压缩。配置config文件中模型路径、蒸馏、量化、和训练等部分的参数,配置完成后便可对模型进行量化和蒸馏。具体运行命令为: + +- 单卡训练: +``` +export CUDA_VISIBLE_DEVICES=0 +python run.py --config_path=./configs/ppyoloe_l_qat_dis.yaml --save_dir='./output/' +``` + +- 多卡训练: +``` +CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 run.py \ + --config_path=./configs/ppyoloe_l_qat_dis.yaml --save_dir='./output/' +``` + +#### 3.5 测试模型精度 + +使用eval.py脚本得到模型的mAP: +``` +export CUDA_VISIBLE_DEVICES=0 +python eval.py --config_path=./configs/ppyoloe_l_qat_dis.yaml +``` + +使用paddle inference并使用trt int8得到模型的mAP: +``` +export 
CUDA_VISIBLE_DEVICES=0 +python paddle_inference_eval.py --model_path ./output/ --reader_config configs/ppyoloe_reader.yml --precision int8 --use_trt=True +``` + +**注意**: +- 要测试的模型路径可以在配置文件中`model_dir`字段下进行修改。 +- --precision 默认为paddle,如果使用trt,需要设置--use_trt=True,同时--precision 可设置为fp32/fp16/int8 + +## 4.预测部署 + +- 可以参考[PaddleDetection部署教程](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/deploy),GPU上量化模型开启TensorRT并设置trt_int8模式进行部署。 diff --git a/deploy/auto_compression/configs/picodet_reader.yml b/deploy/auto_compression/configs/picodet_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..952a978ae32723e5a98bc63989e473d04e480c7c --- /dev/null +++ b/deploy/auto_compression/configs/picodet_reader.yml @@ -0,0 +1,32 @@ +metric: COCO +num_classes: 80 + + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 6 +eval_height: &eval_height 416 +eval_width: &eval_width 416 +eval_size: &eval_size [*eval_height, *eval_width] + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False} + - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1], is_scale: True} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 8 + shuffle: false diff --git a/deploy/auto_compression/configs/picodet_s_qat_dis.yaml b/deploy/auto_compression/configs/picodet_s_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5012be15a1e6791b27a9053417709ed96830bb0 --- /dev/null +++ b/deploy/auto_compression/configs/picodet_s_qat_dis.yaml @@ -0,0 +1,34 @@ +Global: + reader_config: ./configs/picodet_reader.yml + include_nms: True + Evaluation: True + model_dir: ./picodet_s_416_coco_npu/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: l2 + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + weight_bits: 8 + activation_bits: 8 + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00001 + T_max: 8000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + + diff --git a/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_qat.yml b/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_qat.yml new file mode 100644 index 0000000000000000000000000000000000000000..84132455cad5a69b6e37d74343c6c7bae66c2c1a --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_qat.yml @@ -0,0 +1,34 @@ + +Global: + reader_config: configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_reader.yml + input_list: ['image', 'scale_factor'] + arch: YOLO + include_nms: True + Evaluation: True + model_dir: ../../output_inference/ppyoloe_crn_l_80e_sliced_visdrone_640_025 + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: True + use_pact: False + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 500 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + 
optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_reader.yml b/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..0869a4a10ac8355cd20a80ba0d71cddd5a6f0008 --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_crn_l_80e_sliced_visdrone_640_025_reader.yml @@ -0,0 +1,25 @@ +metric: COCO +num_classes: 10 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train_images_640_025 + anno_path: train_640_025.json + dataset_dir: dataset/visdrone_sliced + +EvalDataset: + !COCODataSet + image_dir: val_images_640_025 + anno_path: val_640_025.json + dataset_dir: dataset/visdrone_sliced +worker_num: 0 + +# preprocess reader in test +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + #- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 16 diff --git a/deploy/auto_compression/configs/ppyoloe_l_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_l_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df346d2b00ec24b351f2d62974a13e33293f431b --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_l_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_reader.yml + include_nms: True + Evaluation: True + model_dir: ./ppyoloe_crn_l_300e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 5000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_plus_crn_t_auxhead_300e_coco_qat.yml b/deploy/auto_compression/configs/ppyoloe_plus_crn_t_auxhead_300e_coco_qat.yml new file mode 100644 index 0000000000000000000000000000000000000000..7f8c48ea482630bf2111de274e0316176406fcb8 --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_crn_t_auxhead_300e_coco_qat.yml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ../../output_inference/ppyoloe_plus_crn_t_auxhead_300e_coco/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: True + use_pact: False + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/ppyoloe_plus_l_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_plus_l_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd03aed09d9a1ed3a67eec3283ef227224e941fb --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_l_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ./ppyoloe_plus_crn_l_80e_coco + 
model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 5000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_plus_m_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_plus_m_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d31332f5e3745604e03c50ad2f9db62376c1373 --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_m_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ./ppyoloe_plus_crn_m_80e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 5000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_plus_reader.yml b/deploy/auto_compression/configs/ppyoloe_plus_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..5f3795f29be025e6836a7c88b51dd79ecb04a9f4 --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_reader.yml @@ -0,0 +1,26 @@ +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 0 + +# preprocess reader in test +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 4 diff --git a/deploy/auto_compression/configs/ppyoloe_plus_s_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_plus_s_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41bfde1e47855cdd1c543d13292d387781b8c0d6 --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_s_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ./ppyoloe_plus_crn_s_80e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 5000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_plus_sod_crn_l_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_plus_sod_crn_l_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..793afa1a0eba1c45a16042d4d2bf96d7c2777353 --- /dev/null 
+++ b/deploy/auto_compression/configs/ppyoloe_plus_sod_crn_l_qat_dis.yaml @@ -0,0 +1,33 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ../../output_inference/ppyoloe_plus_sod_crn_l_80e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: True + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 1 + eval_iter: 1 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_plus_x_qat_dis.yaml b/deploy/auto_compression/configs/ppyoloe_plus_x_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac62e7ca2d22bae19ffcf99f8265a05ea7e1331c --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_plus_x_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/ppyoloe_plus_reader.yml + include_nms: True + Evaluation: True + model_dir: ./ppyoloe_plus_crn_x_80e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 5000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 6000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/configs/ppyoloe_reader.yml b/deploy/auto_compression/configs/ppyoloe_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..d1061453051e8f7408f4e605078956a8b634f13c --- /dev/null +++ b/deploy/auto_compression/configs/ppyoloe_reader.yml @@ -0,0 +1,26 @@ +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 0 + +# preprocess reader in test +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 4 diff --git a/deploy/auto_compression/configs/rtdetr_hgnetv2_l_qat_dis.yaml b/deploy/auto_compression/configs/rtdetr_hgnetv2_l_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83ebd78b1a343c3401830076ab90157a83947026 --- /dev/null +++ b/deploy/auto_compression/configs/rtdetr_hgnetv2_l_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/rtdetr_reader.yml + include_nms: True + Evaluation: True + model_dir: ./rtdetr_hgnetv2_l_6x_coco/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + - matmul_v2 + +TrainConfig: + train_iter: 200 + eval_iter: 50 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 10000 + optimizer_builder: + optimizer: + type: SGD + 
weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/rtdetr_hgnetv2_x_qat_dis.yaml b/deploy/auto_compression/configs/rtdetr_hgnetv2_x_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4e2889bbd3c69e758199f99b6fd517d227799d0 --- /dev/null +++ b/deploy/auto_compression/configs/rtdetr_hgnetv2_x_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/rtdetr_reader.yml + include_nms: True + Evaluation: True + model_dir: ./rtdetr_r50vd_6x_coco/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + - matmul_v2 + +TrainConfig: + train_iter: 500 + eval_iter: 100 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 10000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/rtdetr_r101vd_qat_dis.yaml b/deploy/auto_compression/configs/rtdetr_r101vd_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd96d085e8a01cabba890f2602e9619855bf9d5a --- /dev/null +++ b/deploy/auto_compression/configs/rtdetr_r101vd_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/rtdetr_reader.yml + include_nms: True + Evaluation: True + model_dir: ./rtdetr_hgnetv2_x_6x_coco/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + - matmul_v2 + +TrainConfig: + train_iter: 200 + eval_iter: 50 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 10000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/rtdetr_r50vd_qat_dis.yaml b/deploy/auto_compression/configs/rtdetr_r50vd_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4e2889bbd3c69e758199f99b6fd517d227799d0 --- /dev/null +++ b/deploy/auto_compression/configs/rtdetr_r50vd_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/rtdetr_reader.yml + include_nms: True + Evaluation: True + model_dir: ./rtdetr_r50vd_6x_coco/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + - matmul_v2 + +TrainConfig: + train_iter: 500 + eval_iter: 100 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 10000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 diff --git a/deploy/auto_compression/configs/rtdetr_reader.yml b/deploy/auto_compression/configs/rtdetr_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..7b213ffa202f8812f337f223c721a829fd8a55df --- /dev/null +++ b/deploy/auto_compression/configs/rtdetr_reader.yml @@ -0,0 +1,26 @@ +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 0 + +# preprocess reader in test +EvalReader: + 
sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/deploy/auto_compression/configs/yolov5_reader.yml b/deploy/auto_compression/configs/yolov5_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..6ad321a04d12f822e98facd179d9d72b0d8aa741 --- /dev/null +++ b/deploy/auto_compression/configs/yolov5_reader.yml @@ -0,0 +1,26 @@ +metric: COCO +num_classes: 80 + +# Datset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 0 + +# preprocess reader in test +TestReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: True, interp: 1} + - Pad: {size: [640, 640], fill_value: [114., 114., 114.]} + - Permute: {} + batch_size: 1 diff --git a/deploy/auto_compression/configs/yolov5_s_qat_dis.yml b/deploy/auto_compression/configs/yolov5_s_qat_dis.yml new file mode 100644 index 0000000000000000000000000000000000000000..309977ef696ab23cc859fa224486e2ed7e91900e --- /dev/null +++ b/deploy/auto_compression/configs/yolov5_s_qat_dis.yml @@ -0,0 +1,29 @@ + +Global: + reader_config: configs/yolov5_reader.yml + include_nms: True + Evaluation: True + model_dir: ./yolov5_s_300e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + use_pact: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 3000 + eval_iter: 1000 + learning_rate: 0.00001 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + target_metric: 0.365 diff --git a/deploy/auto_compression/configs/yolov6mt_s_qat_dis.yaml b/deploy/auto_compression/configs/yolov6mt_s_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e134494fe2833333f3b2bcf87edb71e0b870a56f --- /dev/null +++ b/deploy/auto_compression/configs/yolov6mt_s_qat_dis.yaml @@ -0,0 +1,30 @@ + +Global: + reader_config: configs/yolov5_reader.yml + include_nms: True + Evaluation: True + model_dir: ./yolov6mt_s_400e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 8000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 0.00004 diff --git a/deploy/auto_compression/configs/yolov7_l_qat_dis.yaml b/deploy/auto_compression/configs/yolov7_l_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..801ccb4057c4f36fe379c281a21965ddc63a2e8b --- /dev/null +++ b/deploy/auto_compression/configs/yolov7_l_qat_dis.yaml @@ -0,0 +1,30 @@ + +Global: + reader_config: configs/yolov5_reader.yml + include_nms: True + Evaluation: True + model_dir: ./yolov7_l_300e_coco + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + activation_quantize_type: 'moving_average_abs_max' + 
quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 8000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 0.00004 \ No newline at end of file diff --git a/deploy/auto_compression/configs/yolov8_reader.yml b/deploy/auto_compression/configs/yolov8_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..202a49415572201811ed53fe806c2b31c9051fde --- /dev/null +++ b/deploy/auto_compression/configs/yolov8_reader.yml @@ -0,0 +1,27 @@ +metric: COCO +num_classes: 80 + +# Dataset configuration +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco/ + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco/ + +worker_num: 0 + +# preprocess reader in test +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: True, interp: 1} + - Pad: {size: [640, 640], fill_value: [114., 114., 114.]} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 4 diff --git a/deploy/auto_compression/configs/yolov8_s_qat_dis.yaml b/deploy/auto_compression/configs/yolov8_s_qat_dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c93e203e918d798e055e260d73f747a6ef9d5cb --- /dev/null +++ b/deploy/auto_compression/configs/yolov8_s_qat_dis.yaml @@ -0,0 +1,32 @@ + +Global: + reader_config: configs/yolov8_reader.yml + include_nms: True + Evaluation: True + model_dir: ./yolov8_s_500e_coco_trt_nms/ + model_filename: model.pdmodel + params_filename: model.pdiparams + +Distillation: + alpha: 1.0 + loss: soft_label + +QuantAware: + onnx_format: true + activation_quantize_type: 'moving_average_abs_max' + quantize_op_types: + - conv2d + - depthwise_conv2d + +TrainConfig: + train_iter: 8000 + eval_iter: 1000 + learning_rate: + type: CosineAnnealingDecay + learning_rate: 0.00003 + T_max: 10000 + optimizer_builder: + optimizer: + type: SGD + weight_decay: 4.0e-05 + diff --git a/deploy/auto_compression/eval.py b/deploy/auto_compression/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..6de8aff85ce5f3cffa4119a1a3c26e318101db74 --- /dev/null +++ b/deploy/auto_compression/eval.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
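The QAT YAML files above are consumed by the scripts in this directory rather than by PaddleDetection itself: eval.py (below) and run.py load them through PaddleSlim's config helper and read the `Global` section first. A minimal sketch of inspecting one of these configs the same way, assuming it is run from `deploy/auto_compression/` with PaddleSlim installed; the exact set of top-level keys returned depends on the YAML file:

```python
# Sketch only: mirrors how eval.py / run.py below load their --config_path argument.
from paddleslim.auto_compression.config_helpers import load_config as load_slim_config

all_config = load_slim_config("./configs/yolov8_s_qat_dis.yaml")
global_config = all_config["Global"]       # model_dir, reader_config, include_nms, ...
print(global_config["model_dir"])          # ./yolov8_s_500e_coco_trt_nms/
# Other sections (Distillation, QuantAware, TrainConfig) are passed through as-is
# when present in the YAML; which ones exist depends on the chosen config.
print(all_config.get("TrainConfig"))
```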
+ +import os +import sys +import numpy as np +import argparse +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.core.workspace import create +from ppdet.metrics import COCOMetric, VOCMetric, KeyPointTopDownCOCOEval +from paddleslim.auto_compression.config_helpers import load_config as load_slim_config +from post_process import PPYOLOEPostProcess + + +def argsparser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--config_path', + type=str, + default=None, + help="path of compression strategy config.", + required=True) + parser.add_argument( + '--devices', + type=str, + default='gpu', + help="which device used to compress.") + + return parser + + +def reader_wrapper(reader, input_list): + def gen(): + for data in reader: + in_dict = {} + if isinstance(input_list, list): + for input_name in input_list: + in_dict[input_name] = data[input_name] + elif isinstance(input_list, dict): + for input_name in input_list.keys(): + in_dict[input_list[input_name]] = data[input_name] + yield in_dict + + return gen + + +def convert_numpy_data(data, metric): + data_all = {} + data_all = {k: np.array(v) for k, v in data.items()} + if isinstance(metric, VOCMetric): + for k, v in data_all.items(): + if not isinstance(v[0], np.ndarray): + tmp_list = [] + for t in v: + tmp_list.append(np.array(t)) + data_all[k] = np.array(tmp_list) + else: + data_all = {k: np.array(v) for k, v in data.items()} + return data_all + + +def eval(): + + place = paddle.CUDAPlace(0) if FLAGS.devices == 'gpu' else paddle.CPUPlace() + exe = paddle.static.Executor(place) + + val_program, feed_target_names, fetch_targets = paddle.static.load_inference_model( + global_config["model_dir"].rstrip('/'), + exe, + model_filename=global_config["model_filename"], + params_filename=global_config["params_filename"]) + print('Loaded model from: {}'.format(global_config["model_dir"])) + + metric = global_config['metric'] + for batch_id, data in enumerate(val_loader): + data_all = convert_numpy_data(data, metric) + data_input = {} + for k, v in data.items(): + if isinstance(global_config['input_list'], list): + if k in global_config['input_list']: + data_input[k] = np.array(v) + elif isinstance(global_config['input_list'], dict): + if k in global_config['input_list'].keys(): + data_input[global_config['input_list'][k]] = np.array(v) + + outs = exe.run(val_program, + feed=data_input, + fetch_list=fetch_targets, + return_numpy=False) + res = {} + if 'arch' in global_config and global_config['arch'] == 'PPYOLOE': + postprocess = PPYOLOEPostProcess( + score_threshold=0.01, nms_threshold=0.6) + res = postprocess(np.array(outs[0]), data_all['scale_factor']) + else: + for out in outs: + v = np.array(out) + if len(v.shape) > 1: + res['bbox'] = v + else: + res['bbox_num'] = v + metric.update(data_all, res) + if batch_id % 100 == 0: + print('Eval iter:', batch_id) + metric.accumulate() + metric.log() + metric.reset() + + +def main(): + global global_config + all_config = load_slim_config(FLAGS.config_path) + assert "Global" in all_config, "Key 'Global' not found in config file." 
+ global_config = all_config["Global"] + reader_cfg = load_config(global_config['reader_config']) + + dataset = reader_cfg['EvalDataset'] + global val_loader + val_loader = create('EvalReader')(reader_cfg['EvalDataset'], + reader_cfg['worker_num'], + return_list=True) + metric = None + if reader_cfg['metric'] == 'COCO': + clsid2catid = {v: k for k, v in dataset.catid2clsid.items()} + anno_file = dataset.get_anno() + metric = COCOMetric( + anno_file=anno_file, clsid2catid=clsid2catid, IouType='bbox') + elif reader_cfg['metric'] == 'VOC': + metric = VOCMetric( + label_list=dataset.get_label_list(), + class_num=reader_cfg['num_classes'], + map_type=reader_cfg['map_type']) + elif reader_cfg['metric'] == 'KeyPointTopDownCOCOEval': + anno_file = dataset.get_anno() + metric = KeyPointTopDownCOCOEval(anno_file, + len(dataset), 17, 'output_eval') + else: + raise ValueError("metric currently only supports COCO and VOC.") + global_config['metric'] = metric + + eval() + + +if __name__ == '__main__': + paddle.enable_static() + parser = argsparser() + FLAGS = parser.parse_args() + assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu'] + paddle.set_device(FLAGS.devices) + + main() diff --git a/deploy/auto_compression/paddle_inference_eval.py b/deploy/auto_compression/paddle_inference_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..053ee35e752c17fdb0069c6953b5e8f43f6aded1 --- /dev/null +++ b/deploy/auto_compression/paddle_inference_eval.py @@ -0,0 +1,499 @@ +#opyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
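Both eval.py above and run.py later in this directory wrap the ppdet dataloader so that every batch becomes a feed dict keyed by the inference model's input names: when `input_list` is a list the reader field names are used unchanged, and when it is a dict each reader field is renamed to the corresponding feed name. A small self-contained illustration of that mapping, simplified to a plain generator; the tensors are replaced by strings and `x2paddle_image` is a made-up feed name used only to show the renaming case:

```python
# Stand-alone illustration of the reader_wrapper() list/dict behaviour.
batches = [{"image": "<image tensor>", "scale_factor": "<sf tensor>", "im_id": 0}]

def wrap(reader, input_list):
    for data in reader:
        if isinstance(input_list, list):       # keep the reader field names
            yield {name: data[name] for name in input_list}
        elif isinstance(input_list, dict):     # reader field -> model feed name
            yield {feed: data[field] for field, feed in input_list.items()}

print(next(wrap(batches, ["image", "scale_factor"])))
# {'image': '<image tensor>', 'scale_factor': '<sf tensor>'}
print(next(wrap(batches, {"image": "x2paddle_image"})))
# {'x2paddle_image': '<image tensor>'}
```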
+ +import os +import argparse +import time +import sys +import cv2 +import numpy as np + +import paddle +from paddle.inference import Config +from paddle.inference import create_predictor +from ppdet.core.workspace import load_config, create +from ppdet.metrics import COCOMetric + +from post_process import PPYOLOEPostProcess + + +def argsparser(): + """ + argsparser func + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_path", type=str, help="inference model filepath") + parser.add_argument( + "--image_file", + type=str, + default=None, + help="image path, if set image_file, it will not eval coco.") + parser.add_argument( + "--reader_config", + type=str, + default=None, + help="path of datset and reader config.") + parser.add_argument( + "--benchmark", + type=bool, + default=False, + help="Whether run benchmark or not.") + parser.add_argument( + "--use_trt", + type=bool, + default=False, + help="Whether use TensorRT or not.") + parser.add_argument( + "--precision", + type=str, + default="paddle", + help="mode of running(fp32/fp16/int8)") + parser.add_argument( + "--device", + type=str, + default="GPU", + help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU", + ) + parser.add_argument( + "--use_dynamic_shape", + type=bool, + default=True, + help="Whether use dynamic shape or not.") + parser.add_argument( + "--use_mkldnn", + type=bool, + default=False, + help="Whether use mkldnn or not.") + parser.add_argument( + "--cpu_threads", type=int, default=10, help="Num of cpu threads.") + parser.add_argument("--img_shape", type=int, default=640, help="input_size") + parser.add_argument( + '--include_nms', + type=bool, + default=True, + help="Whether include nms or not.") + + return parser + + +CLASS_LABEL = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush' +] + + +def generate_scale(im, target_shape, keep_ratio=True): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + if keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(target_shape) + target_size_max = np.max(target_shape) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = target_shape + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + +def image_preprocess(img_path, target_shape): + """ + image_preprocess func + 
""" + img = cv2.imread(img_path) + im_scale_y, im_scale_x = generate_scale(img, target_shape, keep_ratio=False) + img = cv2.resize( + img, (target_shape[0], target_shape[0]), + interpolation=cv2.INTER_LANCZOS4) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, [2, 0, 1]) / 255 + img = np.expand_dims(img, 0) + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + scale_factor = np.array([[im_scale_y, im_scale_x]]) + return img.astype(np.float32), scale_factor.astype(np.float32) + + +def get_color_map_list(num_classes): + """ + get_color_map_list func + """ + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= ((lab >> 0) & 1) << (7 - j) + color_map[i * 3 + 1] |= ((lab >> 1) & 1) << (7 - j) + color_map[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + +def draw_box(image_file, results, class_label, threshold=0.5): + """ + draw_box func + """ + srcimg = cv2.imread(image_file, 1) + for i in range(len(results)): + color_list = get_color_map_list(len(class_label)) + clsid2color = {} + classid, conf = int(results[i, 0]), results[i, 1] + if conf < threshold: + continue + xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int( + results[i, 4]), int(results[i, 5]) + + if classid not in clsid2color: + clsid2color[classid] = color_list[classid] + color = tuple(clsid2color[classid]) + + cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2) + print(class_label[classid] + ": " + str(round(conf, 3))) + cv2.putText( + srcimg, + class_label[classid] + ":" + str(round(conf, 3)), + (xmin, ymin - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (0, 255, 0), + thickness=2, ) + return srcimg + + +def load_predictor( + model_dir, + precision="fp32", + use_trt=False, + use_mkldnn=False, + batch_size=1, + device="CPU", + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + cpu_threads=1, ): + """set AnalysisConfig, generate AnalysisPredictor + Args: + model_dir (str): root path of __model__ and __params__ + precision (str): mode of running(fp32/fp16/int8) + use_trt (bool): whether use TensorRT or not. + use_mkldnn (bool): whether use MKLDNN or not in CPU. + device (str): Choose the device you want to run, it can be: CPU/GPU, default is CPU + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + Returns: + predictor (PaddlePredictor): AnalysisPredictor + Raises: + ValueError: predict by TensorRT need device == 'GPU'. + """ + rerun_flag = False + if device != "GPU" and use_trt: + raise ValueError( + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}". 
+ format(precision, device)) + config = Config( + os.path.join(model_dir, "model.pdmodel"), + os.path.join(model_dir, "model.pdiparams")) + if device == "GPU": + # initial GPU memory(M), device ID + config.enable_use_gpu(200, 0) + # optimize graph and fuse op + config.switch_ir_optim(True) + else: + config.disable_gpu() + config.set_cpu_math_library_num_threads(cpu_threads) + config.switch_ir_optim() + if use_mkldnn: + config.enable_mkldnn() + if precision == "int8": + config.enable_mkldnn_int8( + {"conv2d", "depthwise_conv2d", "transpose2", "pool2d"}) + + precision_map = { + "int8": Config.Precision.Int8, + "fp32": Config.Precision.Float32, + "fp16": Config.Precision.Half, + } + if precision in precision_map.keys() and use_trt: + config.enable_tensorrt_engine( + workspace_size=(1 << 25) * batch_size, + max_batch_size=batch_size, + min_subgraph_size=min_subgraph_size, + precision_mode=precision_map[precision], + use_static=True, + use_calib_mode=False, ) + + if use_dynamic_shape: + dynamic_shape_file = os.path.join(FLAGS.model_path, + "dynamic_shape.txt") + if os.path.exists(dynamic_shape_file): + config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file, + True) + print("trt set dynamic shape done!") + else: + config.collect_shape_range_info(dynamic_shape_file) + print("Start collect dynamic shape...") + rerun_flag = True + + # enable shared memory + config.enable_memory_optim() + predictor = create_predictor(config) + return predictor, rerun_flag + + +def get_current_memory_mb(): + """ + It is used to Obtain the memory usage of the CPU and GPU during the running of the program. + And this function Current program is time-consuming. + """ + try: + pkg.require('pynvml') + except: + from pip._internal import main + main(['install', 'pynvml']) + try: + pkg.require('psutil') + except: + from pip._internal import main + main(['install', 'psutil']) + try: + pkg.require('GPUtil') + except: + from pip._internal import main + main(['install', 'GPUtil']) + import pynvml + import psutil + import GPUtil + + gpu_id = int(os.environ.get("CUDA_VISIBLE_DEVICES", 0)) + + pid = os.getpid() + p = psutil.Process(pid) + info = p.memory_full_info() + cpu_mem = info.uss / 1024.0 / 1024.0 + gpu_mem = 0 + gpu_percent = 0 + gpus = GPUtil.getGPUs() + if gpu_id is not None and len(gpus) > 0: + gpu_percent = gpus[gpu_id].load + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) + gpu_mem = meminfo.used / 1024.0 / 1024.0 + return round(cpu_mem, 4), round(gpu_mem, 4) + + +def predict_image(predictor, + image_file, + image_shape=[640, 640], + warmup=1, + repeats=1, + threshold=0.5): + """ + predict image main func + """ + img, scale_factor = image_preprocess(image_file, image_shape) + inputs = {} + inputs["image"] = img + if FLAGS.include_nms: + inputs['scale_factor'] = scale_factor + input_names = predictor.get_input_names() + for i, _ in enumerate(input_names): + input_tensor = predictor.get_input_handle(input_names[i]) + input_tensor.copy_from_cpu(inputs[input_names[i]]) + + for i in range(warmup): + predictor.run() + + np_boxes, np_boxes_num = None, None + cpu_mems, gpu_mems = 0, 0 + predict_time = 0.0 + time_min = float("inf") + time_max = float("-inf") + for i in range(repeats): + start_time = time.time() + predictor.run() + output_names = predictor.get_output_names() + boxes_tensor = predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if FLAGS.include_nms: + boxes_num = 
predictor.get_output_handle(output_names[1]) + np_boxes_num = boxes_num.copy_to_cpu() + end_time = time.time() + timed = end_time - start_time + time_min = min(time_min, timed) + time_max = max(time_max, timed) + predict_time += timed + cpu_mem, gpu_mem = get_current_memory_mb() + cpu_mems += cpu_mem + gpu_mems += gpu_mem + + time_avg = predict_time / repeats + print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format( + cpu_mems / repeats, gpu_mems / repeats)) + print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( + round(time_min * 1000, 2), + round(time_max * 1000, 1), round(time_avg * 1000, 1))) + if not FLAGS.include_nms: + postprocess = PPYOLOEPostProcess(score_threshold=0.3, nms_threshold=0.6) + res = postprocess(np_boxes, scale_factor) + else: + res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} + res_img = draw_box( + image_file, res["bbox"], CLASS_LABEL, threshold=threshold) + cv2.imwrite("result.jpg", res_img) + + +def eval(predictor, val_loader, metric, rerun_flag=False): + """ + eval main func + """ + cpu_mems, gpu_mems = 0, 0 + predict_time = 0.0 + time_min = float("inf") + time_max = float("-inf") + sample_nums = len(val_loader) + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + boxes_tensor = predictor.get_output_handle(output_names[0]) + if FLAGS.include_nms: + boxes_num = predictor.get_output_handle(output_names[1]) + for batch_id, data in enumerate(val_loader): + data_all = {k: np.array(v) for k, v in data.items()} + for i, _ in enumerate(input_names): + input_tensor = predictor.get_input_handle(input_names[i]) + input_tensor.copy_from_cpu(data_all[input_names[i]]) + start_time = time.time() + predictor.run() + np_boxes = boxes_tensor.copy_to_cpu() + if FLAGS.include_nms: + np_boxes_num = boxes_num.copy_to_cpu() + if rerun_flag: + return + end_time = time.time() + timed = end_time - start_time + time_min = min(time_min, timed) + time_max = max(time_max, timed) + predict_time += timed + cpu_mem, gpu_mem = get_current_memory_mb() + cpu_mems += cpu_mem + gpu_mems += gpu_mem + if not FLAGS.include_nms: + postprocess = PPYOLOEPostProcess( + score_threshold=0.3, nms_threshold=0.6) + res = postprocess(np_boxes, data_all['scale_factor']) + else: + res = {'bbox': np_boxes, 'bbox_num': np_boxes_num} + metric.update(data_all, res) + if batch_id % 100 == 0: + print("Eval iter:", batch_id) + sys.stdout.flush() + metric.accumulate() + metric.log() + map_res = metric.get_results() + metric.reset() + time_avg = predict_time / sample_nums + print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format( + cpu_mems / sample_nums, gpu_mems / sample_nums)) + print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format( + round(time_min * 1000, 2), + round(time_max * 1000, 1), round(time_avg * 1000, 1))) + print("[Benchmark] COCO mAP: {}".format(map_res["bbox"][0])) + sys.stdout.flush() + + +def main(): + """ + main func + """ + predictor, rerun_flag = load_predictor( + FLAGS.model_path, + device=FLAGS.device, + use_trt=FLAGS.use_trt, + use_mkldnn=FLAGS.use_mkldnn, + precision=FLAGS.precision, + use_dynamic_shape=FLAGS.use_dynamic_shape, + cpu_threads=FLAGS.cpu_threads) + + if FLAGS.image_file: + warmup, repeats = 1, 1 + if FLAGS.benchmark: + warmup, repeats = 50, 100 + predict_image( + predictor, + FLAGS.image_file, + image_shape=[FLAGS.img_shape, FLAGS.img_shape], + warmup=warmup, + repeats=repeats) + else: + reader_cfg = load_config(FLAGS.reader_config) + + dataset = reader_cfg["EvalDataset"] + global val_loader + 
val_loader = create("EvalReader")(reader_cfg["EvalDataset"], + reader_cfg["worker_num"], + return_list=True) + clsid2catid = {v: k for k, v in dataset.catid2clsid.items()} + anno_file = dataset.get_anno() + metric = COCOMetric( + anno_file=anno_file, clsid2catid=clsid2catid, IouType="bbox") + eval(predictor, val_loader, metric, rerun_flag=rerun_flag) + + if rerun_flag: + print( + "***** Collect dynamic shape done, Please rerun the program to get correct results. *****" + ) + + +if __name__ == "__main__": + paddle.enable_static() + parser = argsparser() + FLAGS = parser.parse_args() + + # DataLoader need run on cpu + paddle.set_device("cpu") + + main() diff --git a/deploy/auto_compression/post_process.py b/deploy/auto_compression/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..eea2f019548ec288a23e37b3bd2faf24f9a98935 --- /dev/null +++ b/deploy/auto_compression/post_process.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cv2 + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims( + current_box, axis=0), ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + Returns: + area (N): return the area. 
+ """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +class PPYOLOEPostProcess(object): + """ + Args: + input_shape (int): network input image size + scale_factor (float): scale factor of ori image + """ + + def __init__(self, + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=10000, + keep_top_k=300): + self.score_threshold = score_threshold + self.nms_threshold = nms_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + + def _non_max_suppression(self, prediction, scale_factor): + batch_size = prediction.shape[0] + out_boxes_list = [] + box_num_list = [] + for batch_id in range(batch_size): + bboxes, confidences = prediction[batch_id][..., :4], prediction[ + batch_id][..., 4:] + # nms + picked_box_probs = [] + picked_labels = [] + for class_index in range(0, confidences.shape[1]): + probs = confidences[:, class_index] + mask = probs > self.score_threshold + probs = probs[mask] + if probs.shape[0] == 0: + continue + subset_boxes = bboxes[mask, :] + box_probs = np.concatenate( + [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = hard_nms( + box_probs, + iou_threshold=self.nms_threshold, + top_k=self.nms_top_k) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.shape[0]) + + if len(picked_box_probs) == 0: + out_boxes_list.append(np.empty((0, 4))) + + else: + picked_box_probs = np.concatenate(picked_box_probs) + # resize output boxes + picked_box_probs[:, 0] /= scale_factor[batch_id][1] + picked_box_probs[:, 2] /= scale_factor[batch_id][1] + picked_box_probs[:, 1] /= scale_factor[batch_id][0] + picked_box_probs[:, 3] /= scale_factor[batch_id][0] + + # clas score box + out_box = np.concatenate( + [ + np.expand_dims( + np.array(picked_labels), axis=-1), np.expand_dims( + picked_box_probs[:, 4], axis=-1), + picked_box_probs[:, :4] + ], + axis=1) + if out_box.shape[0] > self.keep_top_k: + out_box = out_box[out_box[:, 1].argsort()[::-1] + [:self.keep_top_k]] + out_boxes_list.append(out_box) + box_num_list.append(out_box.shape[0]) + + out_boxes_list = np.concatenate(out_boxes_list, axis=0) + box_num_list = np.array(box_num_list) + return out_boxes_list, box_num_list + + def __call__(self, outs, scale_factor): + out_boxes_list, box_num_list = self._non_max_suppression(outs, + scale_factor) + return {'bbox': out_boxes_list, 'bbox_num': box_num_list} diff --git a/deploy/auto_compression/run.py b/deploy/auto_compression/run.py new file mode 100644 index 0000000000000000000000000000000000000000..d940307db618c80f015b32637e7610784d1affb9 --- /dev/null +++ b/deploy/auto_compression/run.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import numpy as np +import argparse +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.core.workspace import create +from ppdet.metrics import COCOMetric, VOCMetric, KeyPointTopDownCOCOEval +from paddleslim.auto_compression.config_helpers import load_config as load_slim_config +from paddleslim.auto_compression import AutoCompression +from post_process import PPYOLOEPostProcess +from paddleslim.common.dataloader import get_feed_vars + + +def argsparser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--config_path', + type=str, + default=None, + help="path of compression strategy config.", + required=True) + parser.add_argument( + '--save_dir', + type=str, + default='output', + help="directory to save compressed model.") + parser.add_argument( + '--devices', + type=str, + default='gpu', + help="which device used to compress.") + + return parser + + +def reader_wrapper(reader, input_list): + def gen(): + for data in reader: + in_dict = {} + if isinstance(input_list, list): + for input_name in input_list: + in_dict[input_name] = data[input_name] + elif isinstance(input_list, dict): + for input_name in input_list.keys(): + in_dict[input_list[input_name]] = data[input_name] + yield in_dict + + return gen + + +def convert_numpy_data(data, metric): + data_all = {} + data_all = {k: np.array(v) for k, v in data.items()} + if isinstance(metric, VOCMetric): + for k, v in data_all.items(): + if not isinstance(v[0], np.ndarray): + tmp_list = [] + for t in v: + tmp_list.append(np.array(t)) + data_all[k] = np.array(tmp_list) + else: + data_all = {k: np.array(v) for k, v in data.items()} + return data_all + + +def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): + metric = global_config['metric'] + for batch_id, data in enumerate(val_loader): + data_all = convert_numpy_data(data, metric) + data_input = {} + for k, v in data.items(): + if isinstance(global_config['input_list'], list): + if k in test_feed_names: + data_input[k] = np.array(v) + elif isinstance(global_config['input_list'], dict): + if k in global_config['input_list'].keys(): + data_input[global_config['input_list'][k]] = np.array(v) + outs = exe.run(compiled_test_program, + feed=data_input, + fetch_list=test_fetch_list, + return_numpy=False) + res = {} + if 'include_nms' in global_config and not global_config['include_nms']: + if 'arch' in global_config and global_config['arch'] == 'PPYOLOE': + postprocess = PPYOLOEPostProcess( + score_threshold=0.01, nms_threshold=0.6) + else: + assert "Not support arch={} now.".format(global_config['arch']) + res = postprocess(np.array(outs[0]), data_all['scale_factor']) + else: + for out in outs: + v = np.array(out) + if len(v.shape) > 1: + res['bbox'] = v + else: + res['bbox_num'] = v + + metric.update(data_all, res) + if batch_id % 100 == 0: + print('Eval iter:', batch_id) + metric.accumulate() + metric.log() + map_res = metric.get_results() + metric.reset() + map_key = 'keypoint' if 'arch' in global_config and global_config[ + 'arch'] == 'keypoint' else 'bbox' + return map_res[map_key][0] + + +def main(): + global global_config + all_config = load_slim_config(FLAGS.config_path) + assert "Global" in all_config, "Key 'Global' not found in config file." 
+ global_config = all_config["Global"] + reader_cfg = load_config(global_config['reader_config']) + + train_loader = create('EvalReader')(reader_cfg['TrainDataset'], + reader_cfg['worker_num'], + return_list=True) + if global_config.get('input_list') is None: + global_config['input_list'] = get_feed_vars( + global_config['model_dir'], global_config['model_filename'], + global_config['params_filename']) + train_loader = reader_wrapper(train_loader, global_config['input_list']) + + if 'Evaluation' in global_config.keys() and global_config[ + 'Evaluation'] and paddle.distributed.get_rank() == 0: + eval_func = eval_function + dataset = reader_cfg['EvalDataset'] + global val_loader + _eval_batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=reader_cfg['EvalReader']['batch_size']) + val_loader = create('EvalReader')(dataset, + reader_cfg['worker_num'], + batch_sampler=_eval_batch_sampler, + return_list=True) + metric = None + if reader_cfg['metric'] == 'COCO': + clsid2catid = {v: k for k, v in dataset.catid2clsid.items()} + anno_file = dataset.get_anno() + metric = COCOMetric( + anno_file=anno_file, clsid2catid=clsid2catid, IouType='bbox') + elif reader_cfg['metric'] == 'VOC': + metric = VOCMetric( + label_list=dataset.get_label_list(), + class_num=reader_cfg['num_classes'], + map_type=reader_cfg['map_type']) + elif reader_cfg['metric'] == 'KeyPointTopDownCOCOEval': + anno_file = dataset.get_anno() + metric = KeyPointTopDownCOCOEval(anno_file, + len(dataset), 17, 'output_eval') + else: + raise ValueError("metric currently only supports COCO and VOC.") + global_config['metric'] = metric + else: + eval_func = None + + ac = AutoCompression( + model_dir=global_config["model_dir"], + model_filename=global_config["model_filename"], + params_filename=global_config["params_filename"], + save_dir=FLAGS.save_dir, + config=all_config, + train_dataloader=train_loader, + eval_callback=eval_func) + ac.compress() + + +if __name__ == '__main__': + paddle.enable_static() + parser = argsparser() + FLAGS = parser.parse_args() + assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu'] + paddle.set_device(FLAGS.devices) + + main() diff --git a/deploy/benchmark/benchmark.sh b/deploy/benchmark/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..e29aaa884d30316237aede0c18b38e2cc520ee4b --- /dev/null +++ b/deploy/benchmark/benchmark.sh @@ -0,0 +1,36 @@ +# All rights `PaddleDetection` reserved +#!/bin/bash +model_dir=$1 +model_name=$2 + +export img_dir="demo" +export log_path="output_pipeline" + + +echo "model_dir : ${model_dir}" +echo "img_dir: ${img_dir}" + +# TODO: support batch size>1 +for use_mkldnn in "True" "False"; do + for threads in "1" "6"; do + echo "${model_name} ${model_dir}, use_mkldnn: ${use_mkldnn} threads: ${threads}" + python deploy/python/infer.py \ + --model_dir=${model_dir} \ + --run_benchmark True \ + --enable_mkldnn=${use_mkldnn} \ + --device=CPU \ + --cpu_threads=${threads} \ + --image_dir=${img_dir} 2>&1 | tee ${log_path}/${model_name}_cpu_usemkldnn_${use_mkldnn}_cputhreads_${threads}_bs1_infer.log + done +done + +for run_mode in "fluid" "trt_fp32" "trt_fp16"; do + echo "${model_name} ${model_dir}, run_mode: ${run_mode}" + python deploy/python/infer.py \ + --model_dir=${model_dir} \ + --run_benchmark=True \ + --device=GPU \ + --run_mode=${run_mode} \ + --image_dir=${img_dir} 2>&1 | tee ${log_path}/${model_name}_gpu_runmode_${run_mode}_bs1_infer.log +done + diff --git a/deploy/benchmark/benchmark_quant.sh b/deploy/benchmark/benchmark_quant.sh new file mode 
100644 index 0000000000000000000000000000000000000000..a21541dd044bf9bd4a33bb4eb2116b47743e5a8a --- /dev/null +++ b/deploy/benchmark/benchmark_quant.sh @@ -0,0 +1,23 @@ +# All rights `PaddleDetection` reserved +#!/bin/bash +model_dir=$1 +model_name=$2 + +export img_dir="demo" +export log_path="output_pipeline" + + +echo "model_dir : ${model_dir}" +echo "img_dir: ${img_dir}" + +# TODO: support batch size>1 +for run_mode in "trt_int8"; do + echo "${model_name} ${model_dir}, run_mode: ${run_mode}" + python deploy/python/infer.py \ + --model_dir=${model_dir} \ + --run_benchmark=True \ + --device=GPU \ + --run_mode=${run_mode} \ + --image_dir=${img_dir} 2>&1 | tee ${log_path}/${model_name}_gpu_runmode_${run_mode}_bs1_infer.log +done + diff --git a/deploy/benchmark/log_parser_excel.py b/deploy/benchmark/log_parser_excel.py new file mode 100644 index 0000000000000000000000000000000000000000..317b3759572c6acef3438fbc654bc5918e8bdd38 --- /dev/null +++ b/deploy/benchmark/log_parser_excel.py @@ -0,0 +1,300 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import argparse +import pandas as pd + + +def parse_args(): + """ + parse input args + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--log_path", + type=str, + default="./output_pipeline", + help="benchmark log path") + parser.add_argument( + "--output_name", + type=str, + default="benchmark_excel.xlsx", + help="output excel file name") + parser.add_argument( + "--analysis_trt", dest="analysis_trt", action='store_true') + parser.add_argument( + "--analysis_mkl", dest="analysis_mkl", action='store_true') + return parser.parse_args() + + +def find_all_logs(path_walk): + """ + find all .log files from target dir + """ + for root, ds, files in os.walk(path_walk): + for file_name in files: + if re.match(r'.*.log', file_name): + full_path = os.path.join(root, file_name) + yield file_name, full_path + + +def process_log(file_name): + """ + process log to dict + """ + output_dict = {} + with open(file_name, 'r') as f: + for i, data in enumerate(f.readlines()): + if i == 0: + continue + line_lists = data.split(" ") + + # conf info + if "runtime_device:" in line_lists: + pos_buf = line_lists.index("runtime_device:") + output_dict["runtime_device"] = line_lists[pos_buf + 1].strip() + if "ir_optim:" in line_lists: + pos_buf = line_lists.index("ir_optim:") + output_dict["ir_optim"] = line_lists[pos_buf + 1].strip() + if "enable_memory_optim:" in line_lists: + pos_buf = line_lists.index("enable_memory_optim:") + output_dict["enable_memory_optim"] = line_lists[pos_buf + + 1].strip() + if "enable_tensorrt:" in line_lists: + pos_buf = line_lists.index("enable_tensorrt:") + output_dict["enable_tensorrt"] = line_lists[pos_buf + 1].strip() + if "precision:" in line_lists: + pos_buf = line_lists.index("precision:") + output_dict["precision"] = line_lists[pos_buf + 1].strip() + if "enable_mkldnn:" in line_lists: + pos_buf = line_lists.index("enable_mkldnn:") + 
output_dict["enable_mkldnn"] = line_lists[pos_buf + 1].strip() + if "cpu_math_library_num_threads:" in line_lists: + pos_buf = line_lists.index("cpu_math_library_num_threads:") + output_dict["cpu_math_library_num_threads"] = line_lists[ + pos_buf + 1].strip() + + # model info + if "model_name:" in line_lists: + pos_buf = line_lists.index("model_name:") + output_dict["model_name"] = list( + filter(None, line_lists[pos_buf + 1].strip().split('/')))[ + -1] + + # data info + if "batch_size:" in line_lists: + pos_buf = line_lists.index("batch_size:") + output_dict["batch_size"] = line_lists[pos_buf + 1].strip() + if "input_shape:" in line_lists: + pos_buf = line_lists.index("input_shape:") + output_dict["input_shape"] = line_lists[pos_buf + 1].strip() + + # perf info + if "cpu_rss(MB):" in line_lists: + pos_buf = line_lists.index("cpu_rss(MB):") + output_dict["cpu_rss(MB)"] = line_lists[pos_buf + 1].strip( + ).split(',')[0] + if "gpu_rss(MB):" in line_lists: + pos_buf = line_lists.index("gpu_rss(MB):") + output_dict["gpu_rss(MB)"] = line_lists[pos_buf + 1].strip( + ).split(',')[0] + if "gpu_util:" in line_lists: + pos_buf = line_lists.index("gpu_util:") + output_dict["gpu_util"] = line_lists[pos_buf + 1].strip().split( + ',')[0] + if "preproce_time(ms):" in line_lists: + pos_buf = line_lists.index("preproce_time(ms):") + output_dict["preproce_time(ms)"] = line_lists[ + pos_buf + 1].strip().split(',')[0] + if "inference_time(ms):" in line_lists: + pos_buf = line_lists.index("inference_time(ms):") + output_dict["inference_time(ms)"] = line_lists[ + pos_buf + 1].strip().split(',')[0] + if "postprocess_time(ms):" in line_lists: + pos_buf = line_lists.index("postprocess_time(ms):") + output_dict["postprocess_time(ms)"] = line_lists[ + pos_buf + 1].strip().split(',')[0] + return output_dict + + +def filter_df_merge(cpu_df, filter_column=None): + """ + process cpu data frame, merge by 'model_name', 'batch_size' + Args: + cpu_df ([type]): [description] + """ + if not filter_column: + raise Exception( + "please assign filter_column for filter_df_merge function") + + df_lists = [] + filter_column_lists = [] + for k, v in cpu_df.groupby(filter_column, dropna=True): + filter_column_lists.append(k) + df_lists.append(v) + final_output_df = df_lists[-1] + + # merge same model + for i in range(len(df_lists) - 1): + left_suffix = cpu_df[filter_column].unique()[0] + right_suffix = df_lists[i][filter_column].unique()[0] + print(left_suffix, right_suffix) + if not pd.isnull(right_suffix): + final_output_df = pd.merge( + final_output_df, + df_lists[i], + how='left', + left_on=['model_name', 'batch_size'], + right_on=['model_name', 'batch_size'], + suffixes=('', '_{0}_{1}'.format(filter_column, right_suffix))) + + # rename default df columns + origin_column_names = list(cpu_df.columns.values) + origin_column_names.remove(filter_column) + suffix = final_output_df[filter_column].unique()[0] + for name in origin_column_names: + final_output_df.rename( + columns={name: "{0}_{1}_{2}".format(name, filter_column, suffix)}, + inplace=True) + final_output_df.rename( + columns={ + filter_column: "{0}_{1}_{2}".format(filter_column, filter_column, + suffix) + }, + inplace=True) + + final_output_df.sort_values( + by=[ + "model_name_{0}_{1}".format(filter_column, suffix), + "batch_size_{0}_{1}".format(filter_column, suffix) + ], + inplace=True) + return final_output_df + + +def trt_perf_analysis(raw_df): + """ + sperate raw dataframe to a list of dataframe + compare tensorrt percision performance + """ + # filter df by gpu, 
compare tensorrt and gpu + # define default dataframe for gpu performance analysis + gpu_df = raw_df.loc[raw_df['runtime_device'] == 'gpu'] + new_df = filter_df_merge(gpu_df, "precision") + + # calculate qps diff percentile + infer_fp32 = "inference_time(ms)_precision_fp32" + infer_fp16 = "inference_time(ms)_precision_fp16" + infer_int8 = "inference_time(ms)_precision_int8" + new_df["fp32_fp16_diff"] = new_df[[infer_fp32, infer_fp16]].apply( + lambda x: (float(x[infer_fp16]) - float(x[infer_fp32])) / float(x[infer_fp32]), + axis=1) + new_df["fp32_gpu_diff"] = new_df[["inference_time(ms)", infer_fp32]].apply( + lambda x: (float(x[infer_fp32]) - float(x[infer_fp32])) / float(x["inference_time(ms)"]), + axis=1) + new_df["fp16_int8_diff"] = new_df[[infer_fp16, infer_int8]].apply( + lambda x: (float(x[infer_int8]) - float(x[infer_fp16])) / float(x[infer_fp16]), + axis=1) + + return new_df + + +def mkl_perf_analysis(raw_df): + """ + sperate raw dataframe to a list of dataframe + compare mkldnn performance with not enable mkldnn + """ + # filter df by cpu, compare mkl and cpu + # define default dataframe for cpu mkldnn analysis + cpu_df = raw_df.loc[raw_df['runtime_device'] == 'cpu'] + mkl_compare_df = cpu_df.loc[cpu_df['cpu_math_library_num_threads'] == '1'] + thread_compare_df = cpu_df.loc[cpu_df['enable_mkldnn'] == 'True'] + + # define dataframe need to be analyzed + output_mkl_df = filter_df_merge(mkl_compare_df, 'enable_mkldnn') + output_thread_df = filter_df_merge(thread_compare_df, + 'cpu_math_library_num_threads') + + # calculate performance diff percentile + # compare mkl performance with cpu + enable_mkldnn = "inference_time(ms)_enable_mkldnn_True" + disable_mkldnn = "inference_time(ms)_enable_mkldnn_False" + output_mkl_df["mkl_infer_diff"] = output_mkl_df[[ + enable_mkldnn, disable_mkldnn + ]].apply( + lambda x: (float(x[enable_mkldnn]) - float(x[disable_mkldnn])) / float(x[disable_mkldnn]), + axis=1) + cpu_enable_mkldnn = "cpu_rss(MB)_enable_mkldnn_True" + cpu_disable_mkldnn = "cpu_rss(MB)_enable_mkldnn_False" + output_mkl_df["mkl_cpu_rss_diff"] = output_mkl_df[[ + cpu_enable_mkldnn, cpu_disable_mkldnn + ]].apply( + lambda x: (float(x[cpu_enable_mkldnn]) - float(x[cpu_disable_mkldnn])) / float(x[cpu_disable_mkldnn]), + axis=1) + + # compare cpu_multi_thread performance with cpu + num_threads_1 = "inference_time(ms)_cpu_math_library_num_threads_1" + num_threads_6 = "inference_time(ms)_cpu_math_library_num_threads_6" + output_thread_df["mkl_infer_diff"] = output_thread_df[[ + num_threads_6, num_threads_1 + ]].apply( + lambda x: (float(x[num_threads_6]) - float(x[num_threads_1])) / float(x[num_threads_1]), + axis=1) + cpu_num_threads_1 = "cpu_rss(MB)_cpu_math_library_num_threads_1" + cpu_num_threads_6 = "cpu_rss(MB)_cpu_math_library_num_threads_6" + output_thread_df["mkl_cpu_rss_diff"] = output_thread_df[[ + cpu_num_threads_6, cpu_num_threads_1 + ]].apply( + lambda x: (float(x[cpu_num_threads_6]) - float(x[cpu_num_threads_1])) / float(x[cpu_num_threads_1]), + axis=1) + + return output_mkl_df, output_thread_df + + +def main(): + """ + main + """ + args = parse_args() + # create empty DataFrame + origin_df = pd.DataFrame(columns=[ + "model_name", "batch_size", "input_shape", "runtime_device", "ir_optim", + "enable_memory_optim", "enable_tensorrt", "precision", "enable_mkldnn", + "cpu_math_library_num_threads", "preproce_time(ms)", + "inference_time(ms)", "postprocess_time(ms)", "cpu_rss(MB)", + "gpu_rss(MB)", "gpu_util" + ]) + + for file_name, full_path in find_all_logs(args.log_path): + 
dict_log = process_log(full_path) + origin_df = origin_df.append(dict_log, ignore_index=True) + + raw_df = origin_df.sort_values(by='model_name') + raw_df.sort_values(by=["model_name", "batch_size"], inplace=True) + raw_df.to_excel(args.output_name) + + if args.analysis_trt: + trt_df = trt_perf_analysis(raw_df) + trt_df.to_excel("trt_analysis_{}".format(args.output_name)) + + if args.analysis_mkl: + mkl_df, thread_df = mkl_perf_analysis(raw_df) + mkl_df.to_excel("mkl_enable_analysis_{}".format(args.output_name)) + thread_df.to_excel("mkl_threads_analysis_{}".format(args.output_name)) + + +if __name__ == "__main__": + main() diff --git a/deploy/cpp/CMakeLists.txt b/deploy/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..34f8808d53e085c43048c4955a5715d663e4291e --- /dev/null +++ b/deploy/cpp/CMakeLists.txt @@ -0,0 +1,264 @@ +cmake_minimum_required(VERSION 3.0) +project(PaddleObjectDetector CXX C) + +option(WITH_MKL "Compile demo with MKL/OpenBlas support,defaultuseMKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." ON) +option(WITH_TENSORRT "Compile demo with TensorRT." OFF) + +option(WITH_KEYPOINT "Whether to Compile KeyPoint detector" OFF) +option(WITH_MOT "Whether to Compile MOT detector" OFF) + +SET(PADDLE_DIR "" CACHE PATH "Location of libraries") +SET(PADDLE_LIB_NAME "" CACHE STRING "libpaddle_inference") +SET(OPENCV_DIR "" CACHE PATH "Location of libraries") +SET(CUDA_LIB "" CACHE PATH "Location of libraries") +SET(CUDNN_LIB "" CACHE PATH "Location of libraries") +SET(TENSORRT_INC_DIR "" CACHE PATH "Compile demo with TensorRT") +SET(TENSORRT_LIB_DIR "" CACHE PATH "Compile demo with TensorRT") + +include(cmake/yaml-cpp.cmake) + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/src/ext-yaml-cpp/include") +link_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/lib") + +if (WITH_KEYPOINT) + set(SRCS src/main_keypoint.cc src/preprocess_op.cc src/object_detector.cc src/picodet_postprocess.cc src/utils.cc src/keypoint_detector.cc src/keypoint_postprocess.cc) +elseif (WITH_MOT) + set(SRCS src/main_jde.cc src/preprocess_op.cc src/object_detector.cc src/jde_detector.cc src/tracker.cc src/trajectory.cc src/lapjv.cpp src/picodet_postprocess.cc src/utils.cc) +else () + set(SRCS src/main.cc src/preprocess_op.cc src/object_detector.cc src/picodet_postprocess.cc src/utils.cc) +endif() + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if (WITH_MKL) + ADD_DEFINITIONS(-DUSE_MKL) +endif() + +if (NOT DEFINED PADDLE_DIR OR ${PADDLE_DIR} STREQUAL "") + message(FATAL_ERROR "please set PADDLE_DIR with -DPADDLE_DIR=/path/paddle_influence_dir") +endif() +message("PADDLE_DIR IS:" ${PADDLE_DIR}) + +if (NOT DEFINED OPENCV_DIR OR ${OPENCV_DIR} STREQUAL "") + message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv") +endif() + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${PADDLE_DIR}/") +include_directories("${PADDLE_DIR}/third_party/install/protobuf/include") +include_directories("${PADDLE_DIR}/third_party/install/glog/include") +include_directories("${PADDLE_DIR}/third_party/install/gflags/include") 
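# NOTE: the include/link directories below follow the layout of the prebuilt
# paddle_inference package described in docs/linux_build.md and
# docs/Jetson_build.md (paddle/, third_party/install/<dep>/{include,lib},
# version.txt). PADDLE_DIR must point at the root of that unpacked package,
# e.g. -DPADDLE_DIR=/root/projects/paddle_inference (path shown for
# illustration only).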
+include_directories("${PADDLE_DIR}/third_party/install/xxhash/include") +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/include") + include_directories("${PADDLE_DIR}/third_party/install/snappy/include") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/include") + include_directories("${PADDLE_DIR}/third_party/install/snappystream/include") +endif() +include_directories("${PADDLE_DIR}/third_party/boost") +include_directories("${PADDLE_DIR}/third_party/eigen3") + +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappy/lib") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappystream/lib") +endif() + +link_directories("${PADDLE_DIR}/third_party/install/protobuf/lib") +link_directories("${PADDLE_DIR}/third_party/install/glog/lib") +link_directories("${PADDLE_DIR}/third_party/install/gflags/lib") +link_directories("${PADDLE_DIR}/third_party/install/xxhash/lib") +link_directories("${PADDLE_DIR}/third_party/install/paddle2onnx/lib") +link_directories("${PADDLE_DIR}/third_party/install/onnxruntime/lib") +link_directories("${PADDLE_DIR}/paddle/lib/") +link_directories("${CMAKE_CURRENT_BINARY_DIR}") + + + +if (WIN32) + include_directories("${PADDLE_DIR}/paddle/fluid/inference") + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/fluid/inference") + find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/build/ NO_DEFAULT_PATH) + +else () + find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/share/OpenCV NO_DEFAULT_PATH) + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/lib") +endif () +include_directories(${OpenCV_INCLUDE_DIRS}) + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -o2 -fopenmp -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") +endif() + +# TODO let users define cuda lib path +if (WITH_GPU) + if (NOT DEFINED CUDA_LIB OR ${CUDA_LIB} STREQUAL "") + message(FATAL_ERROR "please set CUDA_LIB with -DCUDA_LIB=/path/cuda-8.0/lib64") + endif() + if (NOT WIN32) + if (NOT DEFINED CUDNN_LIB) + message(FATAL_ERROR "please set CUDNN_LIB with -DCUDNN_LIB=/path/cudnn_v7.4/cuda/lib64") + endif() + endif(NOT WIN32) +endif() + + +if (NOT WIN32) + if (WITH_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INC_DIR}/") + link_directories("${TENSORRT_LIB_DIR}/") + endif() +endif(NOT WIN32) + +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_DIR}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_MKL) + include_directories("${PADDLE_DIR}/third_party/install/mklml/include") + if (WIN32) + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.lib + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.lib) + else () + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + 
${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + execute_process(COMMAND cp -r ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} /usr/lib) + endif () + set(MKLDNN_PATH "${PADDLE_DIR}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if (WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else () + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif () + endif() +else() + set(MATH_LIB ${PADDLE_DIR}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + + +if (WIN32) + if(EXISTS "${PADDLE_DIR}/paddle/fluid/inference/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(DEPS + ${PADDLE_DIR}/paddle/fluid/inference/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + + +if (WIN32) + set(DEPS ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +message("PADDLE_LIB_NAME:" ${PADDLE_LIB_NAME}) +message("DEPS:" $DEPS) + +if (NOT WIN32) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash yaml-cpp + ) + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash libyaml-cppmt) + set(DEPS ${DEPS} libcmt shlwapi) + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (WITH_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-ldl -lrt -lgomp -lz -lm -lpthread") + set(DEPS ${DEPS} ${EXTERNAL_LIB}) +endif() + +set(DEPS ${DEPS} ${OpenCV_LIBS}) +add_executable(main ${SRCS}) +ADD_DEPENDENCIES(main ext-yaml-cpp) +message("DEPS:" $DEPS) +target_link_libraries(main ${DEPS}) + +if (WIN32 AND WITH_MKL) + add_custom_command(TARGET main POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different 
${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}.dll ./release/${PADDLE_LIB_NAME}.dll + ) +endif() + +if (WIN32 AND NOT WITH_MKL) + add_custom_command(TARGET main POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/openblas/lib/openblas.dll ./openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/openblas/lib/openblas.dll ./release/openblas.dll + ) +endif() + +if (WIN32) + add_custom_command(TARGET main POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/onnxruntime/lib/onnxruntime.dll ./onnxruntime.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/paddle2onnx/lib/paddle2onnx.dll ./paddle2onnx.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/onnxruntime/lib/onnxruntime.dll ./release/onnxruntime.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/paddle2onnx/lib/paddle2onnx.dll ./release/paddle2onnx.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}.dll ./release/${PADDLE_LIB_NAME}.dll + ) +endif() diff --git a/deploy/cpp/README.md b/deploy/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ffa5e251e7913b4af30fa6abe9912c9434af996f --- /dev/null +++ b/deploy/cpp/README.md @@ -0,0 +1,54 @@ +# C++端预测部署 + + + +## 各环境编译部署教程 +- [Linux 编译部署](docs/linux_build.md) +- [Windows编译部署(使用Visual Studio 2019)](docs/windows_vs2019_build.md) +- [NV Jetson编译部署](docs/Jetson_build.md) + + +## C++部署总览 +[1.说明](#1说明) + +[2.主要目录和文件](#2主要目录和文件) + + +### 1.说明 + +本目录为用户提供一个跨平台的`C++`部署方案,让用户通过`PaddleDetection`训练的模型导出后,即可基于本项目快速运行,也可以快速集成代码结合到自己的项目实际应用中去。 + +主要设计的目标包括以下四点: +- 跨平台,支持在 `Windows` 和 `Linux` 完成编译、二次开发集成和部署运行 +- 可扩展性,支持用户针对新模型开发自己特殊的数据预处理等逻辑 +- 高性能,除了`PaddlePaddle`自身带来的性能优势,我们还针对图像检测的特点对关键步骤进行了性能优化 +- 支持各种不同检测模型结构,包括`Yolov3`/`Faster_RCNN`/`SSD`等 + +### 2.主要目录和文件 + +```bash +deploy/cpp +| +├── src +│ ├── main.cc # 集成代码示例, 程序入口 +│ ├── object_detector.cc # 模型加载和预测主要逻辑封装类实现 +│ └── preprocess_op.cc # 预处理相关主要逻辑封装实现 +| +├── include +│ ├── config_parser.h # 导出模型配置yaml文件解析 +│ ├── object_detector.h # 模型加载和预测主要逻辑封装类 +│ └── preprocess_op.h # 预处理相关主要逻辑类封装 +| +├── docs +│ ├── linux_build.md # Linux 编译指南 +│ └── windows_vs2019_build.md # Windows VS2019编译指南 +│ +├── build.sh # 编译命令脚本 +│ +├── CMakeList.txt # cmake编译入口文件 +| +├── CMakeSettings.json # Visual Studio 2019 CMake项目编译设置 +│ +└── cmake # 依赖的外部项目cmake(目前仅有yaml-cpp) + +``` diff --git a/deploy/cpp/cmake/yaml-cpp.cmake b/deploy/cpp/cmake/yaml-cpp.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7bc7f34d476d69d57336940bcf6c8c55311b8112 --- /dev/null +++ b/deploy/cpp/cmake/yaml-cpp.cmake @@ -0,0 +1,30 @@ + +find_package(Git REQUIRED) + +include(ExternalProject) + +message("${CMAKE_BUILD_TYPE}") + +ExternalProject_Add( + ext-yaml-cpp + URL https://bj.bcebos.com/paddlex/deploy/deps/yaml-cpp.zip + URL_MD5 9542d6de397d1fbd649ed468cb5850e6 + CMAKE_ARGS + -DYAML_CPP_BUILD_TESTS=OFF + -DYAML_CPP_BUILD_TOOLS=OFF + -DYAML_CPP_INSTALL=OFF + -DYAML_CPP_BUILD_CONTRIB=OFF + -DMSVC_SHARED_RT=OFF + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + 
-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + PREFIX "${CMAKE_BINARY_DIR}/ext/yaml-cpp" + # Disable install step + INSTALL_COMMAND "" + LOG_DOWNLOAD ON + LOG_BUILD 1 +) diff --git a/deploy/cpp/docs/Jetson_build.md b/deploy/cpp/docs/Jetson_build.md new file mode 100644 index 0000000000000000000000000000000000000000..ea9699a438ed3977e118b155a01b533d83bb12f4 --- /dev/null +++ b/deploy/cpp/docs/Jetson_build.md @@ -0,0 +1,210 @@ +# Jetson平台编译指南 + +## 说明 +`NVIDIA Jetson`设备是具有`NVIDIA GPU`的嵌入式设备,可以将目标检测算法部署到该设备上。本文档是在`Jetson`硬件上部署`PaddleDetection`模型的教程。 + +本文档以`Jetson TX2`硬件、`JetPack 4.3`版本为例进行说明。 + +`Jetson`平台的开发指南请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html). + +## Jetson环境搭建 +`Jetson`系统软件安装,请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html). + +* (1) 查看硬件系统的l4t的版本号 +``` +cat /etc/nv_tegra_release +``` +* (2) 根据硬件,选择硬件可安装的`JetPack`版本,硬件和`JetPack`版本对应关系请参考[jetpack-archive](https://developer.nvidia.com/embedded/jetpack-archive). + +* (3) 下载`JetPack`,请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html) 中的`Preparing a Jetson Developer Kit for Use`章节内容进行刷写系统镜像。 + +**注意**: 请在[jetpack-archive](https://developer.nvidia.com/embedded/jetpack-archive) 根据硬件选择适配的`JetPack`版本进行刷机。 + +## 下载或编译`Paddle`预测库 +本文档使用`Paddle`在`JetPack4.3`上预先编译好的预测库,请根据硬件在[安装与编译 Linux 预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 中选择对应版本的`Paddle`预测库。 + +这里选择[nv_jetson_cuda10_cudnn7.6_trt6(jetpack4.3)](https://paddle-inference-lib.bj.bcebos.com/2.0.0-nv-jetson-jetpack4.3-all/paddle_inference.tgz), `Paddle`版本`2.0.0-rc0`,`CUDA`版本`10.0`,`CUDNN`版本`7.6`,`TensorRT`版本`6`。 + +若需要自己在`Jetson`平台上自定义编译`Paddle`库,请参考文档[安装与编译 Linux 预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html) 的`NVIDIA Jetson嵌入式硬件预测库源码编译`部分内容。 + +### Step1: 下载代码 + + `git clone https://github.com/PaddlePaddle/PaddleDetection.git` + +**说明**:其中`C++`预测代码在`/root/projects/PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 paddle_inference + +解压下载的[nv_jetson_cuda10_cudnn7.6_trt6(jetpack4.3)](https://paddle-inference-lib.bj.bcebos.com/2.0.1-nv-jetson-jetpack4.3-all/paddle_inference.tgz) 。 + +下载并解压后`/root/projects/paddle_inference`目录包含内容为: +``` +paddle_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +**注意:** 预编译库`nv-jetson-cuda10-cudnn7.6-trt6`使用的`GCC`版本是`7.5.0`,其他都是使用`GCC 4.8.5`编译的。使用高版本的GCC可能存在`ABI`兼容性问题,建议降级或[自行编译预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。 + + +### Step4: 编译 + +编译`cmake`的命令在`scripts/build.sh`中,请根据实际情况修改主要参数,其主要内容说明如下: + +注意,`TX2`平台的`CUDA`、`CUDNN`需要通过`JetPack`安装。 + +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=ON + +# 是否使用MKL or openblas,TX2需要设置为OFF +WITH_MKL=OFF + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=ON + +# TensorRT 的include路径 +TENSORRT_INC_DIR=/usr/include/aarch64-linux-gnu + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/usr/lib/aarch64-linux-gnu + +# Paddle 预测库路径 +PADDLE_DIR=/path/to/paddle_inference/ + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# Paddle 的预测库是否使用静态库来编译 +# 使用TensorRT时,Paddle的预测库通常为动态库 +WITH_STATIC_LIB=OFF + +# CUDA 的 lib 路径 +CUDA_LIB=/usr/local/cuda-10.0/lib64 + +# 
CUDNN 的 lib 路径 +CUDNN_LIB=/usr/lib/aarch64-linux-gnu + +# 是否开启关键点模型预测功能 +WITH_KEYPOINT=ON + +# OPENCV_DIR 的路径 +# linux平台请下载:https://bj.bcebos.com/paddleseg/deploy/opencv3.4.6gcc4.8ffmpeg.tar.gz2,并解压到deps文件夹下 +# TX2平台请下载:https://paddlemodels.bj.bcebos.com/TX2_JetPack4.3_opencv_3.4.10_gcc7.5.0.zip,并解压到deps文件夹下 +OPENCV_DIR=/path/to/opencv + +# 请检查以上各个路径是否正确 + +# 以下无需改动 +cmake .. \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_MKL=OFF \ + -DWITH_TENSORRT=${WITH_TENSORRT} \ + -DTENSORRT_DIR=${TENSORRT_DIR} \ + -DPADDLE_DIR=${PADDLE_DIR} \ + -DWITH_STATIC_LIB=${WITH_STATIC_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DPADDLE_LIB_NAME={PADDLE_LIB_NAME} \ + -DWITH_KEYPOINT=${WITH_KEYPOINT} +make +``` + +例如设置如下: +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=ON + +# 是否使用MKL or openblas +WITH_MKL=OFF + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=OFF + +# TensorRT 的include路径 +TENSORRT_INC_DIR=/usr/include/aarch64-linux-gnu + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/usr/lib/aarch64-linux-gnu + +# Paddle 预测库路径 +PADDLE_DIR=/home/nvidia/PaddleDetection_infer/paddle_inference/ + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# Paddle 的预测库是否使用静态库来编译 +# 使用TensorRT时,Paddle的预测库通常为动态库 +WITH_STATIC_LIB=OFF + +# CUDA 的 lib 路径 +CUDA_LIB=/usr/local/cuda-10.0/lib64 + +# CUDNN 的 lib 路径 +CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ + +# 是否开启关键点模型预测功能 +WITH_KEYPOINT=ON +``` + +修改脚本设置好主要参数后,执行`build`脚本: + ```shell + sh ./scripts/build.sh + ``` + +### Step5: 预测及可视化 +编译成功后,预测入口程序为`build/main`其主要命令参数说明如下: +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的检测预测模型所在路径 | +| --model_dir_keypoint | Option | 导出的关键点预测模型所在路径 | +| --image_file | 要预测的图片文件路径 | +| --image_dir | 要预测的图片文件夹路径 | +| --video_file | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --device | 运行时的设备,可选择`CPU/GPU/XPU`,默认为`CPU`| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为paddle, 可选(paddle/trt_fp32/trt_fp16/trt_int8)| +| --batch_size | 检测模型预测时的batch size,在指定`image_dir`时有效 | +| --batch_size_keypoint | 关键点模型预测时的batch size,默认为8 | +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | +| --use_mkldnn | CPU预测中是否开启MKLDNN加速 | +| --cpu_threads | 设置cpu线程数,默认为1 | +| --use_dark | 关键点模型输出预测是否使用DarkPose后处理,默认为true | + +**注意**: +- 优先级顺序:`camera_id` > `video_file` > `image_dir` > `image_file`。 +- --run_benchmark如果设置为True,则需要安装依赖`pip install pynvml psutil GPUtil`。 + + +`样例一`: +```shell +#不使用`GPU`测试图片 `/root/projects/images/test.jpeg` +./main --model_dir=/root/projects/models/yolov3_darknet --image_file=/root/projects/images/test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用 `GPU`预测视频`/root/projects/videos/test.mp4` +./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --device=GPU +``` +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + +`样例三`: +```shell +#使用关键点模型与检测模型联合预测,使用 `GPU`预测 +#检测模型检测到的人送入关键点模型进行关键点预测 +./main --model_dir=/root/projects/models/yolov3_darknet --model_dir_keypoint=/root/projects/models/hrnet_w32_256x192 --image_file=/root/projects/images/test.jpeg --device=GPU +``` + +## 性能测试 +benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/docs/linux_build.md b/deploy/cpp/docs/linux_build.md new file mode 100644 index 0000000000000000000000000000000000000000..ee28e73ee56db3ec46a1674a6af0cb3af1012b3e --- /dev/null +++ b/deploy/cpp/docs/linux_build.md @@ -0,0 +1,149 @@ +# Linux平台编译指南 + +## 说明 +本文档在 `Linux`平台使用`GCC 
8.2`测试过,如果需要使用其他G++版本编译使用,则需要重新编译Paddle预测库,请参考: [从源码编译Paddle预测库](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html)。本文档使用的预置的opencv库是在ubuntu 16.04上用gcc8.2编译的,如果需要在gcc8.2以外的环境编译,那么需自行编译opencv库。 + +## 前置条件 +* G++ 8.2 +* CUDA 9.0 / CUDA 10.1, cudnn 7+ (仅在使用GPU版本的预测库时需要) +* CMake 3.0+ + +请确保系统已经安装好上述基本软件,**下面所有示例以工作目录为 `/root/projects/`演示**。 + +### Step1: 下载代码 + + `git clone https://github.com/PaddlePaddle/PaddleDetection.git` + +**说明**:其中`C++`预测代码在`/root/projects/PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 paddle_inference + +PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: [C++预测库下载列表](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html) + + +下载并解压后`/root/projects/paddle_inference`目录包含内容为: +``` +paddle_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +**注意:** 预编译版本除`nv-jetson-cuda10-cudnn7.5-trt5` 以外其它包都是基于`GCC 4.8.5`编译,使用高版本`GCC`可能存在 `ABI`兼容性问题,建议降级或[自行编译预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。 + + +### Step3: 编译 + +编译`cmake`的命令在`scripts/build.sh`中,请根据实际情况修改主要参数,其主要内容说明如下: + +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=OFF + +# 使用MKL or openblas +WITH_MKL=ON + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=OFF + +# TensorRT 的include路径 +TENSORRT_LIB_DIR=/path/to/TensorRT/include + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/path/to/TensorRT/lib + +# Paddle 预测库路径 +PADDLE_DIR=/path/to/paddle_inference + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# CUDA 的 lib 路径 +CUDA_LIB=/path/to/cuda/lib + +# CUDNN 的 lib 路径 +CUDNN_LIB=/path/to/cudnn/lib + +# 是否开启关键点模型预测功能 +WITH_KEYPOINT=ON + +# 请检查以上各个路径是否正确 + +# 以下无需改动 +cmake .. 
\ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_MKL=${WITH_MKL} \ + -DWITH_TENSORRT=${WITH_TENSORRT} \ + -DTENSORRT_LIB_DIR=${TENSORRT_LIB_DIR} \ + -DTENSORRT_INC_DIR=${TENSORRT_INC_DIR} \ + -DPADDLE_DIR=${PADDLE_DIR} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DPADDLE_LIB_NAME=${PADDLE_LIB_NAME} \ + -DWITH_KEYPOINT=${WITH_KEYPOINT} +make + +``` + +修改脚本设置好主要参数后,执行`build`脚本: + ```shell + sh ./scripts/build.sh + ``` + +**注意**: OPENCV依赖OPENBLAS,Ubuntu用户需确认系统是否已存在`libopenblas.so`。如未安装,可执行apt-get install libopenblas-dev进行安装。 + +### Step4: 预测及可视化 +编译成功后,预测入口程序为`build/main`其主要命令参数说明如下: +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的检测预测模型所在路径 | +| --model_dir_keypoint | Option | 导出的关键点预测模型所在路径 | +| --image_file | 要预测的图片文件路径 | +| --image_dir | 要预测的图片文件夹路径 | +| --video_file | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --device | 运行时的设备,可选择`CPU/GPU/XPU`,默认为`CPU`| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为paddle, 可选(paddle/trt_fp32/trt_fp16/trt_int8)| +| --batch_size | 检测模型预测时的batch size,在指定`image_dir`时有效 | +| --batch_size_keypoint | 关键点模型预测时的batch size,默认为8 | +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | +| --use_mkldnn | CPU预测中是否开启MKLDNN加速 | +| --cpu_threads | 设置cpu线程数,默认为1 | +| --use_dark | 关键点模型输出预测是否使用DarkPose后处理,默认为true | + +**注意**: +- 优先级顺序:`camera_id` > `video_file` > `image_dir` > `image_file`。 +- --run_benchmark如果设置为True,则需要安装依赖`pip install pynvml psutil GPUtil`。 + +`样例一`: +```shell +#不使用`GPU`测试图片 `/root/projects/images/test.jpeg` +./build/main --model_dir=/root/projects/models/yolov3_darknet --image_file=/root/projects/images/test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用 `GPU`预测视频`/root/projects/videos/test.mp4` +./build/main --model_dir=/root/projects/models/yolov3_darknet --video_file=/root/projects/images/test.mp4 --device=GPU +``` +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + + +`样例三`: +```shell +#使用关键点模型与检测模型联合预测,使用 `GPU`预测 +#检测模型检测到的人送入关键点模型进行关键点预测 +./build/main --model_dir=/root/projects/models/yolov3_darknet --model_dir_keypoint=/root/projects/models/hrnet_w32_256x192 --image_file=/root/projects/images/test.jpeg --device=GPU +``` + +## 性能测试 +benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/docs/windows_vs2019_build.md b/deploy/cpp/docs/windows_vs2019_build.md new file mode 100644 index 0000000000000000000000000000000000000000..1a23cabc7bf640ed548942012354013f500d6be2 --- /dev/null +++ b/deploy/cpp/docs/windows_vs2019_build.md @@ -0,0 +1,158 @@ +# Visual Studio 2019 Community CMake 编译指南 + +Windows 平台下,我们使用`Visual Studio 2019 Community` 进行了测试。微软从`Visual Studio 2017`开始即支持直接管理`CMake`跨平台编译项目,但是直到`2019`才提供了稳定和完全的支持,所以如果你想使用CMake管理项目编译构建,我们推荐你使用`Visual Studio 2019`环境下构建。 + + +## 前置条件 +* Visual Studio 2019 (根据Paddle预测库所使用的VS版本选择,请参考 [Visual Studio 不同版本二进制兼容性](https://docs.microsoft.com/zh-cn/cpp/porting/binary-compat-2015-2017?view=vs-2019) ) +* CUDA 9.0 / CUDA 10.0,cudnn 7+ / TensorRT(仅在使用GPU版本的预测库时需要) +* CMake 3.0+ [CMake下载](https://cmake.org/download/) + +**特别注意:windows下预测库需要的TensorRT版本为:**。 + +| 预测库版本 | TensorRT版本 | +| ---- | ---- | +| cuda10.1_cudnn7.6_avx_mkl_trt6 | TensorRT-6.0.1.5 | +| cuda10.2_cudnn7.6_avx_mkl_trt7 | TensorRT-7.0.0.11 | +| cuda11.0_cudnn8.0_avx_mkl_trt7 | TensorRT-7.2.1.6 | + +请确保系统已经安装好上述基本软件,我们使用的是`VS2019`的社区版。 + +**下面所有示例以工作目录为 `D:\projects`演示**。 + +### Step1: 下载代码 + +下载源代码 +```shell +git clone https://github.com/PaddlePaddle/PaddleDetection.git 
+``` + +**说明**:其中`C++`预测代码在`PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 paddle_inference + +PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: [C++预测库下载列表](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#windows) + +解压后`D:\projects\paddle_inference`目录包含内容为: +``` +paddle_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +### Step3: 安装配置OpenCV + +1. 在OpenCV官网下载适用于Windows平台的3.4.6版本, [下载地址](https://sourceforge.net/projects/opencvlibrary/files/3.4.6/opencv-3.4.6-vc14_vc15.exe/download) +2. 运行下载的可执行文件,将OpenCV解压至指定目录,如`D:\projects\opencv` +3. 配置环境变量,如下流程所示(如果使用全局绝对路径,可以不用设置环境变量) + - 我的电脑->属性->高级系统设置->环境变量 + - 在系统变量中找到Path(如没有,自行创建),并双击编辑 + - 新建,将opencv路径填入并保存,如`D:\projects\opencv\build\x64\vc14\bin` + +### Step4: 编译 + +1. 进入到`cpp`文件夹 +``` +cd D:\projects\PaddleDetection\deploy\cpp +``` + +2. 使用CMake生成项目文件 + +编译参数的含义说明如下(带`*`表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐,**使用9.0、10.0版本,不使用9.2、10.1等版本CUDA库**): + +| 参数名 | 含义 | +| ---- | ---- | +| *CUDA_LIB | CUDA的库路径 | +| *CUDNN_LIB | CUDNN的库路径 | +| OPENCV_DIR | OpenCV的安装路径, | +| PADDLE_DIR | Paddle预测库的路径 | +| PADDLE_LIB_NAME | Paddle 预测库名称 | + +**注意:** + +1. 如果编译环境为CPU,需要下载`CPU`版预测库,请把`WITH_GPU`的勾去掉 +2. 如果使用的是`openblas`版本,请把`WITH_MKL`勾去掉 +3. 如无需使用关键点模型可以把`WITH_KEYPOINT`勾去掉 +4. Windows环境下,`PADDLE_LIB_NAME`需要设置为`paddle_inference` + +执行如下命令项目文件: +``` +cmake . -G "Visual Studio 16 2019" -A x64 -T host=x64 -DWITH_GPU=ON -DWITH_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCUDA_LIB=path_to_cuda_lib -DCUDNN_LIB=path_to_cudnn_lib -DPADDLE_DIR=path_to_paddle_lib -DPADDLE_LIB_NAME=paddle_inference -DOPENCV_DIR=path_to_opencv -DWITH_KEYPOINT=ON +``` + +例如: +``` +cmake . -G "Visual Studio 16 2019" -A x64 -T host=x64 -DWITH_GPU=ON -DWITH_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCUDA_LIB=D:\projects\packages\cuda10_0\lib\x64 -DCUDNN_LIB=D:\projects\packages\cuda10_0\lib\x64 -DPADDLE_DIR=D:\projects\packages\paddle_inference -DPADDLE_LIB_NAME=paddle_inference -DOPENCV_DIR=D:\projects\packages\opencv3_4_6 -DWITH_KEYPOINT=ON +``` + + + +3. 
编译 +用`Visual Studio 16 2019`打开`cpp`文件夹下的`PaddleObjectDetector.sln`,将编译模式设置为`Release`,点击`生成`->`全部生成 + + +### Step5: 预测及可视化 + +上述`Visual Studio 2019`编译产出的可执行文件在`out\build\x64-Release`目录下,打开`cmd`,并切换到该目录: + +``` +cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release +``` +可执行文件`main`即为样例的预测程序,其主要的命令行参数如下: + +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的检测预测模型所在路径 | +| --model_dir_keypoint | Option | 导出的关键点预测模型所在路径 | +| --image_file | 要预测的图片文件路径 | +| --image_dir | 要预测的图片文件夹路径 | +| --video_file | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --device | 运行时的设备,可选择`CPU/GPU/XPU`,默认为`CPU`| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为paddle, 可选(paddle/trt_fp32/trt_fp16/trt_int8)| +| --batch_size | 检测模型预测时的batch size,在指定`image_dir`时有效 | +| --batch_size_keypoint | 关键点模型预测时的batch size,默认为8 | +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | +| --use_mkldnn | CPU预测中是否开启MKLDNN加速 | +| --cpu_threads | 设置cpu线程数,默认为1 | +| --use_dark | 关键点模型输出预测是否使用DarkPose后处理,默认为true | + +**注意**: +(1)优先级顺序:`camera_id` > `video_file` > `image_dir` > `image_file`。 +(2)如果提示找不到`opencv_world346.dll`,把`D:\projects\packages\opencv3_4_6\build\x64\vc14\bin`文件夹下的`opencv_world346.dll`拷贝到`main.exe`文件夹下即可。 +(3)--run_benchmark如果设置为True,则需要安装依赖`pip install pynvml psutil GPUtil`。 + + +`样例一`: +```shell +#不使用`GPU`测试图片 `D:\\images\\test.jpeg` +.\main --model_dir=D:\\models\\yolov3_darknet --image_file=D:\\images\\test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用`GPU`测试视频 `D:\\videos\\test.mp4` +.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --device=GPU +``` + +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + + +`样例三`: +```shell +#使用关键点模型与检测模型联合预测,使用 `GPU`预测 +#检测模型检测到的人送入关键点模型进行关键点预测 +.\main --model_dir=D:\\models\\yolov3_darknet --model_dir_keypoint=D:\\models\\hrnet_w32_256x192 --image_file=D:\\images\\test.jpeg --device=GPU +``` + +## 性能测试 +Benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/include/config_parser.h b/deploy/cpp/include/config_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..1f2e381c5284bb7ce16a6b06f858a32e83290f98 --- /dev/null +++ b/deploy/cpp/include/config_parser.h @@ -0,0 +1,142 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
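// The parser below reads the infer_cfg.yml exported alongside the inference
// model. A trimmed, illustrative example covering the keys handled in
// load_config() (values are placeholders, not taken from a real export):
//
//   mode: paddle
//   arch: YOLO
//   min_subgraph_size: 3
//   draw_threshold: 0.5
//   use_dynamic_shape: false
//   label_list: [person, bicycle, car]
//   Preprocess:
//   - type: Resize
//   - type: NormalizeImage
//   - type: Permute
//   # optional: NMS, fpn_stride (PicoDet), tracker: {conf_thres: ...}, mask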
+ +#pragma once + +#include +#include +#include +#include + +#include "yaml-cpp/yaml.h" + +#ifdef _WIN32 +#define OS_PATH_SEP "\\" +#else +#define OS_PATH_SEP "/" +#endif + +namespace PaddleDetection { + +// Inference model configuration parser +class ConfigPaser { + public: + ConfigPaser() {} + + ~ConfigPaser() {} + + bool load_config(const std::string& model_dir, + const std::string& cfg = "infer_cfg.yml") { + // Load as a YAML::Node + YAML::Node config; + config = YAML::LoadFile(model_dir + OS_PATH_SEP + cfg); + + // Get runtime mode : paddle, trt_fp16, trt_fp32 + if (config["mode"].IsDefined()) { + mode_ = config["mode"].as(); + } else { + std::cerr << "Please set mode, " + << "support value : paddle/trt_fp16/trt_fp32." << std::endl; + return false; + } + + // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face + if (config["arch"].IsDefined()) { + arch_ = config["arch"].as(); + } else { + std::cerr << "Please set model arch," + << "support value : YOLO, SSD, RetinaNet, RCNN, Face." + << std::endl; + return false; + } + + // Get min_subgraph_size for tensorrt + if (config["min_subgraph_size"].IsDefined()) { + min_subgraph_size_ = config["min_subgraph_size"].as(); + } else { + std::cerr << "Please set min_subgraph_size." << std::endl; + return false; + } + // Get draw_threshold for visualization + if (config["draw_threshold"].IsDefined()) { + draw_threshold_ = config["draw_threshold"].as(); + } else { + std::cerr << "Please set draw_threshold." << std::endl; + return false; + } + // Get Preprocess for preprocessing + if (config["Preprocess"].IsDefined()) { + preprocess_info_ = config["Preprocess"]; + } else { + std::cerr << "Please set Preprocess." << std::endl; + return false; + } + // Get label_list for visualization + if (config["label_list"].IsDefined()) { + label_list_ = config["label_list"].as>(); + } else { + std::cerr << "Please set label_list." << std::endl; + return false; + } + + // Get use_dynamic_shape for TensorRT + if (config["use_dynamic_shape"].IsDefined()) { + use_dynamic_shape_ = config["use_dynamic_shape"].as(); + } else { + std::cerr << "Please set use_dynamic_shape." << std::endl; + return false; + } + + // Get conf_thresh for tracker + if (config["tracker"].IsDefined()) { + if (config["tracker"]["conf_thres"].IsDefined()) { + conf_thresh_ = config["tracker"]["conf_thres"].as(); + } else { + std::cerr << "Please set conf_thres in tracker." << std::endl; + return false; + } + } + + // Get NMS for postprocess + if (config["NMS"].IsDefined()) { + nms_info_ = config["NMS"]; + } + // Get fpn_stride in PicoDet + if (config["fpn_stride"].IsDefined()) { + fpn_stride_.clear(); + for (auto item : config["fpn_stride"]) { + fpn_stride_.emplace_back(item.as()); + } + } + + if (config["mask"].IsDefined()) { + mask_ = config["mask"].as(); + } + + return true; + } + std::string mode_; + float draw_threshold_; + std::string arch_; + int min_subgraph_size_; + YAML::Node preprocess_info_; + YAML::Node nms_info_; + std::vector label_list_; + std::vector fpn_stride_; + bool use_dynamic_shape_; + float conf_thresh_; + bool mask_ = false; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/jde_detector.h b/deploy/cpp/include/jde_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..959b9b448b5d8a09909eaca93793d6c0d09003f5 --- /dev/null +++ b/deploy/cpp/include/jde_detector.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_inference_api.h" // NOLINT + +#include "include/config_parser.h" +#include "include/preprocess_op.h" +#include "include/tracker.h" + +using namespace paddle_infer; + +namespace PaddleDetection { +// JDE Detection Result +struct MOT_Rect { + float left; + float top; + float right; + float bottom; +}; + +struct MOT_Track { + int ids; + float score; + MOT_Rect rects; +}; + +typedef std::vector MOT_Result; + +// Generate visualization color +cv::Scalar GetColor(int idx); + +// Visualiztion Detection Result +cv::Mat VisualizeTrackResult(const cv::Mat& img, + const MOT_Result& results, + const float fps, + const int frame_id); + +class JDEDetector { + public: + explicit JDEDetector(const std::string& model_dir, + const std::string& device = "CPU", + bool use_mkldnn = false, + int cpu_threads = 1, + const std::string& run_mode = "paddle", + const int batch_size = 1, + const int gpu_id = 0, + const int trt_min_shape = 1, + const int trt_max_shape = 1280, + const int trt_opt_shape = 640, + bool trt_calib_mode = false, + const int min_box_area = 200) { + this->device_ = device; + this->gpu_id_ = gpu_id; + this->cpu_math_library_num_threads_ = cpu_threads; + this->use_mkldnn_ = use_mkldnn; + + this->trt_min_shape_ = trt_min_shape; + this->trt_max_shape_ = trt_max_shape; + this->trt_opt_shape_ = trt_opt_shape; + this->trt_calib_mode_ = trt_calib_mode; + config_.load_config(model_dir); + this->use_dynamic_shape_ = config_.use_dynamic_shape_; + this->min_subgraph_size_ = config_.min_subgraph_size_; + threshold_ = config_.draw_threshold_; + preprocessor_.Init(config_.preprocess_info_); + LoadModel(model_dir, batch_size, run_mode); + this->min_box_area_ = min_box_area; + this->conf_thresh_ = config_.conf_thresh_; + } + + // Load Paddle inference model + void LoadModel(const std::string& model_dir, + const int batch_size = 1, + const std::string& run_mode = "paddle"); + + // Run predictor + void Predict(const std::vector imgs, + const double threshold = 0.5, + const int warmup = 0, + const int repeats = 1, + MOT_Result* result = nullptr, + std::vector* times = nullptr); + + private: + std::string device_ = "CPU"; + int gpu_id_ = 0; + int cpu_math_library_num_threads_ = 1; + bool use_mkldnn_ = false; + int min_subgraph_size_ = 3; + bool use_dynamic_shape_ = false; + int trt_min_shape_ = 1; + int trt_max_shape_ = 1280; + int trt_opt_shape_ = 640; + bool trt_calib_mode_ = false; + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat& image_mat); + // Postprocess result + void Postprocess(const cv::Mat dets, const cv::Mat emb, MOT_Result* result); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + std::vector bbox_data_; + std::vector emb_data_; + float threshold_; + ConfigPaser config_; + float min_box_area_; + float conf_thresh_; +}; + +} // namespace PaddleDetection diff 
--git a/deploy/cpp/include/keypoint_detector.h b/deploy/cpp/include/keypoint_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..ce6aa0e0692d215fc1a704afd37c3787fe8e42ef --- /dev/null +++ b/deploy/cpp/include/keypoint_detector.h @@ -0,0 +1,126 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_inference_api.h" // NOLINT + +#include "include/config_parser.h" +#include "include/keypoint_postprocess.h" +#include "include/preprocess_op.h" + +using namespace paddle_infer; + +namespace PaddleDetection { + +// Visualiztion KeyPoint Result +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap); + +class KeyPointDetector { + public: + explicit KeyPointDetector(const std::string& model_dir, + const std::string& device = "CPU", + bool use_mkldnn = false, + int cpu_threads = 1, + const std::string& run_mode = "paddle", + const int batch_size = 1, + const int gpu_id = 0, + const int trt_min_shape = 1, + const int trt_max_shape = 1280, + const int trt_opt_shape = 640, + bool trt_calib_mode = false, + bool use_dark = true) { + this->device_ = device; + this->gpu_id_ = gpu_id; + this->cpu_math_library_num_threads_ = cpu_threads; + this->use_mkldnn_ = use_mkldnn; + this->use_dark = use_dark; + + this->trt_min_shape_ = trt_min_shape; + this->trt_max_shape_ = trt_max_shape; + this->trt_opt_shape_ = trt_opt_shape; + this->trt_calib_mode_ = trt_calib_mode; + config_.load_config(model_dir); + this->use_dynamic_shape_ = config_.use_dynamic_shape_; + this->min_subgraph_size_ = config_.min_subgraph_size_; + threshold_ = config_.draw_threshold_; + preprocessor_.Init(config_.preprocess_info_); + LoadModel(model_dir, batch_size, run_mode); + } + + // Load Paddle inference model + void LoadModel(const std::string& model_dir, + const int batch_size = 1, + const std::string& run_mode = "paddle"); + + // Run predictor + void Predict(const std::vector imgs, + std::vector>& center, + std::vector>& scale, + const double threshold = 0.5, + const int warmup = 0, + const int repeats = 1, + std::vector* result = nullptr, + std::vector* times = nullptr); + + // Get Model Label list + const std::vector& GetLabelList() const { + return config_.label_list_; + } + + private: + std::string device_ = "CPU"; + int gpu_id_ = 0; + int cpu_math_library_num_threads_ = 1; + bool use_dark = true; + bool use_mkldnn_ = false; + int min_subgraph_size_ = 3; + bool use_dynamic_shape_ = false; + int trt_min_shape_ = 1; + int trt_max_shape_ = 1280; + int trt_opt_shape_ = 640; + bool trt_calib_mode_ = false; + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat& image_mat); + // Postprocess result + void Postprocess(std::vector& output, + std::vector output_shape, + std::vector& idxout, + std::vector idx_shape, + std::vector* result, + 
std::vector>& center, + std::vector>& scale); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + std::vector output_data_; + std::vector idx_data_; + float threshold_; + ConfigPaser config_; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/keypoint_postprocess.h b/deploy/cpp/include/keypoint_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..fa0c7d55f06db986404eb23a7df1144a22e7f33f --- /dev/null +++ b/deploy/cpp/include/keypoint_postprocess.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace PaddleDetection { + +std::vector get_3rd_point(std::vector& a, std::vector& b); + +std::vector get_dir(float src_point_x, float src_point_y, float rot_rad); + +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& preds, int p); + +cv::Mat get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + int inv); + +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine = false); + +void box_to_center_scale(std::vector& box, + int width, + int height, + std::vector& center, + std::vector& scale); + +void get_max_preds(float* heatmap, + std::vector& dim, + std::vector& preds, + float* maxvals, + int batchid, + int joint_idx); + +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK = true); + +// Object KeyPoint Result +struct KeyPointResult { + // Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf + std::vector keypoints; + int num_joints = -1; +}; + +class PoseSmooth { + public: + explicit PoseSmooth(const int width, + const int height, + std::string filter_type = "OneEuro", + float alpha = 0.5, + float fc_d = 0.1, + float fc_min = 0.1, + float beta = 0.1, + float thres_mult = 0.3) + : width(width), + height(height), + alpha(alpha), + fc_d(fc_d), + fc_min(fc_min), + beta(beta), + filter_type(filter_type), + thres_mult(thres_mult){}; + + // Run predictor + KeyPointResult smooth_process(KeyPointResult* result); + void PointSmooth(KeyPointResult* result, + KeyPointResult* keypoint_smoothed, + std::vector thresholds, + int index); + float OneEuroFilter(float x_cur, float x_pre, int loc); + float smoothing_factor(float te, float fc); + float ExpSmoothing(float x_cur, float x_pre, int loc = 0); + + private: + int width = 0; + int height = 0; + float alpha = 0.; + float fc_d = 1.; + float fc_min = 0.; + float beta = 1.; + float thres_mult = 1.; + std::string filter_type = "OneEuro"; + std::vector thresholds = {0.005, + 0.005, + 0.005, + 0.005, + 0.005, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 
0.01, + 0.01, + 0.01, + 0.01}; + KeyPointResult x_prev_hat; + KeyPointResult dx_prev_hat; +}; +} // namespace PaddleDetection diff --git a/deploy/cpp/include/lapjv.h b/deploy/cpp/include/lapjv.h new file mode 100644 index 0000000000000000000000000000000000000000..331defc42c4c38d7360d38b881909fb51ce7e2c7 --- /dev/null +++ b/deploy/cpp/include/lapjv.h @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The code is based on: +// https://github.com/gatagat/lap/blob/master/lap/lapjv.h +// Ths copyright of gatagat/lap is as follows: +// MIT License + +#ifndef LAPJV_H +#define LAPJV_H + +#define LARGE 1000000 + +#if !defined TRUE +#define TRUE 1 +#endif +#if !defined FALSE +#define FALSE 0 +#endif + +#define NEW(x, t, n) if ((x = (t *)malloc(sizeof(t) * (n))) == 0) {return -1;} +#define FREE(x) if (x != 0) { free(x); x = 0; } +#define SWAP_INDICES(a, b) { int_t _temp_index = a; a = b; b = _temp_index; } +#include + +namespace PaddleDetection { + +typedef signed int int_t; +typedef unsigned int uint_t; +typedef double cost_t; +typedef char boolean; +typedef enum fp_t { FP_1 = 1, FP_2 = 2, FP_DYNAMIC = 3 } fp_t; + +int lapjv_internal( + const cv::Mat &cost, const bool extend_cost, const float cost_limit, + int *x, int *y); + +} // namespace PaddleDetection + +#endif // LAPJV_H + diff --git a/deploy/cpp/include/object_detector.h b/deploy/cpp/include/object_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..47bd29362c85eafc3825d25af73694803e2a1504 --- /dev/null +++ b/deploy/cpp/include/object_detector.h @@ -0,0 +1,124 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
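// Minimal usage sketch for the detector declared below (paths and values are
// placeholders; src/main.cc wires this together with the command-line flags):
//
//   PaddleDetection::ObjectDetector det("path/to/exported_model", "GPU");
//   std::vector<cv::Mat> batch = {cv::imread("path/to/image.jpg")};
//   std::vector<PaddleDetection::ObjectResult> results;
//   std::vector<int> bbox_num;
//   std::vector<double> times;
//   det.Predict(batch, 0.5, /*warmup=*/0, /*repeats=*/1,
//               &results, &bbox_num, &times);
//   cv::Mat vis = PaddleDetection::VisualizeResult(
//       batch[0], results, det.GetLabelList(),
//       PaddleDetection::GenerateColorMap(det.GetLabelList().size()),
//       /*is_rbox=*/false);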
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_inference_api.h" // NOLINT + +#include "include/config_parser.h" +#include "include/picodet_postprocess.h" +#include "include/preprocess_op.h" +#include "include/utils.h" + +using namespace paddle_infer; +namespace PaddleDetection { + +// Generate visualization colormap for each class +std::vector GenerateColorMap(int num_class); + +// Visualiztion Detection Result +cv::Mat +VisualizeResult(const cv::Mat &img, + const std::vector &results, + const std::vector &lables, + const std::vector &colormap, const bool is_rbox); + +class ObjectDetector { +public: + explicit ObjectDetector(const std::string &model_dir, + const std::string &device = "CPU", + bool use_mkldnn = false, int cpu_threads = 1, + const std::string &run_mode = "paddle", + const int batch_size = 1, const int gpu_id = 0, + const int trt_min_shape = 1, + const int trt_max_shape = 1280, + const int trt_opt_shape = 640, + bool trt_calib_mode = false) { + this->device_ = device; + this->gpu_id_ = gpu_id; + this->cpu_math_library_num_threads_ = cpu_threads; + this->use_mkldnn_ = use_mkldnn; + + this->trt_min_shape_ = trt_min_shape; + this->trt_max_shape_ = trt_max_shape; + this->trt_opt_shape_ = trt_opt_shape; + this->trt_calib_mode_ = trt_calib_mode; + config_.load_config(model_dir); + this->use_dynamic_shape_ = config_.use_dynamic_shape_; + this->min_subgraph_size_ = config_.min_subgraph_size_; + threshold_ = config_.draw_threshold_; + preprocessor_.Init(config_.preprocess_info_); + LoadModel(model_dir, batch_size, run_mode); + } + + // Load Paddle inference model + void LoadModel(const std::string &model_dir, const int batch_size = 1, + const std::string &run_mode = "paddle"); + + // Run predictor + void Predict(const std::vector imgs, const double threshold = 0.5, + const int warmup = 0, const int repeats = 1, + std::vector *result = nullptr, + std::vector *bbox_num = nullptr, + std::vector *times = nullptr); + + // Get Model Label list + const std::vector &GetLabelList() const { + return config_.label_list_; + } + +private: + std::string device_ = "CPU"; + int gpu_id_ = 0; + int cpu_math_library_num_threads_ = 1; + bool use_mkldnn_ = false; + int min_subgraph_size_ = 3; + bool use_dynamic_shape_ = false; + int trt_min_shape_ = 1; + int trt_max_shape_ = 1280; + int trt_opt_shape_ = 640; + bool trt_calib_mode_ = false; + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat &image_mat); + // Postprocess result + void Postprocess(const std::vector mats, + std::vector *result, + std::vector bbox_num, std::vector output_data_, + std::vector output_mask_data_, bool is_rbox); + + void SOLOv2Postprocess( + const std::vector mats, std::vector *result, + std::vector *bbox_num, std::vector out_bbox_num_data_, + std::vector out_label_data_, std::vector out_score_data_, + std::vector out_global_mask_data_, float threshold = 0.5); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + float threshold_; + ConfigPaser config_; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/picodet_postprocess.h b/deploy/cpp/include/picodet_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..c0705e85d9ac089fd093ba6a1b213dfd08e6e449 --- /dev/null +++ b/deploy/cpp/include/picodet_postprocess.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "include/utils.h" + +namespace PaddleDetection { + +void PicoDetPostProcess(std::vector *results, + std::vector outs, + std::vector fpn_stride, + std::vector im_shape, + std::vector scale_factor, + float score_threshold = 0.3, float nms_threshold = 0.5, + int num_class = 80, int reg_max = 7); + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/preprocess_op.h b/deploy/cpp/include/preprocess_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e3d4a99bb15f2860a7ce4c7bb17b332565de2da1 --- /dev/null +++ b/deploy/cpp/include/preprocess_op.h @@ -0,0 +1,239 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
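PicoDetPostProcess above is parameterized by fpn_stride and reg_max. In PicoDet-style heads each box side is predicted as a distribution over reg_max + 1 bins and decoded as the softmax expectation scaled by the stride of the feature level; the standalone helper below sketches that decoding idea as background for the declaration, and is not the actual body of PicoDetPostProcess.

#include <algorithm>
#include <cmath>
#include <vector>

// Decode one box side from reg_max + 1 logits: softmax, then the expected
// bin index, scaled by the FPN stride.
float decode_box_side(const std::vector<float>& logits, int reg_max, float stride) {
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> prob(reg_max + 1);
  float sum = 0.f;
  for (int i = 0; i <= reg_max; ++i) {
    prob[i] = std::exp(logits[i] - max_logit);
    sum += prob[i];
  }
  float expected = 0.f;
  for (int i = 0; i <= reg_max; ++i) {
    expected += i * prob[i] / sum;
  }
  return expected * stride;  // distance from the anchor point to this side
}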
+ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace PaddleDetection { + +// Object for storing all preprocessed data +class ImageBlob { + public: + // image width and height + std::vector im_shape_; + // Buffer for image data after preprocessing + std::vector im_data_; + // in net data shape(after pad) + std::vector in_net_shape_; + // Evaluation image width and height + // std::vector eval_im_size_f_; + // Scale factor for image size to origin image size + std::vector scale_factor_; + // in net image after preprocessing + cv::Mat in_net_im_; +}; + +// Abstraction of preprocessing opration class +class PreprocessOp { + public: + virtual void Init(const YAML::Node& item) = 0; + virtual void Run(cv::Mat* im, ImageBlob* data) = 0; +}; + +class InitInfo : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) {} + virtual void Run(cv::Mat* im, ImageBlob* data); +}; + +class NormalizeImage : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + mean_ = item["mean"].as>(); + scale_ = item["std"].as>(); + if (item["is_scale"]) is_scale_ = item["is_scale"].as(); + if (item["norm_type"]) norm_type_ = item["norm_type"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + // CHW or HWC + std::vector mean_; + std::vector scale_; + bool is_scale_ = true; + std::string norm_type_ = "mean_std"; +}; + +class Permute : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) {} + virtual void Run(cv::Mat* im, ImageBlob* data); +}; + +class Resize : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + interp_ = item["interp"].as(); + keep_ratio_ = item["keep_ratio"].as(); + target_size_ = item["target_size"].as>(); + } + + // Compute best resize scale for x-dimension, y-dimension + std::pair GenerateScale(const cv::Mat& im); + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int interp_; + bool keep_ratio_; + std::vector target_size_; + std::vector in_net_shape_; +}; + +class LetterBoxResize : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + target_size_ = item["target_size"].as>(); + } + + float GenerateScale(const cv::Mat& im); + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + std::vector target_size_; + std::vector in_net_shape_; +}; +// Models with FPN need input shape % stride == 0 +class PadStride : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + stride_ = item["stride"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int stride_; +}; + +class TopDownEvalAffine : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + trainsize_ = item["trainsize"].as>(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int interp_ = 1; + std::vector trainsize_; +}; + +class WarpAffine : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + input_h_ = item["input_h"].as(); + input_w_ = item["input_w"].as(); + keep_res_ = item["keep_res"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int input_h_; + int input_w_; + int interp_ = 1; + bool keep_res_ = true; + int pad_ = 31; +}; + +class Pad : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item) { + size_ = item["size"].as>(); + fill_value_ = item["fill_value"].as>(); + } + + virtual void Run(cv::Mat* im, ImageBlob* 
data); + + private: + std::vector size_; + std::vector fill_value_; +}; + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio = 0.15); + +// check whether the input size is dynamic +bool CheckDynamicInput(const std::vector& imgs); + +// Pad images in batch +std::vector PadBatch(const std::vector& imgs); + +class Preprocessor { + public: + void Init(const YAML::Node& config_node) { + // initialize image info at first + ops_["InitInfo"] = std::make_shared(); + for (const auto& item : config_node) { + auto op_name = item["type"].as(); + + ops_[op_name] = CreateOp(op_name); + ops_[op_name]->Init(item); + } + } + + std::shared_ptr CreateOp(const std::string& name) { + if (name == "Resize") { + return std::make_shared(); + } else if (name == "LetterBoxResize") { + return std::make_shared(); + } else if (name == "Permute") { + return std::make_shared(); + } else if (name == "NormalizeImage") { + return std::make_shared(); + } else if (name == "PadStride") { + // use PadStride instead of PadBatch + return std::make_shared(); + } else if (name == "TopDownEvalAffine") { + return std::make_shared(); + } else if (name == "WarpAffine") { + return std::make_shared(); + }else if (name == "Pad") { + return std::make_shared(); + } + std::cerr << "can not find function of OP: " << name + << " and return: nullptr" << std::endl; + return nullptr; + } + + void Run(cv::Mat* im, ImageBlob* data); + + public: + static const std::vector RUN_ORDER; + + private: + std::unordered_map> ops_; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/tracker.h b/deploy/cpp/include/tracker.h new file mode 100644 index 0000000000000000000000000000000000000000..903c3b3046280766e33ce67ef157ba0ea558e3e1 --- /dev/null +++ b/deploy/cpp/include/tracker.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
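A sketch of feeding the Preprocessor above an inline yaml-cpp pipeline, which is the same shape of node list that Init iterates over; the concrete op parameters are illustrative, and the template arguments stripped from the header in this diff view (std::string keys, float buffers in ImageBlob) are assumed.

#include <opencv2/opencv.hpp>
#include <yaml-cpp/yaml.h>
#include "include/preprocess_op.h"

// Build a small pipeline (keep-ratio resize, mean/std normalization, permute)
// and run it on an image; Preprocessor::Init adds the implicit InitInfo op itself.
void preprocess_sketch(cv::Mat* image) {
  YAML::Node ops = YAML::Load(
      "- {type: Resize, interp: 2, keep_ratio: true, target_size: [608, 608]}\n"
      "- {type: NormalizeImage, is_scale: true, norm_type: mean_std, "
      "mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}\n"
      "- {type: Permute}");
  PaddleDetection::Preprocessor preprocessor;
  preprocessor.Init(ops);
  PaddleDetection::ImageBlob blob;
  preprocessor.Run(image, &blob);  // blob.im_data_ now holds the network input
}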
+ +// The code is based on: +// https://github.com/CnybTseng/JDE/blob/master/platforms/common/jdetracker.h +// Ths copyright of CnybTseng/JDE is as follows: +// MIT License + +#pragma once + +#include +#include +#include + +#include "trajectory.h" + +namespace PaddleDetection { + +typedef std::map Match; +typedef std::map::iterator MatchIterator; + +struct Track +{ + int id; + float score; + cv::Vec4f ltrb; +}; + +class JDETracker +{ +public: + static JDETracker *instance(void); + virtual bool update(const cv::Mat &dets, const cv::Mat &emb, std::vector &tracks); +private: + JDETracker(void); + virtual ~JDETracker(void) {} + cv::Mat motion_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b); + void linear_assignment(const cv::Mat &cost, float cost_limit, Match &matches, + std::vector &mismatch_row, std::vector &mismatch_col); + void remove_duplicate_trajectory(TrajectoryPool &a, TrajectoryPool &b, float iou_thresh=0.15f); +private: + static JDETracker *me; + int timestamp; + TrajectoryPool tracked_trajectories; + TrajectoryPool lost_trajectories; + TrajectoryPool removed_trajectories; + int max_lost_time; + float lambda; + float det_thresh; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/trajectory.h b/deploy/cpp/include/trajectory.h new file mode 100644 index 0000000000000000000000000000000000000000..d801e280007b52b6fda98d90aebde197cf090ca5 --- /dev/null +++ b/deploy/cpp/include/trajectory.h @@ -0,0 +1,202 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
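A per-frame usage sketch for the JDETracker singleton above. The detection matrix layout (N x 6, CV_32F, box coordinates in the first four columns and the score in column 4) follows how jde_detector.cc later in this diff assembles its inputs; the Track element type of the output vector is assumed, since template arguments are stripped in this diff view.

#include <cstdio>
#include <vector>
#include <opencv2/opencv.hpp>
#include "include/tracker.h"

// Feed one frame of filtered detections and their embeddings to the tracker
// and print the confirmed tracks. dets is N x 6 and emb is N x D, both CV_32F.
void track_one_frame_sketch(const cv::Mat& dets, const cv::Mat& emb) {
  std::vector<PaddleDetection::Track> tracks;
  PaddleDetection::JDETracker::instance()->update(dets, emb, tracks);
  for (const auto& t : tracks) {
    // t.ltrb = (left, top, right, bottom); t.id is stable across frames
    std::printf("id=%d score=%.2f box=[%.0f %.0f %.0f %.0f]\n",
                t.id, t.score, t.ltrb[0], t.ltrb[1], t.ltrb[2], t.ltrb[3]);
  }
}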
+ +// The code is based on: +// https://github.com/CnybTseng/JDE/blob/master/platforms/common/trajectory.h +// Ths copyright of CnybTseng/JDE is as follows: +// MIT License + +#pragma once + +#include +#include + +namespace PaddleDetection { + +typedef enum +{ + New = 0, + Tracked = 1, + Lost = 2, + Removed = 3 +} TrajectoryState; + +class Trajectory; +typedef std::vector TrajectoryPool; +typedef std::vector::iterator TrajectoryPoolIterator; +typedef std::vectorTrajectoryPtrPool; +typedef std::vector::iterator TrajectoryPtrPoolIterator; + +class TKalmanFilter : public cv::KalmanFilter +{ +public: + TKalmanFilter(void); + virtual ~TKalmanFilter(void) {} + virtual void init(const cv::Mat &measurement); + virtual const cv::Mat &predict(); + virtual const cv::Mat &correct(const cv::Mat &measurement); + virtual void project(cv::Mat &mean, cv::Mat &covariance) const; +private: + float std_weight_position; + float std_weight_velocity; +}; + +inline TKalmanFilter::TKalmanFilter(void) : cv::KalmanFilter(8, 4) +{ + cv::KalmanFilter::transitionMatrix = cv::Mat::eye(8, 8, CV_32F); + for (int i = 0; i < 4; ++i) + cv::KalmanFilter::transitionMatrix.at(i, i + 4) = 1; + cv::KalmanFilter::measurementMatrix = cv::Mat::eye(4, 8, CV_32F); + std_weight_position = 1/20.f; + std_weight_velocity = 1/160.f; +} + +class Trajectory : public TKalmanFilter +{ +public: + Trajectory(); + Trajectory(cv::Vec4f <rb, float score, const cv::Mat &embedding); + Trajectory(const Trajectory &other); + Trajectory &operator=(const Trajectory &rhs); + virtual ~Trajectory(void) {}; + + static int next_id(); + virtual const cv::Mat &predict(void); + virtual void update(Trajectory &traj, int timestamp, bool update_embedding=true); + virtual void activate(int timestamp); + virtual void reactivate(Trajectory &traj, int timestamp, bool newid=false); + virtual void mark_lost(void); + virtual void mark_removed(void); + + friend TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPool &b); + friend TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPtrPool &b); + friend TrajectoryPool &operator+=(TrajectoryPool &a, const TrajectoryPtrPool &b); + friend TrajectoryPool operator-(const TrajectoryPool &a, const TrajectoryPool &b); + friend TrajectoryPool &operator-=(TrajectoryPool &a, const TrajectoryPool &b); + friend TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b); + friend TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, TrajectoryPool &b); + friend TrajectoryPtrPool operator-(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b); + + friend cv::Mat embedding_distance(const TrajectoryPool &a, const TrajectoryPool &b); + friend cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b); + friend cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b); + + friend cv::Mat mahalanobis_distance(const TrajectoryPool &a, const TrajectoryPool &b); + friend cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b); + friend cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b); + + friend cv::Mat iou_distance(const TrajectoryPool &a, const TrajectoryPool &b); + friend cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b); + friend cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b); +private: + void update_embedding(const cv::Mat &embedding); +public: + TrajectoryState state; + cv::Vec4f ltrb; + cv::Mat smooth_embedding; + int id; + 
bool is_activated; + int timestamp; + int starttime; + float score; +private: + static int count; + cv::Vec4f xyah; + cv::Mat current_embedding; + float eta; + int length; +}; + +inline cv::Vec4f ltrb2xyah(cv::Vec4f <rb) +{ + cv::Vec4f xyah; + xyah[0] = (ltrb[0] + ltrb[2]) * 0.5f; + xyah[1] = (ltrb[1] + ltrb[3]) * 0.5f; + xyah[3] = ltrb[3] - ltrb[1]; + xyah[2] = (ltrb[2] - ltrb[0]) / xyah[3]; + return xyah; +} + +inline Trajectory::Trajectory() : + state(New), ltrb(cv::Vec4f()), smooth_embedding(cv::Mat()), id(0), + is_activated(false), timestamp(0), starttime(0), score(0), eta(0.9), length(0) +{ +} + +inline Trajectory::Trajectory(cv::Vec4f <rb_, float score_, const cv::Mat &embedding) : + state(New), ltrb(ltrb_), smooth_embedding(cv::Mat()), id(0), + is_activated(false), timestamp(0), starttime(0), score(score_), eta(0.9), length(0) +{ + xyah = ltrb2xyah(ltrb); + update_embedding(embedding); +} + +inline Trajectory::Trajectory(const Trajectory &other): + state(other.state), ltrb(other.ltrb), id(other.id), is_activated(other.is_activated), + timestamp(other.timestamp), starttime(other.starttime), xyah(other.xyah), + score(other.score), eta(other.eta), length(other.length) +{ + other.smooth_embedding.copyTo(smooth_embedding); + other.current_embedding.copyTo(current_embedding); + // copy state in KalmanFilter + + other.statePre.copyTo(cv::KalmanFilter::statePre); + other.statePost.copyTo(cv::KalmanFilter::statePost); + other.errorCovPre.copyTo(cv::KalmanFilter::errorCovPre); + other.errorCovPost.copyTo(cv::KalmanFilter::errorCovPost); + +} + +inline Trajectory &Trajectory::operator=(const Trajectory &rhs) +{ + this->state = rhs.state; + this->ltrb = rhs.ltrb; + rhs.smooth_embedding.copyTo(this->smooth_embedding); + this->id = rhs.id; + this->is_activated = rhs.is_activated; + this->timestamp = rhs.timestamp; + this->starttime = rhs.starttime; + this->xyah = rhs.xyah; + this->score = rhs.score; + rhs.current_embedding.copyTo(this->current_embedding); + this->eta = rhs.eta; + this->length = rhs.length; + + // copy state in KalmanFilter + + rhs.statePre.copyTo(cv::KalmanFilter::statePre); + rhs.statePost.copyTo(cv::KalmanFilter::statePost); + rhs.errorCovPre.copyTo(cv::KalmanFilter::errorCovPre); + rhs.errorCovPost.copyTo(cv::KalmanFilter::errorCovPost); + + return *this; +} + +inline int Trajectory::next_id() +{ + ++count; + return count; +} + +inline void Trajectory::mark_lost(void) +{ + state = Lost; +} + +inline void Trajectory::mark_removed(void) +{ + state = Removed; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/utils.h b/deploy/cpp/include/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b41db0dacff17339ffcac591b7825cec09d3663d --- /dev/null +++ b/deploy/cpp/include/utils.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
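The TKalmanFilter above is an 8-state / 4-measurement constant-velocity filter over the (x, y, aspect, height) parameterization produced by ltrb2xyah; as a worked example, ltrb (10, 20, 50, 100) maps to xyah (30, 60, 0.5, 80). The sketch below reproduces that setup with a plain cv::KalmanFilter for illustration only.

#include <opencv2/opencv.hpp>

// Constant-velocity Kalman setup mirroring TKalmanFilter: state is
// [x, y, a, h, vx, vy, va, vh], measurement is [x, y, a, h].
void kalman_cv_sketch() {
  cv::KalmanFilter kf(8, 4);
  kf.transitionMatrix = cv::Mat::eye(8, 8, CV_32F);
  for (int i = 0; i < 4; ++i)
    kf.transitionMatrix.at<float>(i, i + 4) = 1.f;  // x_{t+1} = x_t + v_t
  kf.measurementMatrix = cv::Mat::eye(4, 8, CV_32F);

  // Worked example: ltrb (10, 20, 50, 100) -> xyah (30, 60, 0.5, 80).
  float xyah[4] = {30.f, 60.f, 0.5f, 80.f};
  kf.statePost = cv::Mat::zeros(8, 1, CV_32F);
  for (int i = 0; i < 4; ++i) kf.statePost.at<float>(i) = xyah[i];

  cv::Mat predicted = kf.predict();                          // advance by velocity
  cv::Mat corrected = kf.correct(cv::Mat(4, 1, CV_32F, xyah));  // fuse a measurement
  (void)predicted;
  (void)corrected;
}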
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace PaddleDetection {
+
+// Object Detection Result
+struct ObjectResult {
+  // Rectangle coordinates of detected object: left, right, top, down
+  std::vector rect;
+  // Class id of detected object
+  int class_id;
+  // Confidence of detected object
+  float confidence;
+  // Mask of detected object
+  std::vector mask;
+};
+
+void nms(std::vector &input_boxes, float nms_threshold);
+
+} // namespace PaddleDetection
diff --git a/deploy/cpp/scripts/build.sh b/deploy/cpp/scripts/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1937c7a05b6854e47c32c7c7833526b94de083ff
--- /dev/null
+++ b/deploy/cpp/scripts/build.sh
@@ -0,0 +1,86 @@
+# Whether to use GPU (i.e. whether to use CUDA)
+WITH_GPU=OFF
+
+# Whether to use MKL or OpenBLAS; must be set to OFF on TX2
+WITH_MKL=ON
+
+# Whether to integrate TensorRT (only effective when WITH_GPU=ON)
+WITH_TENSORRT=OFF
+
+# Name of the Paddle inference library. The lib name differs across platforms and versions; check the name of the `lib` under the `paddle_inference/lib/` folder of the downloaded inference library
+PADDLE_LIB_NAME=libpaddle_inference
+
+# TensorRT include path
+TENSORRT_INC_DIR=/path/to/tensorrt/include
+
+# TensorRT lib path
+TENSORRT_LIB_DIR=/path/to/tensorrt/lib
+
+# Path of the Paddle inference library
+PADDLE_DIR=/path/to/paddle_inference
+
+# CUDA lib path
+CUDA_LIB=/path/to/cuda/lib
+
+# CUDNN lib path
+CUDNN_LIB=/path/to/cudnn/lib
+
+# Whether to enable keypoint model inference
+WITH_KEYPOINT=OFF
+
+# Whether to enable tracking (MOT) model inference
+WITH_MOT=OFF
+
+MACHINE_TYPE=`uname -m`
+echo "MACHINE_TYPE: "${MACHINE_TYPE}
+
+
+if [ "$MACHINE_TYPE" = "x86_64" ]
+then
+    echo "set OPENCV_DIR for x86_64"
+    # On Linux, download the pre-built OpenCV with the following commands
+    mkdir -p $(pwd)/deps && cd $(pwd)/deps
+    wget -c https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz
+    tar -xvf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz && cd ..
+
+    # set OPENCV_DIR
+    OPENCV_DIR=$(pwd)/deps/opencv-3.4.16_gcc8.2_ffmpeg
+
+elif [ "$MACHINE_TYPE" = "aarch64" ]
+then
+    echo "set OPENCV_DIR for aarch64"
+    # On TX2, download the pre-built OpenCV with the following commands
+    mkdir -p $(pwd)/deps && cd $(pwd)/deps
+    wget -c https://bj.bcebos.com/v1/paddledet/data/TX2_JetPack4.3_opencv_3.4.6_gcc7.5.0.tar.gz
+    tar -xvf TX2_JetPack4.3_opencv_3.4.6_gcc7.5.0.tar.gz && cd ..
+
+    # set OPENCV_DIR
+    OPENCV_DIR=$(pwd)/deps/TX2_JetPack4.3_opencv_3.4.6_gcc7.5.0/
+
+else
+    echo "Please set OPENCV_DIR manually"
+fi
+
+echo "OPENCV_DIR: "$OPENCV_DIR
+
+# No need to modify anything below
+rm -rf build
+mkdir -p build
+cd build
+cmake .. \
+    -DWITH_GPU=${WITH_GPU} \
+    -DWITH_MKL=${WITH_MKL} \
+    -DWITH_TENSORRT=${WITH_TENSORRT} \
+    -DTENSORRT_LIB_DIR=${TENSORRT_LIB_DIR} \
+    -DTENSORRT_INC_DIR=${TENSORRT_INC_DIR} \
+    -DPADDLE_DIR=${PADDLE_DIR} \
+    -DWITH_STATIC_LIB=${WITH_STATIC_LIB} \
+    -DCUDA_LIB=${CUDA_LIB} \
+    -DCUDNN_LIB=${CUDNN_LIB} \
+    -DOPENCV_DIR=${OPENCV_DIR} \
+    -DPADDLE_LIB_NAME=${PADDLE_LIB_NAME} \
+    -DWITH_KEYPOINT=${WITH_KEYPOINT} \
+    -DWITH_MOT=${WITH_MOT}
+
+make
+echo "make finished!"
diff --git a/deploy/cpp/src/jde_detector.cc b/deploy/cpp/src/jde_detector.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5df8b87a7f89deddb19fb328ab5d5adcd5c5245c
--- /dev/null
+++ b/deploy/cpp/src/jde_detector.cc
@@ -0,0 +1,368 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include +#include "include/jde_detector.h" + +using namespace paddle_infer; + +namespace PaddleDetection { + +// Load Model and create model predictor +void JDEDetector::LoadModel(const std::string& model_dir, + const int batch_size, + const std::string& run_mode) { + paddle_infer::Config config; + std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel"; + std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams"; + config.SetModel(prog_file, params_file); + if (this->device_ == "GPU") { + config.EnableUseGpu(200, this->gpu_id_); + config.SwitchIrOptim(true); + // use tensorrt + if (run_mode != "paddle") { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (run_mode == "trt_fp32") { + precision = paddle_infer::Config::Precision::kFloat32; + } else if (run_mode == "trt_fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } else if (run_mode == "trt_int8") { + precision = paddle_infer::Config::Precision::kInt8; + } else { + printf( + "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or " + "'trt_int8'"); + } + // set tensorrt + config.EnableTensorRtEngine(1 << 30, + batch_size, + this->min_subgraph_size_, + precision, + false, + this->trt_calib_mode_); + + // set use dynamic shape + if (this->use_dynamic_shape_) { + // set DynamicShsape for image tensor + const std::vector min_input_shape = { + 1, 3, this->trt_min_shape_, this->trt_min_shape_}; + const std::vector max_input_shape = { + 1, 3, this->trt_max_shape_, this->trt_max_shape_}; + const std::vector opt_input_shape = { + 1, 3, this->trt_opt_shape_, this->trt_opt_shape_}; + const std::map> map_min_input_shape = { + {"image", min_input_shape}}; + const std::map> map_max_input_shape = { + {"image", max_input_shape}}; + const std::map> map_opt_input_shape = { + {"image", opt_input_shape}}; + + config.SetTRTDynamicShapeInfo( + map_min_input_shape, map_max_input_shape, map_opt_input_shape); + std::cout << "TensorRT dynamic shape enabled" << std::endl; + } + } + + } else if (this->device_ == "XPU") { + config.EnableXpu(10 * 1024 * 1024); + } else { + config.DisableGpu(); + if (this->use_mkldnn_) { + config.EnableMKLDNN(); + // cache 10 different shapes for mkldnn to avoid memory leak + config.SetMkldnnCacheCapacity(10); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + config.SwitchUseFeedFetchOps(false); + config.SwitchIrOptim(true); + config.DisableGlogInfo(); + // Memory optimization + config.EnableMemoryOptim(); + predictor_ = std::move(CreatePredictor(config)); +} + +// Visualiztion results +cv::Mat VisualizeTrackResult(const cv::Mat& img, + const MOT_Result& results, + const float fps, + const int frame_id) { + cv::Mat vis_img = img.clone(); + int im_h = img.rows; + int im_w = img.cols; + float text_scale = std::max(1, int(im_w / 1600.)); + float text_thickness = 2.; + float line_thickness = std::max(1, int(im_w / 500.)); + + std::ostringstream oss; + oss << std::setiosflags(std::ios::fixed) << std::setprecision(4); + oss << "frame: " << frame_id << " "; + oss << "fps: " << fps << " "; + oss 
<< "num: " << results.size(); + std::string text = oss.str(); + + cv::Point origin; + origin.x = 0; + origin.y = int(15 * text_scale); + cv::putText(vis_img, + text, + origin, + cv::FONT_HERSHEY_PLAIN, + text_scale, + (0, 0, 255), + 2); + + for (int i = 0; i < results.size(); ++i) { + const int obj_id = results[i].ids; + const float score = results[i].score; + + cv::Scalar color = GetColor(obj_id); + + cv::Point pt1 = cv::Point(results[i].rects.left, results[i].rects.top); + cv::Point pt2 = cv::Point(results[i].rects.right, results[i].rects.bottom); + cv::Point id_pt = + cv::Point(results[i].rects.left, results[i].rects.top + 10); + cv::Point score_pt = + cv::Point(results[i].rects.left, results[i].rects.top - 10); + cv::rectangle(vis_img, pt1, pt2, color, line_thickness); + + std::ostringstream idoss; + idoss << std::setiosflags(std::ios::fixed) << std::setprecision(4); + idoss << obj_id; + std::string id_text = idoss.str(); + + cv::putText(vis_img, + id_text, + id_pt, + cv::FONT_HERSHEY_PLAIN, + text_scale, + cv::Scalar(0, 255, 255), + text_thickness); + + std::ostringstream soss; + soss << std::setiosflags(std::ios::fixed) << std::setprecision(2); + soss << score; + std::string score_text = soss.str(); + + cv::putText(vis_img, + score_text, + score_pt, + cv::FONT_HERSHEY_PLAIN, + text_scale, + cv::Scalar(0, 255, 255), + text_thickness); + } + return vis_img; +} + +void FilterDets(const float conf_thresh, + const cv::Mat dets, + std::vector* index) { + for (int i = 0; i < dets.rows; ++i) { + float score = *dets.ptr(i, 4); + if (score > conf_thresh) { + index->push_back(i); + } + } +} + +void JDEDetector::Preprocess(const cv::Mat& ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + preprocessor_.Run(&im, &inputs_); +} + +void JDEDetector::Postprocess(const cv::Mat dets, + const cv::Mat emb, + MOT_Result* result) { + result->clear(); + std::vector tracks; + std::vector valid; + FilterDets(conf_thresh_, dets, &valid); + cv::Mat new_dets, new_emb; + for (int i = 0; i < valid.size(); ++i) { + new_dets.push_back(dets.row(valid[i])); + new_emb.push_back(emb.row(valid[i])); + } + JDETracker::instance()->update(new_dets, new_emb, tracks); + if (tracks.size() == 0) { + MOT_Track mot_track; + MOT_Rect ret = {*dets.ptr(0, 0), + *dets.ptr(0, 1), + *dets.ptr(0, 2), + *dets.ptr(0, 3)}; + mot_track.ids = 1; + mot_track.score = *dets.ptr(0, 4); + mot_track.rects = ret; + result->push_back(mot_track); + } else { + std::vector::iterator titer; + for (titer = tracks.begin(); titer != tracks.end(); ++titer) { + if (titer->score < threshold_) { + continue; + } else { + float w = titer->ltrb[2] - titer->ltrb[0]; + float h = titer->ltrb[3] - titer->ltrb[1]; + bool vertical = w / h > 1.6; + float area = w * h; + if (area > min_box_area_ && !vertical) { + MOT_Track mot_track; + MOT_Rect ret = { + titer->ltrb[0], titer->ltrb[1], titer->ltrb[2], titer->ltrb[3]}; + mot_track.rects = ret; + mot_track.score = titer->score; + mot_track.ids = titer->id; + result->push_back(mot_track); + } + } + } + } +} + +void JDEDetector::Predict(const std::vector imgs, + const double threshold, + const int warmup, + const int repeats, + MOT_Result* result, + std::vector* times) { + auto preprocess_start = std::chrono::steady_clock::now(); + int batch_size = imgs.size(); + + // in_data_batch + std::vector in_data_all; + std::vector im_shape_all(batch_size * 2); + std::vector scale_factor_all(batch_size * 2); + + // Preprocess image + for (int bs_idx = 0; bs_idx < batch_size; 
bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + Preprocess(im); + im_shape_all[bs_idx * 2] = inputs_.im_shape_[0]; + im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1]; + + scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0]; + scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1]; + + // TODO: reduce cost time + in_data_all.insert( + in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end()); + } + + // Prepare input tensor + auto input_names = predictor_->GetInputNames(); + for (const auto& tensor_name : input_names) { + auto in_tensor = predictor_->GetInputHandle(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Reshape({batch_size, 3, rh, rw}); + in_tensor->CopyFromCpu(in_data_all.data()); + } else if (tensor_name == "im_shape") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(im_shape_all.data()); + } else if (tensor_name == "scale_factor") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(scale_factor_all.data()); + } + } + + auto preprocess_end = std::chrono::steady_clock::now(); + std::vector bbox_shape; + std::vector emb_shape; + // Run predictor + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto bbox_tensor = predictor_->GetOutputHandle(output_names[0]); + bbox_shape = bbox_tensor->shape(); + auto emb_tensor = predictor_->GetOutputHandle(output_names[1]); + emb_shape = emb_tensor->shape(); + // Calculate bbox length + int bbox_size = 1; + for (int j = 0; j < bbox_shape.size(); ++j) { + bbox_size *= bbox_shape[j]; + } + // Calculate emb length + int emb_size = 1; + for (int j = 0; j < emb_shape.size(); ++j) { + emb_size *= emb_shape[j]; + } + + bbox_data_.resize(bbox_size); + bbox_tensor->CopyToCpu(bbox_data_.data()); + + emb_data_.resize(emb_size); + emb_tensor->CopyToCpu(emb_data_.data()); + } + + auto inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto bbox_tensor = predictor_->GetOutputHandle(output_names[0]); + bbox_shape = bbox_tensor->shape(); + auto emb_tensor = predictor_->GetOutputHandle(output_names[1]); + emb_shape = emb_tensor->shape(); + // Calculate bbox length + int bbox_size = 1; + for (int j = 0; j < bbox_shape.size(); ++j) { + bbox_size *= bbox_shape[j]; + } + // Calculate emb length + int emb_size = 1; + for (int j = 0; j < emb_shape.size(); ++j) { + emb_size *= emb_shape[j]; + } + + bbox_data_.resize(bbox_size); + bbox_tensor->CopyToCpu(bbox_data_.data()); + + emb_data_.resize(emb_size); + emb_tensor->CopyToCpu(emb_data_.data()); + } + auto inference_end = std::chrono::steady_clock::now(); + auto postprocess_start = std::chrono::steady_clock::now(); + // Postprocessing result + result->clear(); + + cv::Mat dets(bbox_shape[0], 6, CV_32FC1, bbox_data_.data()); + cv::Mat emb(bbox_shape[0], emb_shape[1], CV_32FC1, emb_data_.data()); + + Postprocess(dets, emb, result); + + auto postprocess_end = std::chrono::steady_clock::now(); + + std::chrono::duration preprocess_diff = + preprocess_end - preprocess_start; + (*times)[0] += double(preprocess_diff.count() * 1000); + std::chrono::duration inference_diff = inference_end - inference_start; + (*times)[1] += double(inference_diff.count() * 1000); + std::chrono::duration postprocess_diff = + postprocess_end - postprocess_start; + (*times)[2] += 
double(postprocess_diff.count() * 1000); +} + +cv::Scalar GetColor(int idx) { + idx = idx * 3; + cv::Scalar color = + cv::Scalar((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255); + return color; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/keypoint_detector.cc b/deploy/cpp/src/keypoint_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0ee884566749c5ab459d8ec76aa98ae4e1d1f3c --- /dev/null +++ b/deploy/cpp/src/keypoint_detector.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include +#include "include/keypoint_detector.h" + +using namespace paddle_infer; + +namespace PaddleDetection { + +// Load Model and create model predictor +void KeyPointDetector::LoadModel(const std::string& model_dir, + const int batch_size, + const std::string& run_mode) { + paddle_infer::Config config; + std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel"; + std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams"; + config.SetModel(prog_file, params_file); + if (this->device_ == "GPU") { + config.EnableUseGpu(200, this->gpu_id_); + config.SwitchIrOptim(true); + // use tensorrt + if (run_mode != "paddle") { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (run_mode == "trt_fp32") { + precision = paddle_infer::Config::Precision::kFloat32; + } else if (run_mode == "trt_fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } else if (run_mode == "trt_int8") { + precision = paddle_infer::Config::Precision::kInt8; + } else { + printf( + "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or " + "'trt_int8'"); + } + // set tensorrt + config.EnableTensorRtEngine(1 << 30, + batch_size, + this->min_subgraph_size_, + precision, + false, + this->trt_calib_mode_); + + // set use dynamic shape + if (this->use_dynamic_shape_) { + // set DynamicShsape for image tensor + const std::vector min_input_shape = { + 1, 3, this->trt_min_shape_, this->trt_min_shape_}; + const std::vector max_input_shape = { + 1, 3, this->trt_max_shape_, this->trt_max_shape_}; + const std::vector opt_input_shape = { + 1, 3, this->trt_opt_shape_, this->trt_opt_shape_}; + const std::map> map_min_input_shape = { + {"image", min_input_shape}}; + const std::map> map_max_input_shape = { + {"image", max_input_shape}}; + const std::map> map_opt_input_shape = { + {"image", opt_input_shape}}; + + config.SetTRTDynamicShapeInfo( + map_min_input_shape, map_max_input_shape, map_opt_input_shape); + std::cout << "TensorRT dynamic shape enabled" << std::endl; + } + } + + } else if (this->device_ == "XPU") { + config.EnableXpu(10 * 1024 * 1024); + } else { + config.DisableGpu(); + if (this->use_mkldnn_) { + config.EnableMKLDNN(); + // cache 10 different shapes for mkldnn to avoid memory leak + config.SetMkldnnCacheCapacity(10); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + 
config.SwitchUseFeedFetchOps(false); + config.SwitchIrOptim(true); + config.DisableGlogInfo(); + // Memory optimization + config.EnableMemoryOptim(); + predictor_ = std::move(CreatePredictor(config)); +} + +// Visualization MaskDetector results +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap) { + const int edge[][2] = {{0, 1}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 5}, + {4, 6}, + {5, 7}, + {6, 8}, + {7, 9}, + {8, 10}, + {5, 11}, + {6, 12}, + {11, 13}, + {12, 14}, + {13, 15}, + {14, 16}, + {11, 12}}; + cv::Mat vis_img = img.clone(); + for (int batchid = 0; batchid < results.size(); batchid++) { + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[i * 3] > 0.5) { + int x_coord = int(results[batchid].keypoints[i * 3 + 1]); + int y_coord = int(results[batchid].keypoints[i * 3 + 2]); + cv::circle(vis_img, + cv::Point2d(x_coord, y_coord), + 1, + cv::Scalar(0, 0, 255), + 2); + } + } + for (int i = 0; i < results[batchid].num_joints; i++) { + int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]); + int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]); + int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]); + int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]); + cv::line(vis_img, + cv::Point2d(x_start, y_start), + cv::Point2d(x_end, y_end), + colormap[i], + 1); + } + } + return vis_img; +} + +void KeyPointDetector::Preprocess(const cv::Mat& ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + preprocessor_.Run(&im, &inputs_); +} + +void KeyPointDetector::Postprocess(std::vector& output, + std::vector output_shape, + std::vector& idxout, + std::vector idx_shape, + std::vector* result, + std::vector>& center_bs, + std::vector>& scale_bs) { + std::vector preds(output_shape[1] * 3, 0); + + for (int batchid = 0; batchid < output_shape[0]; batchid++) { + get_final_preds(output, + output_shape, + idxout, + idx_shape, + center_bs[batchid], + scale_bs[batchid], + preds, + batchid, + this->use_dark); + KeyPointResult result_item; + result_item.num_joints = output_shape[1]; + result_item.keypoints.clear(); + for (int i = 0; i < output_shape[1]; i++) { + result_item.keypoints.emplace_back(preds[i * 3]); + result_item.keypoints.emplace_back(preds[i * 3 + 1]); + result_item.keypoints.emplace_back(preds[i * 3 + 2]); + } + result->push_back(result_item); + } +} + +void KeyPointDetector::Predict(const std::vector imgs, + std::vector>& center_bs, + std::vector>& scale_bs, + const double threshold, + const int warmup, + const int repeats, + std::vector* result, + std::vector* times) { + auto preprocess_start = std::chrono::steady_clock::now(); + int batch_size = imgs.size(); + + // in_data_batch + std::vector in_data_all; + std::vector im_shape_all(batch_size * 2); + std::vector scale_factor_all(batch_size * 2); + + // Preprocess image + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + Preprocess(im); + im_shape_all[bs_idx * 2] = inputs_.im_shape_[0]; + im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1]; + + scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0]; + scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1]; + + // TODO: reduce cost time + in_data_all.insert( + in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end()); + } + + // Prepare input tensor + + auto input_names = predictor_->GetInputNames(); + for (const 
auto& tensor_name : input_names) { + auto in_tensor = predictor_->GetInputHandle(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Reshape({batch_size, 3, rh, rw}); + in_tensor->CopyFromCpu(in_data_all.data()); + } else if (tensor_name == "im_shape") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(im_shape_all.data()); + } else if (tensor_name == "scale_factor") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(scale_factor_all.data()); + } + } + + auto preprocess_end = std::chrono::steady_clock::now(); + std::vector output_shape, idx_shape; + // Run predictor + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetOutputHandle(output_names[0]); + output_shape = out_tensor->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + output_data_.resize(output_size); + out_tensor->CopyToCpu(output_data_.data()); + + auto idx_tensor = predictor_->GetOutputHandle(output_names[1]); + idx_shape = idx_tensor->shape(); + // Calculate output length + output_size = 1; + for (int j = 0; j < idx_shape.size(); ++j) { + output_size *= idx_shape[j]; + } + idx_data_.resize(output_size); + idx_tensor->CopyToCpu(idx_data_.data()); + } + + auto inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetOutputHandle(output_names[0]); + output_shape = out_tensor->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + if (output_size < 6) { + std::cerr << "[WARNING] No object detected." << std::endl; + } + output_data_.resize(output_size); + out_tensor->CopyToCpu(output_data_.data()); + + auto idx_tensor = predictor_->GetOutputHandle(output_names[1]); + idx_shape = idx_tensor->shape(); + // Calculate output length + output_size = 1; + for (int j = 0; j < idx_shape.size(); ++j) { + output_size *= idx_shape[j]; + } + idx_data_.resize(output_size); + idx_tensor->CopyToCpu(idx_data_.data()); + } + auto inference_end = std::chrono::steady_clock::now(); + auto postprocess_start = std::chrono::steady_clock::now(); + // Postprocessing result + Postprocess(output_data_, + output_shape, + idx_data_, + idx_shape, + result, + center_bs, + scale_bs); + auto postprocess_end = std::chrono::steady_clock::now(); + + std::chrono::duration preprocess_diff = + preprocess_end - preprocess_start; + times->push_back(double(preprocess_diff.count() * 1000)); + std::chrono::duration inference_diff = inference_end - inference_start; + times->push_back(double(inference_diff.count() / repeats * 1000)); + std::chrono::duration postprocess_diff = + postprocess_end - postprocess_start; + times->push_back(double(postprocess_diff.count() * 1000)); +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/keypoint_postprocess.cc b/deploy/cpp/src/keypoint_postprocess.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb692b0a78bcf48ac96aa45b671300b9ff2db400 --- /dev/null +++ b/deploy/cpp/src/keypoint_postprocess.cc @@ -0,0 +1,316 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "include/keypoint_postprocess.h" +#include +#define PI 3.1415926535 +#define HALF_CIRCLE_DEGREE 180 + +namespace PaddleDetection { + +cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) { + cv::Point2f direct{a.x - b.x, a.y - b.y}; + return cv::Point2f(a.x - direct.y, a.y + direct.x); +} + +std::vector get_dir(float src_point_x, + float src_point_y, + float rot_rad) { + float sn = sin(rot_rad); + float cs = cos(rot_rad); + std::vector src_result{0.0, 0.0}; + src_result[0] = src_point_x * cs - src_point_y * sn; + src_result[1] = src_point_x * sn + src_point_y * cs; + return src_result; +} + +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& preds, int p) { + double new1[3] = {pt_x, pt_y, 1.0}; + cv::Mat new_pt(3, 1, trans.type(), new1); + cv::Mat w = trans * new_pt; + preds[p * 3 + 1] = static_cast(w.at(0, 0)); + preds[p * 3 + 2] = static_cast(w.at(1, 0)); +} + +void get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + cv::Mat& trans, + int inv) { + float src_w = scale[0]; + float dst_w = static_cast(output_size[0]); + float dst_h = static_cast(output_size[1]); + float rot_rad = rot * PI / HALF_CIRCLE_DEGREE; + std::vector src_dir = get_dir(-0.5 * src_w, 0, rot_rad); + std::vector dst_dir{-0.5f * dst_w, 0.0}; + cv::Point2f srcPoint2f[3], dstPoint2f[3]; + srcPoint2f[0] = cv::Point2f(center[0], center[1]); + srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]); + srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]); + + dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5); + dstPoint2f[1] = + cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]); + dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]); + if (inv == 0) { + trans = cv::getAffineTransform(srcPoint2f, dstPoint2f); + } else { + trans = cv::getAffineTransform(dstPoint2f, srcPoint2f); + } +} + +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine) { + if (affine) { + cv::Mat trans(2, 3, CV_64FC1); + get_affine_transform(center, scale, 0, output_size, trans, 1); + for (int p = 0; p < dim[1]; ++p) { + affine_tranform( + coords[p * 2], coords[p * 2 + 1], trans, target_coords, p); + } + } else { + float heat_w = static_cast(output_size[0]); + float heat_h = static_cast(output_size[1]); + float x_scale = scale[0] / heat_w; + float y_scale = scale[1] / heat_h; + float offset_x = center[0] - scale[0] / 2.; + float offset_y = center[1] - scale[1] / 2.; + for (int i = 0; i < dim[1]; i++) { + target_coords[i * 3 + 1] = x_scale * coords[i * 2] + offset_x; + target_coords[i * 3 + 2] = y_scale * coords[i * 2 + 1] + offset_y; + } + } +} + +// only for batchsize == 1 +void get_max_preds(float* heatmap, + std::vector& dim, + std::vector& preds, + float* maxvals, + int batchid, + int joint_idx) { + int num_joints = dim[1]; + int 
width = dim[3]; + std::vector idx; + idx.resize(num_joints * 2); + + for (int j = 0; j < dim[1]; j++) { + float* index = &( + heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]); + float* end = index + dim[2] * dim[3]; + float* max_dis = std::max_element(index, end); + auto max_id = std::distance(index, max_dis); + maxvals[j] = *max_dis; + if (*max_dis > 0) { + preds[j * 2] = static_cast(max_id % width); + preds[j * 2 + 1] = static_cast(max_id / width); + } + } +} + +void dark_parse(std::vector& heatmap, + std::vector& dim, + std::vector& coords, + int px, + int py, + int index, + int ch) { + /*DARK postpocessing, Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + 1) offset = - hassian.inv() * derivative + 2) dx = (heatmap[x+1] - heatmap[x-1])/2. + 3) dxx = (dx[x+1] - dx[x-1])/2. + 4) derivative = Mat([dx, dy]) + 5) hassian = Mat([[dxx, dxy], [dxy, dyy]]) + */ + std::vector::const_iterator first1 = heatmap.begin() + index; + std::vector::const_iterator last1 = + heatmap.begin() + index + dim[2] * dim[3]; + std::vector heatmap_ch(first1, last1); + cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]); + heatmap_mat.convertTo(heatmap_mat, CV_32FC1); + cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0); + heatmap_mat = heatmap_mat.reshape(1, 1); + heatmap_ch = std::vector(heatmap_mat.reshape(1, 1)); + + float epsilon = 1e-10; + // sample heatmap to get values in around target location + float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon)); + float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon)); + float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon)); + + float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon)); + float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon)); + float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon)); + float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon)); + float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon)); + float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon)); + float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon)); + float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon)); + float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon)); + float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon)); + + // compute dx/dy and dxx/dyy with sampled values + float dx = 0.5 * (xr - xl); + float dy = 0.5 * (yu - yd); + float dxx = 0.25 * (xr2 - 2 * xy + xl2); + float dxy = 0.25 * (xryu - xryd - xlyu + xlyd); + float dyy = 0.25 * (yu2 - 2 * xy + yd2); + + // finally get offset by derivative and hassian, which combined by dx/dy and + // dxx/dyy + if (dxx * dyy - dxy * dxy != 0) { + float M[2][2] = {dxx, dxy, dxy, dyy}; + float D[2] = {dx, dy}; + cv::Mat hassian(2, 2, CV_32F, M); + cv::Mat derivative(2, 1, CV_32F, D); + cv::Mat offset = -hassian.inv() * derivative; + coords[ch * 2] += offset.at(0, 0); + coords[ch * 2 + 1] += offset.at(1, 0); + } +} + +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK) { + std::vector coords; + coords.resize(dim[1] * 2); + int heatmap_height = dim[2]; + int heatmap_width = dim[3]; + + for (int j = 0; j < dim[1]; ++j) { + int index = (batchid * dim[1] + j) * dim[2] * dim[3]; + + int idx = idxout[batchid * dim[1] + j]; + preds[j * 3] = heatmap[index + 
idx]; + coords[j * 2] = idx % heatmap_width; + coords[j * 2 + 1] = idx / heatmap_width; + + int px = int(coords[j * 2] + 0.5); + int py = int(coords[j * 2 + 1] + 0.5); + + if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 && + py < heatmap_height - 2) { + dark_parse(heatmap, dim, coords, px, py, index, j); + } else { + if (px > 0 && px < heatmap_width - 1) { + float diff_x = heatmap[index + py * dim[3] + px + 1] - + heatmap[index + py * dim[3] + px - 1]; + coords[j * 2] += diff_x > 0 ? 1 : -1 * 0.25; + } + if (py > 0 && py < heatmap_height - 1) { + float diff_y = heatmap[index + (py + 1) * dim[3] + px] - + heatmap[index + (py - 1) * dim[3] + px]; + coords[j * 2 + 1] += diff_y > 0 ? 1 : -1 * 0.25; + } + } + } + + std::vector img_size{heatmap_width, heatmap_height}; + transform_preds(coords, center, scale, img_size, dim, preds); +} + +// Run predictor +KeyPointResult PoseSmooth::smooth_process(KeyPointResult* result) { + KeyPointResult keypoint_smoothed = *result; + if (this->x_prev_hat.num_joints == -1) { + this->x_prev_hat = *result; + this->dx_prev_hat = *result; + std::fill(dx_prev_hat.keypoints.begin(), dx_prev_hat.keypoints.end(), 0.); + return keypoint_smoothed; + } else { + for (int i = 0; i < result->num_joints; i++) { + this->PointSmooth(result, &keypoint_smoothed, this->thresholds, i); + } + return keypoint_smoothed; + } +} + +void PoseSmooth::PointSmooth(KeyPointResult* result, + KeyPointResult* keypoint_smoothed, + std::vector thresholds, + int index) { + float distance = sqrt(pow((result->keypoints[index * 3 + 1] - + this->x_prev_hat.keypoints[index * 3 + 1]) / + this->width, + 2) + + pow((result->keypoints[index * 3 + 2] - + this->x_prev_hat.keypoints[index * 3 + 2]) / + this->height, + 2)); + if (distance < thresholds[index] * this->thres_mult) { + keypoint_smoothed->keypoints[index * 3 + 1] = + this->x_prev_hat.keypoints[index * 3 + 1]; + keypoint_smoothed->keypoints[index * 3 + 2] = + this->x_prev_hat.keypoints[index * 3 + 2]; + } else { + if (this->filter_type == "OneEuro") { + keypoint_smoothed->keypoints[index * 3 + 1] = + this->OneEuroFilter(result->keypoints[index * 3 + 1], + this->x_prev_hat.keypoints[index * 3 + 1], + index * 3 + 1); + keypoint_smoothed->keypoints[index * 3 + 2] = + this->OneEuroFilter(result->keypoints[index * 3 + 2], + this->x_prev_hat.keypoints[index * 3 + 2], + index * 3 + 2); + } else { + keypoint_smoothed->keypoints[index * 3 + 1] = + this->ExpSmoothing(result->keypoints[index * 3 + 1], + this->x_prev_hat.keypoints[index * 3 + 1], + index * 3 + 1); + keypoint_smoothed->keypoints[index * 3 + 2] = + this->ExpSmoothing(result->keypoints[index * 3 + 2], + this->x_prev_hat.keypoints[index * 3 + 2], + index * 3 + 2); + } + } + return; +} + +float PoseSmooth::OneEuroFilter(float x_cur, float x_pre, int loc) { + float te = 1.0; + this->alpha = this->smoothing_factor(te, this->fc_d); + float dx_cur = (x_cur - x_pre) / te; + float dx_cur_hat = + this->ExpSmoothing(dx_cur, this->dx_prev_hat.keypoints[loc]); + + float fc = this->fc_min + this->beta * abs(dx_cur_hat); + this->alpha = this->smoothing_factor(te, fc); + float x_cur_hat = this->ExpSmoothing(x_cur, x_pre); + this->x_prev_hat.keypoints[loc] = x_cur_hat; + this->dx_prev_hat.keypoints[loc] = dx_cur_hat; + return x_cur_hat; +} + +float PoseSmooth::smoothing_factor(float te, float fc) { + float r = 2 * PI * fc * te; + return r / (r + 1); +} + +float PoseSmooth::ExpSmoothing(float x_cur, float x_pre, int loc) { + return this->alpha * x_cur + (1 - this->alpha) * x_pre; +} +} // namespace 
PaddleDetection diff --git a/deploy/cpp/src/lapjv.cpp b/deploy/cpp/src/lapjv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8a7b58d5d86892f6abfeae8bbd058ad26a8d85a --- /dev/null +++ b/deploy/cpp/src/lapjv.cpp @@ -0,0 +1,405 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The code is based on: +// https://github.com/gatagat/lap/blob/master/lap/lapjv.cpp +// Ths copyright of gatagat/lap is as follows: +// MIT License + +#include +#include +#include + +#include "include/lapjv.h" + +namespace PaddleDetection { + +/** Column-reduction and reduction transfer for a dense cost matrix. + */ +int _ccrrt_dense(const int n, float *cost[], + int *free_rows, int *x, int *y, float *v) +{ + int n_free_rows; + bool *unique; + + for (int i = 0; i < n; i++) { + x[i] = -1; + v[i] = LARGE; + y[i] = 0; + } + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + const float c = cost[i][j]; + if (c < v[j]) { + v[j] = c; + y[j] = i; + } + } + } + NEW(unique, bool, n); + memset(unique, TRUE, n); + { + int j = n; + do { + j--; + const int i = y[j]; + if (x[i] < 0) { + x[i] = j; + } else { + unique[i] = FALSE; + y[j] = -1; + } + } while (j > 0); + } + n_free_rows = 0; + for (int i = 0; i < n; i++) { + if (x[i] < 0) { + free_rows[n_free_rows++] = i; + } else if (unique[i]) { + const int j = x[i]; + float min = LARGE; + for (int j2 = 0; j2 < n; j2++) { + if (j2 == (int)j) { + continue; + } + const float c = cost[i][j2] - v[j2]; + if (c < min) { + min = c; + } + } + v[j] -= min; + } + } + FREE(unique); + return n_free_rows; +} + + +/** Augmenting row reduction for a dense cost matrix. + */ +int _carr_dense( + const int n, float *cost[], + const int n_free_rows, + int *free_rows, int *x, int *y, float *v) +{ + int current = 0; + int new_free_rows = 0; + int rr_cnt = 0; + while (current < n_free_rows) { + int i0; + int j1, j2; + float v1, v2, v1_new; + bool v1_lowers; + + rr_cnt++; + const int free_i = free_rows[current++]; + j1 = 0; + v1 = cost[free_i][0] - v[0]; + j2 = -1; + v2 = LARGE; + for (int j = 1; j < n; j++) { + const float c = cost[free_i][j] - v[j]; + if (c < v2) { + if (c >= v1) { + v2 = c; + j2 = j; + } else { + v2 = v1; + v1 = c; + j2 = j1; + j1 = j; + } + } + } + i0 = y[j1]; + v1_new = v[j1] - (v2 - v1); + v1_lowers = v1_new < v[j1]; + if (rr_cnt < current * n) { + if (v1_lowers) { + v[j1] = v1_new; + } else if (i0 >= 0 && j2 >= 0) { + j1 = j2; + i0 = y[j2]; + } + if (i0 >= 0) { + if (v1_lowers) { + free_rows[--current] = i0; + } else { + free_rows[new_free_rows++] = i0; + } + } + } else { + if (i0 >= 0) { + free_rows[new_free_rows++] = i0; + } + } + x[free_i] = j1; + y[j1] = free_i; + } + return new_free_rows; +} + + +/** Find columns with minimum d[j] and put them on the SCAN list. 
+ */ +int _find_dense(const int n, int lo, float *d, int *cols, int *y) +{ + int hi = lo + 1; + float mind = d[cols[lo]]; + for (int k = hi; k < n; k++) { + int j = cols[k]; + if (d[j] <= mind) { + if (d[j] < mind) { + hi = lo; + mind = d[j]; + } + cols[k] = cols[hi]; + cols[hi++] = j; + } + } + return hi; +} + + +// Scan all columns in TODO starting from arbitrary column in SCAN +// and try to decrease d of the TODO columns using the SCAN column. +int _scan_dense(const int n, float *cost[], + int *plo, int*phi, + float *d, int *cols, int *pred, + int *y, float *v) +{ + int lo = *plo; + int hi = *phi; + float h, cred_ij; + + while (lo != hi) { + int j = cols[lo++]; + const int i = y[j]; + const float mind = d[j]; + h = cost[i][j] - v[j] - mind; + // For all columns in TODO + for (int k = hi; k < n; k++) { + j = cols[k]; + cred_ij = cost[i][j] - v[j] - h; + if (cred_ij < d[j]) { + d[j] = cred_ij; + pred[j] = i; + if (cred_ij == mind) { + if (y[j] < 0) { + return j; + } + cols[k] = cols[hi]; + cols[hi++] = j; + } + } + } + } + *plo = lo; + *phi = hi; + return -1; +} + + +/** Single iteration of modified Dijkstra shortest path algorithm as explained in the JV paper. + * + * This is a dense matrix version. + * + * \return The closest free column index. + */ +int find_path_dense( + const int n, float *cost[], + const int start_i, + int *y, float *v, + int *pred) +{ + int lo = 0, hi = 0; + int final_j = -1; + int n_ready = 0; + int *cols; + float *d; + + NEW(cols, int, n); + NEW(d, float, n); + + for (int i = 0; i < n; i++) { + cols[i] = i; + pred[i] = start_i; + d[i] = cost[start_i][i] - v[i]; + } + while (final_j == -1) { + // No columns left on the SCAN list. + if (lo == hi) { + n_ready = lo; + hi = _find_dense(n, lo, d, cols, y); + for (int k = lo; k < hi; k++) { + const int j = cols[k]; + if (y[j] < 0) { + final_j = j; + } + } + } + if (final_j == -1) { + final_j = _scan_dense( + n, cost, &lo, &hi, d, cols, pred, y, v); + } + } + + { + const float mind = d[cols[lo]]; + for (int k = 0; k < n_ready; k++) { + const int j = cols[k]; + v[j] += d[j] - mind; + } + } + + FREE(cols); + FREE(d); + + return final_j; +} + + +/** Augment for a dense cost matrix. + */ +int _ca_dense( + const int n, float *cost[], + const int n_free_rows, + int *free_rows, int *x, int *y, float *v) +{ + int *pred; + + NEW(pred, int, n); + + for (int *pfree_i = free_rows; pfree_i < free_rows + n_free_rows; pfree_i++) { + int i = -1, j; + int k = 0; + + j = find_path_dense(n, cost, *pfree_i, y, v, pred); + while (i != *pfree_i) { + i = pred[j]; + y[j] = i; + SWAP_INDICES(j, x[i]); + k++; + } + } + FREE(pred); + return 0; +} + + +/** Solve dense sparse LAP. + */ +int lapjv_internal( + const cv::Mat &cost, const bool extend_cost, const float cost_limit, + int *x, int *y ) { + int n_rows = cost.rows; + int n_cols = cost.cols; + int n; + if (n_rows == n_cols) { + n = n_rows; + } else if (!extend_cost) { + throw std::invalid_argument("Square cost array expected. 
If cost is intentionally non-square, pass extend_cost=True."); + } + + // Get extend cost + if (extend_cost || cost_limit < LARGE) { + n = n_rows + n_cols; + } + cv::Mat cost_expand(n, n, CV_32F); + float expand_value; + if (cost_limit < LARGE) { + expand_value = cost_limit / 2; + } else { + double max_v; + minMaxLoc(cost, nullptr, &max_v); + expand_value = (float)max_v + 1; + } + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + cost_expand.at(i, j) = expand_value; + if (i >= n_rows && j >= n_cols) { + cost_expand.at(i, j) = 0; + } else if (i < n_rows && j < n_cols) { + cost_expand.at(i, j) = cost.at(i, j); + } + } + } + + // Convert Mat to pointer array + float **cost_ptr; + NEW(cost_ptr, float *, n); + for (int i = 0; i < n; ++i) { + NEW(cost_ptr[i], float, n); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + cost_ptr[i][j] = cost_expand.at(i, j); + } + } + + int ret; + int *free_rows; + float *v; + int *x_c; + int *y_c; + + NEW(free_rows, int, n); + NEW(v, float, n); + NEW(x_c, int, n); + NEW(y_c, int, n); + + ret = _ccrrt_dense(n, cost_ptr, free_rows, x_c, y_c, v); + int i = 0; + while (ret > 0 && i < 2) { + ret = _carr_dense(n, cost_ptr, ret, free_rows, x_c, y_c, v); + i++; + } + if (ret > 0) { + ret = _ca_dense(n, cost_ptr, ret, free_rows, x_c, y_c, v); + } + FREE(v); + FREE(free_rows); + for (int i = 0; i < n; ++i) { + FREE(cost_ptr[i]); + } + FREE(cost_ptr); + if (ret != 0) { + if (ret == -1){ + throw "Out of memory."; + } + throw "Unknown error (lapjv_internal)"; + } + // Get output of x, y, opt + for (int i = 0; i < n; ++i) { + if (i < n_rows) { + x[i] = x_c[i]; + if (x[i] >= n_cols) { + x[i] = -1; + } + } + if (i < n_cols) { + y[i] = y_c[i]; + if (y[i] >= n_rows) { + y[i] = -1; + } + } + } + + FREE(x_c); + FREE(y_c); + return ret; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/main.cc b/deploy/cpp/src/main.cc new file mode 100644 index 0000000000000000000000000000000000000000..6912031ba7e887b5d2b8449b026bdab6263ea08b --- /dev/null +++ b/deploy/cpp/src/main.cc @@ -0,0 +1,428 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
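The lapjv_internal routine above wraps the Jonker-Volgenant linear-assignment solver used for detection/track matching: a non-square cost matrix is padded to a square one when extend_cost is set, and cost_limit roughly bounds the cost of matches that are kept. A minimal usage sketch follows; only the lapjv_internal signature is taken from the file above, while the OpenCV include, the example cost values and the gating threshold are illustrative assumptions.

// Hypothetical sketch (not part of the diff): associate two detections with three tracks.
#include <vector>
#include <opencv2/core.hpp>
#include "include/lapjv.h"

void AssociateExample() {
  // cost(i, j): cost of matching detection i to track j, e.g. 1 - IoU.
  cv::Mat cost = (cv::Mat_<float>(2, 3) << 0.2f, 0.9f, 0.6f,
                                           0.8f, 0.1f, 0.7f);
  std::vector<int> det_to_trk(cost.rows);  // x: row -> assigned column, -1 if unmatched
  std::vector<int> trk_to_det(cost.cols);  // y: column -> assigned row, -1 if unmatched
  // extend_cost pads the non-square matrix to a square one; pairs whose cost
  // exceeds roughly cost_limit end up matched to padding and come back as -1.
  PaddleDetection::lapjv_internal(cost, /*extend_cost=*/true, /*cost_limit=*/0.7f,
                                  det_to_trk.data(), trk_to_det.data());
}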
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#elif LINUX +#include +#include +#endif + +#include +#include "include/object_detector.h" + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_string(image_file, "", "Path of input image"); +DEFINE_string(image_dir, + "", + "Dir of input image, `image_file` has a higher priority."); +DEFINE_int32(batch_size, 1, "batch_size"); +DEFINE_string( + video_file, + "", + "Path of input video, `video_file` or `camera_id` has a highest priority."); +DEFINE_int32(camera_id, -1, "Device id of camera to predict"); +DEFINE_bool( + use_gpu, + false, + "Deprecated, please use `--device` to set the device you want to run."); +DEFINE_string(device, + "CPU", + "Choose the device you want to run, it can be: CPU/GPU/XPU, " + "default is CPU."); +DEFINE_double(threshold, 0.5, "Threshold of score."); +DEFINE_string(output_dir, "output", "Directory of output visualization files."); +DEFINE_string(run_mode, + "paddle", + "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)"); +DEFINE_int32(gpu_id, 0, "Device id of GPU to execute"); +DEFINE_bool(run_benchmark, + false, + "Whether to predict a image_file repeatedly for benchmark"); +DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU"); +DEFINE_int32(cpu_threads, 1, "Num of threads with CPU"); +DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI"); +DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI"); +DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI"); +DEFINE_bool(trt_calib_mode, + false, + "If the model is produced by TRT offline quantitative calibration, " + "trt_calib_mode need to set True"); + +void PrintBenchmarkLog(std::vector det_time, int img_num) { + LOG(INFO) << "----------------------- Config info -----------------------"; + LOG(INFO) << "runtime_device: " << FLAGS_device; + LOG(INFO) << "ir_optim: " + << "True"; + LOG(INFO) << "enable_memory_optim: " + << "True"; + int has_trt = FLAGS_run_mode.find("trt"); + if (has_trt >= 0) { + LOG(INFO) << "enable_tensorrt: " + << "True"; + std::string precision = FLAGS_run_mode.substr(4, 8); + LOG(INFO) << "precision: " << precision; + } else { + LOG(INFO) << "enable_tensorrt: " + << "False"; + LOG(INFO) << "precision: " + << "fp32"; + } + LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? 
"True" : "False"); + LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads; + LOG(INFO) << "----------------------- Data info -----------------------"; + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + LOG(INFO) << "input_shape: " + << "dynamic shape"; + LOG(INFO) << "----------------------- Model info -----------------------"; + FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1); + LOG(INFO) << "model_name: " + << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1); + LOG(INFO) << "----------------------- Perf info ------------------------"; + LOG(INFO) << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0); + LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num; +} + +static std::string DirName(const std::string& filepath) { + auto pos = filepath.rfind(OS_PATH_SEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static bool PathExists(const std::string& path) { +#ifdef _WIN32 + struct _stat buffer; + return (_stat(path.c_str(), &buffer) == 0); +#else + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static void MkDir(const std::string& path) { + if (PathExists(path)) return; + int ret = 0; +#ifdef _WIN32 + ret = _mkdir(path.c_str()); +#else + ret = mkdir(path.c_str(), 0755); +#endif // !_WIN32 + if (ret != 0) { + std::string path_error(path); + path_error += " mkdir failed!"; + throw std::runtime_error(path_error); + } +} + +static void MkDirs(const std::string& path) { + if (path.empty()) return; + if (PathExists(path)) return; + + MkDirs(DirName(path)); + MkDir(path); +} + +void PredictVideo(const std::string& video_path, + PaddleDetection::ObjectDetector* det, + const std::string& output_dir = "output") { + // Open video + cv::VideoCapture capture; + std::string video_out_name = "output.mp4"; + if (FLAGS_camera_id != -1) { + capture.open(FLAGS_camera_id); + } else { + capture.open(video_path.c_str()); + video_out_name = + video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1); + } + if (!capture.isOpened()) { + printf("can not open video : %s\n", video_path.c_str()); + return; + } + + // Get Video info : resolution, fps, frame count + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_frame_count = + static_cast(capture.get(CV_CAP_PROP_FRAME_COUNT)); + printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count); + + // Create VideoWriter for output + cv::VideoWriter video_out; + std::string video_out_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + video_out_path += OS_PATH_SEP; + } + video_out_path += video_out_name; + video_out.open(video_out_path.c_str(), + 0x00000021, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + printf("create video writer failed!\n"); + return; + } + + std::vector result; + std::vector bbox_num; + std::vector det_times; + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + // Capture all frames and do inference + cv::Mat frame; + int frame_id = 1; + bool is_rbox = false; + while (capture.read(frame)) { + if (frame.empty()) { + 
break; + } + std::vector imgs; + imgs.push_back(frame); + printf("detect frame: %d\n", frame_id); + det->Predict(imgs, FLAGS_threshold, 0, 1, &result, &bbox_num, &det_times); + std::vector out_result; + for (const auto& item : result) { + if (item.confidence < FLAGS_threshold || item.class_id == -1) { + continue; + } + out_result.push_back(item); + if (item.rect.size() > 6) { + is_rbox = true; + printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3], + item.rect[4], + item.rect[5], + item.rect[6], + item.rect[7]); + } else { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + } + + cv::Mat out_im = PaddleDetection::VisualizeResult( + frame, out_result, labels, colormap, is_rbox); + + video_out.write(out_im); + frame_id += 1; + } + capture.release(); + video_out.release(); +} + +void PredictImage(const std::vector all_img_paths, + const int batch_size, + const double threshold, + const bool run_benchmark, + PaddleDetection::ObjectDetector* det, + const std::string& output_dir = "output") { + std::vector det_t = {0, 0, 0}; + int steps = ceil(float(all_img_paths.size()) / batch_size); + printf("total images = %d, batch_size = %d, total steps = %d\n", + all_img_paths.size(), + batch_size, + steps); + for (int idx = 0; idx < steps; idx++) { + std::vector batch_imgs; + int left_image_cnt = all_img_paths.size() - idx * batch_size; + if (left_image_cnt > batch_size) { + left_image_cnt = batch_size; + } + for (int bs = 0; bs < left_image_cnt; bs++) { + std::string image_file_path = all_img_paths.at(idx * batch_size + bs); + cv::Mat im = cv::imread(image_file_path, 1); + batch_imgs.insert(batch_imgs.end(), im); + } + + // Store all detected result + std::vector result; + std::vector bbox_num; + std::vector det_times; + bool is_rbox = false; + if (run_benchmark) { + det->Predict( + batch_imgs, threshold, 10, 10, &result, &bbox_num, &det_times); + } else { + det->Predict(batch_imgs, threshold, 0, 1, &result, &bbox_num, &det_times); + // get labels and colormap + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + + int item_start_idx = 0; + for (int i = 0; i < left_image_cnt; i++) { + cv::Mat im = batch_imgs[i]; + std::vector im_result; + int detect_num = 0; + + for (int j = 0; j < bbox_num[i]; j++) { + PaddleDetection::ObjectResult item = result[item_start_idx + j]; + if (item.confidence < threshold || item.class_id == -1) { + continue; + } + detect_num += 1; + im_result.push_back(item); + if (item.rect.size() > 6) { + is_rbox = true; + printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3], + item.rect[4], + item.rect[5], + item.rect[6], + item.rect[7]); + } else { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + } + std::cout << all_img_paths.at(idx * batch_size + i) + << " The number of detected box: " << detect_num << std::endl; + item_start_idx = item_start_idx + bbox_num[i]; + // Visualization result + cv::Mat vis_img = PaddleDetection::VisualizeResult( + im, im_result, labels, colormap, is_rbox); + std::vector compression_params; + compression_params.push_back(CV_IMWRITE_JPEG_QUALITY); + 
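+          // JPEG quality on a 0-100 scale; 95 keeps the visualization near-lossless at a moderate file size.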
compression_params.push_back(95); + std::string output_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + output_path += OS_PATH_SEP; + } + std::string image_file_path = all_img_paths.at(idx * batch_size + i); + output_path += + image_file_path.substr(image_file_path.find_last_of('/') + 1); + cv::imwrite(output_path, vis_img, compression_params); + printf("Visualized output saved as %s\n", output_path.c_str()); + } + } + det_t[0] += det_times[0]; + det_t[1] += det_times[1]; + det_t[2] += det_times[2]; + det_times.clear(); + } + PrintBenchmarkLog(det_t, all_img_paths.size()); +} + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_image_file.empty() && FLAGS_image_dir.empty() && + FLAGS_video_file.empty())) { + std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ " + << "--image_file=/PATH/TO/INPUT/IMAGE/" << std::endl; + return -1; + } + if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" || + FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) { + std::cout + << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; + return -1; + } + transform(FLAGS_device.begin(), + FLAGS_device.end(), + FLAGS_device.begin(), + ::toupper); + if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" || + FLAGS_device == "XPU")) { + std::cout << "device should be 'CPU', 'GPU' or 'XPU'."; + return -1; + } + if (FLAGS_use_gpu) { + std::cout << "Deprecated, please use `--device` to set the device you want " + "to run."; + return -1; + } + // Load model and create a object detector + PaddleDetection::ObjectDetector det(FLAGS_model_dir, + FLAGS_device, + FLAGS_use_mkldnn, + FLAGS_cpu_threads, + FLAGS_run_mode, + FLAGS_batch_size, + FLAGS_gpu_id, + FLAGS_trt_min_shape, + FLAGS_trt_max_shape, + FLAGS_trt_opt_shape, + FLAGS_trt_calib_mode); + // Do inference on input video or image + if (!PathExists(FLAGS_output_dir)) { + MkDirs(FLAGS_output_dir); + } + if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) { + PredictVideo(FLAGS_video_file, &det, FLAGS_output_dir); + } else if (!FLAGS_image_file.empty() || !FLAGS_image_dir.empty()) { + std::vector all_img_paths; + std::vector cv_all_img_paths; + if (!FLAGS_image_file.empty()) { + all_img_paths.push_back(FLAGS_image_file); + if (FLAGS_batch_size > 1) { + std::cout << "batch_size should be 1, when set `image_file`." + << std::endl; + return -1; + } + } else { + cv::glob(FLAGS_image_dir, cv_all_img_paths); + for (const auto& img_path : cv_all_img_paths) { + all_img_paths.push_back(img_path); + } + } + PredictImage(all_img_paths, + FLAGS_batch_size, + FLAGS_threshold, + FLAGS_run_benchmark, + &det, + FLAGS_output_dir); + } + return 0; +} diff --git a/deploy/cpp/src/main_jde.cc b/deploy/cpp/src/main_jde.cc new file mode 100644 index 0000000000000000000000000000000000000000..3bba98dd4c9b6b4cd01cd44d38c564dc6c8d82dc --- /dev/null +++ b/deploy/cpp/src/main_jde.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#elif LINUX +#include +#include +#endif + +#include +#include +#include "include/jde_detector.h" +#include "include/object_detector.h" + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_int32(batch_size, 1, "batch_size"); +DEFINE_string( + video_file, + "", + "Path of input video, `video_file` or `camera_id` has a highest priority."); +DEFINE_int32(camera_id, -1, "Device id of camera to predict"); +DEFINE_bool( + use_gpu, + false, + "Deprecated, please use `--device` to set the device you want to run."); +DEFINE_string(device, + "CPU", + "Choose the device you want to run, it can be: CPU/GPU/XPU, " + "default is CPU."); +DEFINE_double(threshold, 0.5, "Threshold of score."); +DEFINE_string(output_dir, "output", "Directory of output visualization files."); +DEFINE_string(run_mode, + "paddle", + "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)"); +DEFINE_int32(gpu_id, 0, "Device id of GPU to execute"); +DEFINE_bool(run_benchmark, + false, + "Whether to predict a image_file repeatedly for benchmark"); +DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU"); +DEFINE_int32(cpu_threads, 1, "Num of threads with CPU"); +DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI"); +DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI"); +DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI"); +DEFINE_bool(trt_calib_mode, + false, + "If the model is produced by TRT offline quantitative calibration, " + "trt_calib_mode need to set True"); + +void PrintBenchmarkLog(std::vector det_time, int img_num) { + LOG(INFO) << "----------------------- Config info -----------------------"; + LOG(INFO) << "runtime_device: " << FLAGS_device; + LOG(INFO) << "ir_optim: " + << "True"; + LOG(INFO) << "enable_memory_optim: " + << "True"; + int has_trt = FLAGS_run_mode.find("trt"); + if (has_trt >= 0) { + LOG(INFO) << "enable_tensorrt: " + << "True"; + std::string precision = FLAGS_run_mode.substr(4, 8); + LOG(INFO) << "precision: " << precision; + } else { + LOG(INFO) << "enable_tensorrt: " + << "False"; + LOG(INFO) << "precision: " + << "fp32"; + } + LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? 
"True" : "False"); + LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads; + LOG(INFO) << "----------------------- Data info -----------------------"; + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + LOG(INFO) << "input_shape: " + << "dynamic shape"; + LOG(INFO) << "----------------------- Model info -----------------------"; + FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1); + LOG(INFO) << "model_name: " + << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1); + LOG(INFO) << "----------------------- Perf info ------------------------"; + LOG(INFO) << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0); + LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num; +} + +static std::string DirName(const std::string& filepath) { + auto pos = filepath.rfind(OS_PATH_SEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static bool PathExists(const std::string& path) { +#ifdef _WIN32 + struct _stat buffer; + return (_stat(path.c_str(), &buffer) == 0); +#else + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static void MkDir(const std::string& path) { + if (PathExists(path)) return; + int ret = 0; +#ifdef _WIN32 + ret = _mkdir(path.c_str()); +#else + ret = mkdir(path.c_str(), 0755); +#endif // !_WIN32 + if (ret != 0) { + std::string path_error(path); + path_error += " mkdir failed!"; + throw std::runtime_error(path_error); + } +} + +static void MkDirs(const std::string& path) { + if (path.empty()) return; + if (PathExists(path)) return; + + MkDirs(DirName(path)); + MkDir(path); +} + +void PredictVideo(const std::string& video_path, + PaddleDetection::JDEDetector* mot, + const std::string& output_dir = "output") { + // Open video + cv::VideoCapture capture; + std::string video_out_name = "output.mp4"; + if (FLAGS_camera_id != -1) { + capture.open(FLAGS_camera_id); + } else { + capture.open(video_path.c_str()); + video_out_name = + video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1); + } + if (!capture.isOpened()) { + printf("can not open video : %s\n", video_path.c_str()); + return; + } + + // Get Video info : resolution, fps, frame count + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_frame_count = + static_cast(capture.get(CV_CAP_PROP_FRAME_COUNT)); + printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count); + + // Create VideoWriter for output + cv::VideoWriter video_out; + std::string video_out_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + video_out_path += OS_PATH_SEP; + } + video_out_path += video_out_name; + video_out.open(video_out_path.c_str(), + 0x00000021, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + printf("create video writer failed!\n"); + return; + } + + PaddleDetection::MOT_Result result; + std::vector det_times(3); + double times; + // Capture all frames and do inference + cv::Mat frame; + int frame_id = 1; + while (capture.read(frame)) { + if (frame.empty()) { + break; + } + std::vector imgs; + imgs.push_back(frame); + printf("detect frame: %d\n", frame_id); + mot->Predict(imgs, 
FLAGS_threshold, 0, 1, &result, &det_times); + frame_id += 1; + times = std::accumulate(det_times.begin(), det_times.end(), 0) / frame_id; + + cv::Mat out_im = PaddleDetection::VisualizeTrackResult( + frame, result, 1000. / times, frame_id); + + video_out.write(out_im); + } + capture.release(); + video_out.release(); + PrintBenchmarkLog(det_times, frame_id); + printf("Visualized output saved as %s\n", video_out_path.c_str()); +} + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || FLAGS_video_file.empty()) { + std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ " + << "--video_file=/PATH/TO/INPUT/VIDEO/" << std::endl; + return -1; + } + if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" || + FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) { + std::cout + << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; + return -1; + } + transform(FLAGS_device.begin(), + FLAGS_device.end(), + FLAGS_device.begin(), + ::toupper); + if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" || + FLAGS_device == "XPU")) { + std::cout << "device should be 'CPU', 'GPU' or 'XPU'."; + return -1; + } + if (FLAGS_use_gpu) { + std::cout << "Deprecated, please use `--device` to set the device you want " + "to run."; + return -1; + } + + // Do inference on input video or image + PaddleDetection::JDEDetector mot(FLAGS_model_dir, + FLAGS_device, + FLAGS_use_mkldnn, + FLAGS_cpu_threads, + FLAGS_run_mode, + FLAGS_batch_size, + FLAGS_gpu_id, + FLAGS_trt_min_shape, + FLAGS_trt_max_shape, + FLAGS_trt_opt_shape, + FLAGS_trt_calib_mode); + if (!PathExists(FLAGS_output_dir)) { + MkDirs(FLAGS_output_dir); + } + PredictVideo(FLAGS_video_file, &mot, FLAGS_output_dir); + return 0; +} diff --git a/deploy/cpp/src/main_keypoint.cc b/deploy/cpp/src/main_keypoint.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab6555367f64b0f13f4707a2367754c4da61f392 --- /dev/null +++ b/deploy/cpp/src/main_keypoint.cc @@ -0,0 +1,598 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
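In the MOT loop above, det_times accumulates the per-stage costs (preprocess, inference, postprocess, in milliseconds) over all processed frames, and the value passed to VisualizeTrackResult is 1000 divided by the average total milliseconds per frame. A small self-contained sketch of that bookkeeping follows; the numbers are made up, and the 0.0 initial value in std::accumulate is used here to keep the sum in double precision.

#include <numeric>
#include <vector>

// Sketch: estimate the FPS value shown on the tracking visualization.
double EstimateFps(const std::vector<double>& stage_ms_totals, int frames) {
  // e.g. {1200.0, 3000.0, 300.0} ms accumulated over 90 frames
  double total_ms = std::accumulate(stage_ms_totals.begin(),
                                    stage_ms_totals.end(), 0.0);
  double ms_per_frame = total_ms / frames;  // 4500 / 90 = 50 ms per frame
  return 1000.0 / ms_per_frame;             // 20 FPS
}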
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#elif LINUX +#include +#endif + +#include +#include "include/keypoint_detector.h" +#include "include/object_detector.h" +#include "include/preprocess_op.h" + +DEFINE_string(model_dir, "", "Path of object detector inference model"); +DEFINE_string(model_dir_keypoint, + "", + "Path of keypoint detector inference model"); +DEFINE_string(image_file, "", "Path of input image"); +DEFINE_string(image_dir, + "", + "Dir of input image, `image_file` has a higher priority."); +DEFINE_int32(batch_size, 1, "batch_size of object detector"); +DEFINE_int32(batch_size_keypoint, 8, "batch_size of keypoint detector"); +DEFINE_string( + video_file, + "", + "Path of input video, `video_file` or `camera_id` has a highest priority."); +DEFINE_int32(camera_id, -1, "Device id of camera to predict"); +DEFINE_bool( + use_gpu, + false, + "Deprecated, please use `--device` to set the device you want to run."); +DEFINE_string(device, + "CPU", + "Choose the device you want to run, it can be: CPU/GPU/XPU, " + "default is CPU."); +DEFINE_double(threshold, 0.5, "Threshold of score."); +DEFINE_double(threshold_keypoint, 0.5, "Threshold of score."); +DEFINE_string(output_dir, "output", "Directory of output visualization files."); +DEFINE_string(run_mode, + "paddle", + "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)"); +DEFINE_int32(gpu_id, 0, "Device id of GPU to execute"); +DEFINE_bool(run_benchmark, + false, + "Whether to predict a image_file repeatedly for benchmark"); +DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU"); +DEFINE_int32(cpu_threads, 1, "Num of threads with CPU"); +DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI"); +DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI"); +DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI"); +DEFINE_bool(trt_calib_mode, + false, + "If the model is produced by TRT offline quantitative calibration, " + "trt_calib_mode need to set True"); +DEFINE_bool(use_dark, true, "Whether use dark decode in keypoint postprocess"); + +void PrintBenchmarkLog(std::vector det_time, int img_num) { + LOG(INFO) << "----------------------- Config info -----------------------"; + LOG(INFO) << "runtime_device: " << FLAGS_device; + LOG(INFO) << "ir_optim: " + << "True"; + LOG(INFO) << "enable_memory_optim: " + << "True"; + int has_trt = FLAGS_run_mode.find("trt"); + if (has_trt >= 0) { + LOG(INFO) << "enable_tensorrt: " + << "True"; + std::string precision = FLAGS_run_mode.substr(4, 8); + LOG(INFO) << "precision: " << precision; + } else { + LOG(INFO) << "enable_tensorrt: " + << "False"; + LOG(INFO) << "precision: " + << "fp32"; + } + LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? 
"True" : "False"); + LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads; + LOG(INFO) << "----------------------- Data info -----------------------"; + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + LOG(INFO) << "input_shape: " + << "dynamic shape"; + LOG(INFO) << "----------------------- Model info -----------------------"; + FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of(OS_PATH_SEP) + 1); + LOG(INFO) << "model_name: " << FLAGS_model_dir; + LOG(INFO) << "----------------------- Perf info ------------------------"; + LOG(INFO) << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0.); + img_num = std::max(1, img_num); + LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num; +} + +void PrintKptsBenchmarkLog(std::vector det_time, int img_num) { + LOG(INFO) << "----------------------- Data info -----------------------"; + LOG(INFO) << "batch_size_keypoint: " << FLAGS_batch_size_keypoint; + LOG(INFO) << "----------------------- Model info -----------------------"; + FLAGS_model_dir_keypoint.erase( + FLAGS_model_dir_keypoint.find_last_not_of(OS_PATH_SEP) + 1); + LOG(INFO) << "keypoint_model_name: " << FLAGS_model_dir_keypoint; + LOG(INFO) << "----------------------- Perf info ------------------------"; + LOG(INFO) << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0.); + img_num = std::max(1, img_num); + LOG(INFO) << "Average time cost per person:"; + LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num; +} + +static std::string DirName(const std::string& filepath) { + auto pos = filepath.rfind(OS_PATH_SEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static bool PathExists(const std::string& path) { +#ifdef _WIN32 + struct _stat buffer; + return (_stat(path.c_str(), &buffer) == 0); +#else + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static void MkDir(const std::string& path) { + if (PathExists(path)) return; + int ret = 0; +#ifdef _WIN32 + ret = _mkdir(path.c_str()); +#else + ret = mkdir(path.c_str(), 0755); +#endif // !_WIN32 + if (ret != 0) { + std::string path_error(path); + path_error += " mkdir failed!"; + throw std::runtime_error(path_error); + } +} + +static void MkDirs(const std::string& path) { + if (path.empty()) return; + if (PathExists(path)) return; + + MkDirs(DirName(path)); + MkDir(path); +} + +void PredictVideo(const std::string& video_path, + PaddleDetection::ObjectDetector* det, + PaddleDetection::KeyPointDetector* keypoint, + const std::string& output_dir = "output") { + // Open video + cv::VideoCapture capture; + std::string video_out_name = "output.mp4"; + if (FLAGS_camera_id != -1) { + capture.open(FLAGS_camera_id); + } else { + capture.open(video_path.c_str()); + video_out_name = + video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1); + } + if (!capture.isOpened()) { + printf("can not open video : %s\n", video_path.c_str()); + return; + } + + // Get Video info : resolution, fps, frame count + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int 
video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_frame_count = + static_cast(capture.get(CV_CAP_PROP_FRAME_COUNT)); + printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count); + + // Create VideoWriter for output + cv::VideoWriter video_out; + std::string video_out_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + video_out_path += OS_PATH_SEP; + } + video_out_path += video_out_name; + video_out.open(video_out_path.c_str(), + 0x00000021, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + printf("create video writer failed!\n"); + return; + } + PaddleDetection::PoseSmooth smoother = + PaddleDetection::PoseSmooth(video_width, video_height); + + std::vector result; + std::vector bbox_num; + std::vector det_times; + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + + // Store keypoint results + std::vector result_kpts; + std::vector imgs_kpts; + std::vector> center_bs; + std::vector> scale_bs; + std::vector colormap_kpts = PaddleDetection::GenerateColorMap(20); + // Capture all frames and do inference + cv::Mat frame; + int frame_id = 1; + bool is_rbox = false; + while (capture.read(frame)) { + if (frame.empty()) { + break; + } + std::vector imgs; + imgs.push_back(frame); + printf("detect frame: %d\n", frame_id); + det->Predict(imgs, FLAGS_threshold, 0, 1, &result, &bbox_num, &det_times); + std::vector out_result; + for (const auto& item : result) { + if (item.confidence < FLAGS_threshold || item.class_id == -1) { + continue; + } + out_result.push_back(item); + if (item.rect.size() > 6) { + is_rbox = true; + printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3], + item.rect[4], + item.rect[5], + item.rect[6], + item.rect[7]); + } else { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + } + + if (keypoint) { + result_kpts.clear(); + int imsize = out_result.size(); + for (int i = 0; i < imsize; i++) { + auto item = out_result[i]; + cv::Mat crop_img; + std::vector keypoint_times; + std::vector rect = { + item.rect[0], item.rect[1], item.rect[2], item.rect[3]}; + std::vector center; + std::vector scale; + if (item.class_id == 0) { + PaddleDetection::CropImg(frame, crop_img, rect, center, scale); + center_bs.emplace_back(center); + scale_bs.emplace_back(scale); + imgs_kpts.emplace_back(crop_img); + } + + if (imgs_kpts.size() == FLAGS_batch_size_keypoint || + ((i == imsize - 1) && !imgs_kpts.empty())) { + keypoint->Predict(imgs_kpts, + center_bs, + scale_bs, + FLAGS_threshold, + 0, + 1, + &result_kpts, + &keypoint_times); + imgs_kpts.clear(); + center_bs.clear(); + scale_bs.clear(); + } + } + + if (result_kpts.size() == 1) { + for (int i = 0; i < result_kpts.size(); i++) { + result_kpts[i] = smoother.smooth_process(&(result_kpts[i])); + } + } + + cv::Mat out_im = VisualizeKptsResult(frame, result_kpts, colormap_kpts); + video_out.write(out_im); + } else { + // Visualization result + cv::Mat out_im = PaddleDetection::VisualizeResult( + frame, out_result, labels, colormap, is_rbox); + video_out.write(out_im); + } + + frame_id += 1; + } + capture.release(); + video_out.release(); +} + +void PredictImage(const std::vector all_img_paths, + const int batch_size, + const double threshold, + const bool run_benchmark, + 
PaddleDetection::ObjectDetector* det, + PaddleDetection::KeyPointDetector* keypoint, + const std::string& output_dir = "output") { + std::vector det_t = {0, 0, 0}; + int steps = ceil(static_cast(all_img_paths.size()) / batch_size); + int kpts_imgs = 0; + std::vector keypoint_t = {0, 0, 0}; + printf("total images = %d, batch_size = %d, total steps = %d\n", + all_img_paths.size(), + batch_size, + steps); + for (int idx = 0; idx < steps; idx++) { + std::vector batch_imgs; + int left_image_cnt = all_img_paths.size() - idx * batch_size; + if (left_image_cnt > batch_size) { + left_image_cnt = batch_size; + } + for (int bs = 0; bs < left_image_cnt; bs++) { + std::string image_file_path = all_img_paths.at(idx * batch_size + bs); + cv::Mat im = cv::imread(image_file_path, 1); + batch_imgs.insert(batch_imgs.end(), im); + } + + // Store all detected result + std::vector result; + std::vector bbox_num; + std::vector det_times; + + // Store keypoint results + std::vector result_kpts; + std::vector imgs_kpts; + std::vector> center_bs; + std::vector> scale_bs; + std::vector colormap_kpts = PaddleDetection::GenerateColorMap(20); + + bool is_rbox = false; + if (run_benchmark) { + det->Predict( + batch_imgs, threshold, 10, 10, &result, &bbox_num, &det_times); + } else { + det->Predict(batch_imgs, threshold, 0, 1, &result, &bbox_num, &det_times); + } + // get labels and colormap + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + int item_start_idx = 0; + for (int i = 0; i < left_image_cnt; i++) { + cv::Mat im = batch_imgs[i]; + std::vector im_result; + int detect_num = 0; + for (int j = 0; j < bbox_num[i]; j++) { + PaddleDetection::ObjectResult item = result[item_start_idx + j]; + if (item.confidence < threshold || item.class_id == -1) { + continue; + } + detect_num += 1; + im_result.push_back(item); + if (item.rect.size() > 6) { + is_rbox = true; + printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3], + item.rect[4], + item.rect[5], + item.rect[6], + item.rect[7]); + } else { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + } + std::cout << all_img_paths.at(idx * batch_size + i) + << " The number of detected box: " << detect_num << std::endl; + item_start_idx = item_start_idx + bbox_num[i]; + + std::vector compression_params; + compression_params.push_back(CV_IMWRITE_JPEG_QUALITY); + compression_params.push_back(95); + std::string output_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + output_path += OS_PATH_SEP; + } + std::string image_file_path = all_img_paths.at(idx * batch_size + i); + if (keypoint) { + int imsize = im_result.size(); + for (int i = 0; i < imsize; i++) { + auto item = im_result[i]; + cv::Mat crop_img; + std::vector keypoint_times; + std::vector rect = { + item.rect[0], item.rect[1], item.rect[2], item.rect[3]}; + std::vector center; + std::vector scale; + if (item.class_id == 0) { + PaddleDetection::CropImg(im, crop_img, rect, center, scale); + center_bs.emplace_back(center); + scale_bs.emplace_back(scale); + imgs_kpts.emplace_back(crop_img); + kpts_imgs += 1; + } + + if (imgs_kpts.size() == FLAGS_batch_size_keypoint || + ((i == imsize - 1) && !imgs_kpts.empty())) { + if (run_benchmark) { + keypoint->Predict(imgs_kpts, + center_bs, + scale_bs, + 0.5, + 10, + 10, + 
&result_kpts, + &keypoint_times); + } else { + keypoint->Predict(imgs_kpts, + center_bs, + scale_bs, + 0.5, + 0, + 1, + &result_kpts, + &keypoint_times); + } + imgs_kpts.clear(); + center_bs.clear(); + scale_bs.clear(); + keypoint_t[0] += keypoint_times[0]; + keypoint_t[1] += keypoint_times[1]; + keypoint_t[2] += keypoint_times[2]; + } + } + std::string kpts_savepath = + output_path + "keypoint_" + + image_file_path.substr(image_file_path.find_last_of(OS_PATH_SEP) + 1); + cv::Mat kpts_vis_img = + VisualizeKptsResult(im, result_kpts, colormap_kpts); + cv::imwrite(kpts_savepath, kpts_vis_img, compression_params); + printf("Visualized output saved as %s\n", kpts_savepath.c_str()); + } else { + // Visualization result + cv::Mat vis_img = PaddleDetection::VisualizeResult( + im, im_result, labels, colormap, is_rbox); + std::string det_savepath = + output_path + + image_file_path.substr(image_file_path.find_last_of(OS_PATH_SEP) + 1); + cv::imwrite(det_savepath, vis_img, compression_params); + printf("Visualized output saved as %s\n", det_savepath.c_str()); + } + } + + det_t[0] += det_times[0]; + det_t[1] += det_times[1]; + det_t[2] += det_times[2]; + } + PrintBenchmarkLog(det_t, all_img_paths.size()); + if (keypoint) { + PrintKptsBenchmarkLog(keypoint_t, kpts_imgs); + } +} + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_image_file.empty() && FLAGS_image_dir.empty() && + FLAGS_video_file.empty())) { + std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ " + "(--model_dir_keypoint=/PATH/TO/INFERENCE_MODEL/)" + << "--image_file=/PATH/TO/INPUT/IMAGE/" << std::endl; + return -1; + } + if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" || + FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) { + std::cout + << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; + return -1; + } + transform(FLAGS_device.begin(), + FLAGS_device.end(), + FLAGS_device.begin(), + ::toupper); + if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" || + FLAGS_device == "XPU")) { + std::cout << "device should be 'CPU', 'GPU' or 'XPU'."; + return -1; + } + if (FLAGS_use_gpu) { + std::cout << "Deprecated, please use `--device` to set the device you want " + "to run."; + return -1; + } + // Load model and create a object detector + PaddleDetection::ObjectDetector det(FLAGS_model_dir, + FLAGS_device, + FLAGS_use_mkldnn, + FLAGS_cpu_threads, + FLAGS_run_mode, + FLAGS_batch_size, + FLAGS_gpu_id, + FLAGS_trt_min_shape, + FLAGS_trt_max_shape, + FLAGS_trt_opt_shape, + FLAGS_trt_calib_mode); + + PaddleDetection::KeyPointDetector* keypoint = nullptr; + if (!FLAGS_model_dir_keypoint.empty()) { + keypoint = new PaddleDetection::KeyPointDetector(FLAGS_model_dir_keypoint, + FLAGS_device, + FLAGS_use_mkldnn, + FLAGS_cpu_threads, + FLAGS_run_mode, + FLAGS_batch_size_keypoint, + FLAGS_gpu_id, + FLAGS_trt_min_shape, + FLAGS_trt_max_shape, + FLAGS_trt_opt_shape, + FLAGS_trt_calib_mode, + FLAGS_use_dark); + } + // Do inference on input video or image + if (!PathExists(FLAGS_output_dir)) { + MkDirs(FLAGS_output_dir); + } + if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) { + PredictVideo(FLAGS_video_file, &det, keypoint, FLAGS_output_dir); + } else if (!FLAGS_image_file.empty() || !FLAGS_image_dir.empty()) { + std::vector all_img_paths; + std::vector cv_all_img_paths; + if (!FLAGS_image_file.empty()) { + all_img_paths.push_back(FLAGS_image_file); + if (FLAGS_batch_size > 1) { + 
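+      // --image_file supplies exactly one input image, so a batch_size larger than 1 is rejected up front.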
std::cout << "batch_size should be 1, when set `image_file`." + << std::endl; + return -1; + } + } else { + cv::glob(FLAGS_image_dir, cv_all_img_paths); + for (const auto& img_path : cv_all_img_paths) { + all_img_paths.push_back(img_path); + } + } + PredictImage(all_img_paths, + FLAGS_batch_size, + FLAGS_threshold, + FLAGS_run_benchmark, + &det, + keypoint, + FLAGS_output_dir); + } + delete keypoint; + keypoint = nullptr; + return 0; +} diff --git a/deploy/cpp/src/object_detector.cc b/deploy/cpp/src/object_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4f2ceb5d7c07142e51e2b0008148e5d90b55adc --- /dev/null +++ b/deploy/cpp/src/object_detector.cc @@ -0,0 +1,592 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include + +#include "include/object_detector.h" + +namespace PaddleDetection { + +// Load Model and create model predictor +void ObjectDetector::LoadModel(const std::string &model_dir, + const int batch_size, + const std::string &run_mode) { + paddle_infer::Config config; + std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel"; + std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams"; + config.SetModel(prog_file, params_file); + if (this->device_ == "GPU") { + config.EnableUseGpu(200, this->gpu_id_); + config.SwitchIrOptim(true); + // use tensorrt + if (run_mode != "paddle") { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (run_mode == "trt_fp32") { + precision = paddle_infer::Config::Precision::kFloat32; + } else if (run_mode == "trt_fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } else if (run_mode == "trt_int8") { + precision = paddle_infer::Config::Precision::kInt8; + } else { + printf("run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or " + "'trt_int8'"); + } + // set tensorrt + config.EnableTensorRtEngine(1 << 30, batch_size, this->min_subgraph_size_, + precision, false, this->trt_calib_mode_); + + // set use dynamic shape + if (this->use_dynamic_shape_) { + // set DynamicShape for image tensor + const std::vector min_input_shape = { + batch_size, 3, this->trt_min_shape_, this->trt_min_shape_}; + const std::vector max_input_shape = { + batch_size, 3, this->trt_max_shape_, this->trt_max_shape_}; + const std::vector opt_input_shape = { + batch_size, 3, this->trt_opt_shape_, this->trt_opt_shape_}; + const std::map> map_min_input_shape = { + {"image", min_input_shape}}; + const std::map> map_max_input_shape = { + {"image", max_input_shape}}; + const std::map> map_opt_input_shape = { + {"image", opt_input_shape}}; + + config.SetTRTDynamicShapeInfo(map_min_input_shape, map_max_input_shape, + map_opt_input_shape); + std::cout << "TensorRT dynamic shape enabled" << std::endl; + } + } + + } else if (this->device_ == "XPU") { + config.EnableXpu(10 * 1024 * 1024); + } else { + config.DisableGpu(); + if (this->use_mkldnn_) { + config.EnableMKLDNN(); + // cache 10 different 
shapes for mkldnn to avoid memory leak + config.SetMkldnnCacheCapacity(10); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + config.SwitchUseFeedFetchOps(false); + config.SwitchIrOptim(true); + config.DisableGlogInfo(); + // Memory optimization + config.EnableMemoryOptim(); + predictor_ = std::move(CreatePredictor(config)); +} + +// Visualiztion MaskDetector results +cv::Mat +VisualizeResult(const cv::Mat &img, + const std::vector &results, + const std::vector &lables, + const std::vector &colormap, const bool is_rbox = false) { + cv::Mat vis_img = img.clone(); + int img_h = vis_img.rows; + int img_w = vis_img.cols; + for (int i = 0; i < results.size(); ++i) { + // Configure color and text size + std::ostringstream oss; + oss << std::setiosflags(std::ios::fixed) << std::setprecision(4); + oss << lables[results[i].class_id] << " "; + oss << results[i].confidence; + std::string text = oss.str(); + int c1 = colormap[3 * results[i].class_id + 0]; + int c2 = colormap[3 * results[i].class_id + 1]; + int c3 = colormap[3 * results[i].class_id + 2]; + cv::Scalar roi_color = cv::Scalar(c1, c2, c3); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 0.5f; + float thickness = 0.5; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + cv::Point origin; + + if (is_rbox) { + // Draw object, text, and background + for (int k = 0; k < 4; k++) { + cv::Point pt1 = cv::Point(results[i].rect[(k * 2) % 8], + results[i].rect[(k * 2 + 1) % 8]); + cv::Point pt2 = cv::Point(results[i].rect[(k * 2 + 2) % 8], + results[i].rect[(k * 2 + 3) % 8]); + cv::line(vis_img, pt1, pt2, roi_color, 2); + } + } else { + int w = results[i].rect[2] - results[i].rect[0]; + int h = results[i].rect[3] - results[i].rect[1]; + cv::Rect roi = cv::Rect(results[i].rect[0], results[i].rect[1], w, h); + // Draw roi object, text, and background + cv::rectangle(vis_img, roi, roi_color, 2); + + // Draw mask + std::vector mask_v = results[i].mask; + if (mask_v.size() > 0) { + cv::Mat mask = cv::Mat(img_h, img_w, CV_32S); + std::memcpy(mask.data, mask_v.data(), mask_v.size() * sizeof(int)); + + cv::Mat colored_img = vis_img.clone(); + + std::vector contours; + cv::Mat hierarchy; + mask.convertTo(mask, CV_8U); + cv::findContours(mask, contours, hierarchy, cv::RETR_CCOMP, + cv::CHAIN_APPROX_SIMPLE); + cv::drawContours(colored_img, contours, -1, roi_color, -1, cv::LINE_8, + hierarchy, 100); + + cv::Mat debug_roi = vis_img; + colored_img = 0.4 * colored_img + 0.6 * vis_img; + colored_img.copyTo(vis_img, mask); + } + } + + origin.x = results[i].rect[0]; + origin.y = results[i].rect[1]; + + // Configure text background + cv::Rect text_back = + cv::Rect(results[i].rect[0], results[i].rect[1] - text_size.height, + text_size.width, text_size.height); + // Draw text, and background + cv::rectangle(vis_img, text_back, roi_color, -1); + cv::putText(vis_img, text, origin, font_face, font_scale, + cv::Scalar(255, 255, 255), thickness); + } + return vis_img; +} + +void ObjectDetector::Preprocess(const cv::Mat &ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + preprocessor_.Run(&im, &inputs_); +} + +void ObjectDetector::Postprocess( + const std::vector mats, + std::vector *result, + std::vector bbox_num, std::vector output_data_, + std::vector output_mask_data_, bool is_rbox = false) { + result->clear(); + int start_idx = 0; + int total_num = 
std::accumulate(bbox_num.begin(), bbox_num.end(), 0); + int out_mask_dim = -1; + if (config_.mask_) { + out_mask_dim = output_mask_data_.size() / total_num; + } + + for (int im_id = 0; im_id < mats.size(); im_id++) { + cv::Mat raw_mat = mats[im_id]; + int rh = 1; + int rw = 1; + for (int j = start_idx; j < start_idx + bbox_num[im_id]; j++) { + if (is_rbox) { + // Class id + int class_id = static_cast(round(output_data_[0 + j * 10])); + // Confidence score + float score = output_data_[1 + j * 10]; + int x1 = (output_data_[2 + j * 10] * rw); + int y1 = (output_data_[3 + j * 10] * rh); + int x2 = (output_data_[4 + j * 10] * rw); + int y2 = (output_data_[5 + j * 10] * rh); + int x3 = (output_data_[6 + j * 10] * rw); + int y3 = (output_data_[7 + j * 10] * rh); + int x4 = (output_data_[8 + j * 10] * rw); + int y4 = (output_data_[9 + j * 10] * rh); + + PaddleDetection::ObjectResult result_item; + result_item.rect = {x1, y1, x2, y2, x3, y3, x4, y4}; + result_item.class_id = class_id; + result_item.confidence = score; + result->push_back(result_item); + } else { + // Class id + int class_id = static_cast(round(output_data_[0 + j * 6])); + // Confidence score + float score = output_data_[1 + j * 6]; + int xmin = (output_data_[2 + j * 6] * rw); + int ymin = (output_data_[3 + j * 6] * rh); + int xmax = (output_data_[4 + j * 6] * rw); + int ymax = (output_data_[5 + j * 6] * rh); + int wd = xmax - xmin; + int hd = ymax - ymin; + + PaddleDetection::ObjectResult result_item; + result_item.rect = {xmin, ymin, xmax, ymax}; + result_item.class_id = class_id; + result_item.confidence = score; + + if (config_.mask_) { + std::vector mask; + for (int k = 0; k < out_mask_dim; ++k) { + if (output_mask_data_[k + j * out_mask_dim] > -1) { + mask.push_back(output_mask_data_[k + j * out_mask_dim]); + } + } + result_item.mask = mask; + } + + result->push_back(result_item); + } + } + start_idx += bbox_num[im_id]; + } +} + +// This function is to convert output result from SOLOv2 to class ObjectResult +void ObjectDetector::SOLOv2Postprocess( + const std::vector mats, std::vector *result, + std::vector *bbox_num, std::vector out_bbox_num_data_, + std::vector out_label_data_, std::vector out_score_data_, + std::vector out_global_mask_data_, float threshold) { + + for (int im_id = 0; im_id < mats.size(); im_id++) { + cv::Mat mat = mats[im_id]; + + int valid_bbox_count = 0; + for (int bbox_id = 0; bbox_id < out_bbox_num_data_[im_id]; ++bbox_id) { + if (out_score_data_[bbox_id] >= threshold) { + ObjectResult result_item; + result_item.class_id = out_label_data_[bbox_id]; + result_item.confidence = out_score_data_[bbox_id]; + std::vector global_mask; + + for (int k = 0; k < mat.rows * mat.cols; ++k) { + global_mask.push_back(static_cast( + out_global_mask_data_[k + bbox_id * mat.rows * mat.cols])); + } + + // find minimize bounding box from mask + cv::Mat mask(mat.rows, mat.cols, CV_32SC1); + std::memcpy(mask.data, global_mask.data(), + global_mask.size() * sizeof(int)); + + cv::Mat mask_fp; + cv::Mat rowSum; + cv::Mat colSum; + std::vector sum_of_row(mat.rows); + std::vector sum_of_col(mat.cols); + + mask.convertTo(mask_fp, CV_32FC1); + cv::reduce(mask_fp, colSum, 0, CV_REDUCE_SUM, CV_32FC1); + cv::reduce(mask_fp, rowSum, 1, CV_REDUCE_SUM, CV_32FC1); + + for (int row_id = 0; row_id < mat.rows; ++row_id) { + sum_of_row[row_id] = rowSum.at(row_id, 0); + } + + for (int col_id = 0; col_id < mat.cols; ++col_id) { + sum_of_col[col_id] = colSum.at(0, col_id); + } + + auto it = std::find_if(sum_of_row.begin(), sum_of_row.end(), + 
[](int x) { return x > 0.5; }); + int y1 = std::distance(sum_of_row.begin(), it); + + auto it2 = std::find_if(sum_of_col.begin(), sum_of_col.end(), + [](int x) { return x > 0.5; }); + int x1 = std::distance(sum_of_col.begin(), it2); + + auto rit = std::find_if(sum_of_row.rbegin(), sum_of_row.rend(), + [](int x) { return x > 0.5; }); + int y2 = std::distance(rit, sum_of_row.rend()); + + auto rit2 = std::find_if(sum_of_col.rbegin(), sum_of_col.rend(), + [](int x) { return x > 0.5; }); + int x2 = std::distance(rit2, sum_of_col.rend()); + + result_item.rect = {x1, y1, x2, y2}; + result_item.mask = global_mask; + + result->push_back(result_item); + valid_bbox_count++; + } + } + bbox_num->push_back(valid_bbox_count); + } +} + +void ObjectDetector::Predict(const std::vector imgs, + const double threshold, const int warmup, + const int repeats, + std::vector *result, + std::vector *bbox_num, + std::vector *times) { + auto preprocess_start = std::chrono::steady_clock::now(); + int batch_size = imgs.size(); + + // in_data_batch + std::vector in_data_all; + std::vector im_shape_all(batch_size * 2); + std::vector scale_factor_all(batch_size * 2); + std::vector output_data_list_; + std::vector out_bbox_num_data_; + std::vector out_mask_data_; + + // these parameters are for SOLOv2 output + std::vector out_score_data_; + std::vector out_global_mask_data_; + std::vector out_label_data_; + + // in_net img for each batch + std::vector in_net_img_all(batch_size); + + // Preprocess image + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + Preprocess(im); + im_shape_all[bs_idx * 2] = inputs_.im_shape_[0]; + im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1]; + + scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0]; + scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1]; + + in_data_all.insert(in_data_all.end(), inputs_.im_data_.begin(), + inputs_.im_data_.end()); + + // collect in_net img + in_net_img_all[bs_idx] = inputs_.in_net_im_; + } + + // Pad Batch if batch size > 1 + if (batch_size > 1 && CheckDynamicInput(in_net_img_all)) { + in_data_all.clear(); + std::vector pad_img_all = PadBatch(in_net_img_all); + int rh = pad_img_all[0].rows; + int rw = pad_img_all[0].cols; + int rc = pad_img_all[0].channels(); + + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat pad_img = pad_img_all[bs_idx]; + pad_img.convertTo(pad_img, CV_32FC3); + std::vector pad_data; + pad_data.resize(rc * rh * rw); + float *base = pad_data.data(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(pad_img, + cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i); + } + in_data_all.insert(in_data_all.end(), pad_data.begin(), pad_data.end()); + } + // update in_net_shape + inputs_.in_net_shape_ = {static_cast(rh), static_cast(rw)}; + } + + auto preprocess_end = std::chrono::steady_clock::now(); + // Prepare input tensor + auto input_names = predictor_->GetInputNames(); + for (const auto &tensor_name : input_names) { + auto in_tensor = predictor_->GetInputHandle(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Reshape({batch_size, 3, rh, rw}); + in_tensor->CopyFromCpu(in_data_all.data()); + } else if (tensor_name == "im_shape") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(im_shape_all.data()); + } else if (tensor_name == "scale_factor") { + in_tensor->Reshape({batch_size, 2}); + in_tensor->CopyFromCpu(scale_factor_all.data()); + } + } + + // Run predictor + std::vector> 
out_tensor_list; + std::vector> output_shape_list; + bool is_rbox = false; + int reg_max = 7; + int num_class = 80; + + auto inference_start = std::chrono::steady_clock::now(); + if (config_.arch_ == "SOLOv2") { + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = predictor_->GetOutputHandle(output_names[j]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), + 1, std::multiplies()); + if (j == 0) { + out_bbox_num_data_.resize(out_num); + output_tensor->CopyToCpu(out_bbox_num_data_.data()); + } else if (j == 1) { + out_label_data_.resize(out_num); + output_tensor->CopyToCpu(out_label_data_.data()); + } else if (j == 2) { + out_score_data_.resize(out_num); + output_tensor->CopyToCpu(out_score_data_.data()); + } else if (config_.mask_ && (j == 3)) { + out_global_mask_data_.resize(out_num); + output_tensor->CopyToCpu(out_global_mask_data_.data()); + } + } + } + + inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + // Get output tensor + out_tensor_list.clear(); + output_shape_list.clear(); + auto output_names = predictor_->GetOutputNames(); + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = predictor_->GetOutputHandle(output_names[j]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), + 1, std::multiplies()); + output_shape_list.push_back(output_shape); + if (j == 0) { + out_bbox_num_data_.resize(out_num); + output_tensor->CopyToCpu(out_bbox_num_data_.data()); + } else if (j == 1) { + out_label_data_.resize(out_num); + output_tensor->CopyToCpu(out_label_data_.data()); + } else if (j == 2) { + out_score_data_.resize(out_num); + output_tensor->CopyToCpu(out_score_data_.data()); + } else if (config_.mask_ && (j == 3)) { + out_global_mask_data_.resize(out_num); + output_tensor->CopyToCpu(out_global_mask_data_.data()); + } + } + } + } else { + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = predictor_->GetOutputHandle(output_names[j]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), + 1, std::multiplies()); + if (config_.mask_ && (j == 2)) { + out_mask_data_.resize(out_num); + output_tensor->CopyToCpu(out_mask_data_.data()); + } else if (output_tensor->type() == paddle_infer::DataType::INT32) { + out_bbox_num_data_.resize(out_num); + output_tensor->CopyToCpu(out_bbox_num_data_.data()); + } else { + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + out_tensor_list.push_back(out_data); + } + } + } + + inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + // Get output tensor + out_tensor_list.clear(); + output_shape_list.clear(); + auto output_names = predictor_->GetOutputNames(); + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = predictor_->GetOutputHandle(output_names[j]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), + 1, std::multiplies()); + 
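+        // out_num is the element count of this output: the product of its shape dims
+        // (e.g. a {100, 6} bbox tensor holds 600 floats).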
output_shape_list.push_back(output_shape); + if (config_.mask_ && (j == 2)) { + out_mask_data_.resize(out_num); + output_tensor->CopyToCpu(out_mask_data_.data()); + } else if (output_tensor->type() == paddle_infer::DataType::INT32) { + out_bbox_num_data_.resize(out_num); + output_tensor->CopyToCpu(out_bbox_num_data_.data()); + } else { + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + out_tensor_list.push_back(out_data); + } + } + } + } + + auto inference_end = std::chrono::steady_clock::now(); + auto postprocess_start = std::chrono::steady_clock::now(); + // Postprocessing result + result->clear(); + bbox_num->clear(); + if (config_.arch_ == "PicoDet") { + for (int i = 0; i < out_tensor_list.size(); i++) { + if (i == 0) { + num_class = output_shape_list[i][2]; + } + if (i == config_.fpn_stride_.size()) { + reg_max = output_shape_list[i][2] / 4 - 1; + } + float *buffer = new float[out_tensor_list[i].size()]; + memcpy(buffer, &out_tensor_list[i][0], + out_tensor_list[i].size() * sizeof(float)); + output_data_list_.push_back(buffer); + } + PaddleDetection::PicoDetPostProcess( + result, output_data_list_, config_.fpn_stride_, inputs_.im_shape_, + inputs_.scale_factor_, config_.nms_info_["score_threshold"].as(), + config_.nms_info_["nms_threshold"].as(), num_class, reg_max); + bbox_num->push_back(result->size()); + } else if (config_.arch_ == "SOLOv2") { + SOLOv2Postprocess(imgs, result, bbox_num, out_bbox_num_data_, + out_label_data_, out_score_data_, out_global_mask_data_, + threshold); + } else { + is_rbox = output_shape_list[0][output_shape_list[0].size() - 1] % 10 == 0; + Postprocess(imgs, result, out_bbox_num_data_, out_tensor_list[0], + out_mask_data_, is_rbox); + for (int k = 0; k < out_bbox_num_data_.size(); k++) { + int tmp = out_bbox_num_data_[k]; + bbox_num->push_back(tmp); + } + } + + auto postprocess_end = std::chrono::steady_clock::now(); + + std::chrono::duration preprocess_diff = + preprocess_end - preprocess_start; + times->push_back(static_cast(preprocess_diff.count() * 1000)); + std::chrono::duration inference_diff = inference_end - inference_start; + times->push_back( + static_cast(inference_diff.count() / repeats * 1000)); + std::chrono::duration postprocess_diff = + postprocess_end - postprocess_start; + times->push_back(static_cast(postprocess_diff.count() * 1000)); +} + +std::vector GenerateColorMap(int num_class) { + auto colormap = std::vector(3 * num_class, 0); + for (int i = 0; i < num_class; ++i) { + int j = 0; + int lab = i; + while (lab) { + colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j)); + colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)); + colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)); + ++j; + lab >>= 3; + } + } + return colormap; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/picodet_postprocess.cc b/deploy/cpp/src/picodet_postprocess.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f40a2658ac04d98e73646996e12f2dd4e016006 --- /dev/null +++ b/deploy/cpp/src/picodet_postprocess.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// The code is based on: +// https://github.com/RangiLyu/nanodet/blob/main/demo_mnn/nanodet_mnn.cpp + +#include "include/picodet_postprocess.h" + +namespace PaddleDetection { + +float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} + +// PicoDet decode +PaddleDetection::ObjectResult +disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y, + int stride, std::vector im_shape, int reg_max) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[reg_max + 1]; + activation_function_softmax(dfl_det + i * (reg_max + 1), dis_after_sm, + reg_max + 1); + for (int j = 0; j < reg_max + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + int xmin = (int)(std::max)(ct_x - dis_pred[0], .0f); + int ymin = (int)(std::max)(ct_y - dis_pred[1], .0f); + int xmax = (int)(std::min)(ct_x + dis_pred[2], (float)im_shape[0]); + int ymax = (int)(std::min)(ct_y + dis_pred[3], (float)im_shape[1]); + + PaddleDetection::ObjectResult result_item; + result_item.rect = {xmin, ymin, xmax, ymax}; + result_item.class_id = label; + result_item.confidence = score; + + return result_item; +} + +void PicoDetPostProcess(std::vector *results, + std::vector outs, + std::vector fpn_stride, + std::vector im_shape, + std::vector scale_factor, float score_threshold, + float nms_threshold, int num_class, int reg_max) { + std::vector> bbox_results; + bbox_results.resize(num_class); + int in_h = im_shape[0], in_w = im_shape[1]; + for (int i = 0; i < fpn_stride.size(); ++i) { + int feature_h = std::ceil((float)in_h / fpn_stride[i]); + int feature_w = std::ceil((float)in_w / fpn_stride[i]); + for (int idx = 0; idx < feature_h * feature_w; idx++) { + const float *scores = outs[i] + (idx * num_class); + + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + for (int label = 0; label < num_class; label++) { + if (scores[label] > score) { + score = scores[label]; + cur_label = label; + } + } + if (score > score_threshold) { + const float *bbox_pred = + outs[i + fpn_stride.size()] + (idx * 4 * (reg_max + 1)); + bbox_results[cur_label].push_back( + disPred2Bbox(bbox_pred, cur_label, score, col, row, fpn_stride[i], + im_shape, reg_max)); + } + } + } + for (int i = 0; i < (int)bbox_results.size(); i++) { + PaddleDetection::nms(bbox_results[i], nms_threshold); + + for (auto box : bbox_results[i]) { + box.rect[0] = box.rect[0] / scale_factor[1]; + box.rect[2] = box.rect[2] / scale_factor[1]; + box.rect[1] = box.rect[1] / 
scale_factor[0]; + box.rect[3] = box.rect[3] / scale_factor[0]; + results->push_back(box); + } + } +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/preprocess_op.cc b/deploy/cpp/src/preprocess_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1cbfe4f15a49930ac9759e8d1b71232f167ad04 --- /dev/null +++ b/deploy/cpp/src/preprocess_op.cc @@ -0,0 +1,355 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "include/preprocess_op.h" + +namespace PaddleDetection { + +void InitInfo::Run(cv::Mat* im, ImageBlob* data) { + data->im_shape_ = {static_cast(im->rows), + static_cast(im->cols)}; + data->scale_factor_ = {1., 1.}; + data->in_net_shape_ = {static_cast(im->rows), + static_cast(im->cols)}; +} + +void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) { + double e = 1.0; + if (is_scale_) { + e /= 255.0; + } + (*im).convertTo(*im, CV_32FC3, e); + if (norm_type_ == "mean_std"){ + for (int h = 0; h < im->rows; h++) { + for (int w = 0; w < im->cols; w++) { + im->at(h, w)[0] = + (im->at(h, w)[0] - mean_[0]) / scale_[0]; + im->at(h, w)[1] = + (im->at(h, w)[1] - mean_[1]) / scale_[1]; + im->at(h, w)[2] = + (im->at(h, w)[2] - mean_[2]) / scale_[2]; + } + } + } +} + +void Permute::Run(cv::Mat* im, ImageBlob* data) { + (*im).convertTo(*im, CV_32FC3); + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + (data->im_data_).resize(rc * rh * rw); + float* base = (data->im_data_).data(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i); + } +} + +void Resize::Run(cv::Mat* im, ImageBlob* data) { + auto resize_scale = GenerateScale(*im); + cv::resize( + *im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_); + + data->in_net_shape_ = {static_cast(im->rows), + static_cast(im->cols)}; + data->im_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; + data->scale_factor_ = { + resize_scale.second, resize_scale.first, + }; +} + +std::pair Resize::GenerateScale(const cv::Mat& im) { + std::pair resize_scale; + int origin_w = im.cols; + int origin_h = im.rows; + + if (keep_ratio_) { + int im_size_max = std::max(origin_w, origin_h); + int im_size_min = std::min(origin_w, origin_h); + int target_size_max = + *std::max_element(target_size_.begin(), target_size_.end()); + int target_size_min = + *std::min_element(target_size_.begin(), target_size_.end()); + float scale_min = + static_cast(target_size_min) / static_cast(im_size_min); + float scale_max = + static_cast(target_size_max) / static_cast(im_size_max); + float scale_ratio = std::min(scale_min, scale_max); + resize_scale = {scale_ratio, scale_ratio}; + } else { + resize_scale.first = + static_cast(target_size_[1]) / static_cast(origin_w); + resize_scale.second = + static_cast(target_size_[0]) / static_cast(origin_h); + } + return resize_scale; +} + +void LetterBoxResize::Run(cv::Mat* im, ImageBlob* data) { + 
float resize_scale = GenerateScale(*im); + int new_shape_w = std::round(im->cols * resize_scale); + int new_shape_h = std::round(im->rows * resize_scale); + data->im_shape_ = {static_cast(new_shape_h), + static_cast(new_shape_w)}; + float padw = (target_size_[1] - new_shape_w) / 2.; + float padh = (target_size_[0] - new_shape_h) / 2.; + + int top = std::round(padh - 0.1); + int bottom = std::round(padh + 0.1); + int left = std::round(padw - 0.1); + int right = std::round(padw + 0.1); + + cv::resize( + *im, *im, cv::Size(new_shape_w, new_shape_h), 0, 0, cv::INTER_AREA); + + data->in_net_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; + cv::copyMakeBorder(*im, + *im, + top, + bottom, + left, + right, + cv::BORDER_CONSTANT, + cv::Scalar(127.5)); + + data->in_net_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; + + data->scale_factor_ = { + resize_scale, resize_scale, + }; +} + +float LetterBoxResize::GenerateScale(const cv::Mat& im) { + int origin_w = im.cols; + int origin_h = im.rows; + + int target_h = target_size_[0]; + int target_w = target_size_[1]; + + float ratio_h = static_cast(target_h) / static_cast(origin_h); + float ratio_w = static_cast(target_w) / static_cast(origin_w); + float resize_scale = std::min(ratio_h, ratio_w); + return resize_scale; +} + +void PadStride::Run(cv::Mat* im, ImageBlob* data) { + if (stride_ <= 0) { + data->in_net_im_ = im->clone(); + return; + } + int rc = im->channels(); + int rh = im->rows; + int rw = im->cols; + int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_; + int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_; + cv::copyMakeBorder( + *im, *im, 0, nh - rh, 0, nw - rw, cv::BORDER_CONSTANT, cv::Scalar(0)); + data->in_net_im_ = im->clone(); + data->in_net_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; +} + +void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) { + cv::resize(*im, *im, cv::Size(trainsize_[0], trainsize_[1]), 0, 0, interp_); + // todo: Simd::ResizeBilinear(); + data->in_net_shape_ = { + static_cast(trainsize_[1]), static_cast(trainsize_[0]), + }; +} + +void GetAffineTrans(const cv::Point2f center, + const cv::Point2f input_size, + const cv::Point2f output_size, + cv::Mat* trans) { + cv::Point2f srcTri[3]; + cv::Point2f dstTri[3]; + float src_w = input_size.x; + float dst_w = output_size.x; + float dst_h = output_size.y; + + cv::Point2f src_dir(0, -0.5 * src_w); + cv::Point2f dst_dir(0, -0.5 * dst_w); + + srcTri[0] = center; + srcTri[1] = center + src_dir; + cv::Point2f src_d = srcTri[0] - srcTri[1]; + srcTri[2] = srcTri[1] + cv::Point2f(-src_d.y, src_d.x); + + dstTri[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5); + dstTri[1] = cv::Point2f(dst_w * 0.5, dst_h * 0.5) + dst_dir; + cv::Point2f dst_d = dstTri[0] - dstTri[1]; + dstTri[2] = dstTri[1] + cv::Point2f(-dst_d.y, dst_d.x); + + *trans = cv::getAffineTransform(srcTri, dstTri); +} + +void WarpAffine::Run(cv::Mat* im, ImageBlob* data) { + cv::cvtColor(*im, *im, cv::COLOR_RGB2BGR); + cv::Mat trans(2, 3, CV_32FC1); + cv::Point2f center; + cv::Point2f input_size; + int h = im->rows; + int w = im->cols; + if (keep_res_) { + input_h_ = (h | pad_) + 1; + input_w_ = (w + pad_) + 1; + input_size = cv::Point2f(input_w_, input_h_); + center = cv::Point2f(w / 2, h / 2); + } else { + float s = std::max(h, w) * 1.0; + input_size = cv::Point2f(s, s); + center = cv::Point2f(w / 2., h / 2.); + } + cv::Point2f output_size(input_w_, input_h_); + + GetAffineTrans(center, input_size, output_size, &trans); + cv::warpAffine(*im, 
*im, trans, cv::Size(input_w_, input_h_)); + data->in_net_shape_ = { + static_cast(input_h_), static_cast(input_w_), + }; +} + +void Pad::Run(cv::Mat* im, ImageBlob* data) { + int h = size_[0]; + int w = size_[1]; + int rh = im->rows; + int rw = im->cols; + if (h == rh && w == rw){ + data->in_net_im_ = im->clone(); + return; + } + cv::copyMakeBorder( + *im, *im, 0, h - rh, 0, w - rw, cv::BORDER_CONSTANT, cv::Scalar(114)); + data->in_net_im_ = im->clone(); + data->in_net_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; +} + +// Preprocessor op running order +const std::vector Preprocessor::RUN_ORDER = {"InitInfo", + "TopDownEvalAffine", + "Resize", + "LetterBoxResize", + "WarpAffine", + "NormalizeImage", + "PadStride", + "Pad", + "Permute"}; + +void Preprocessor::Run(cv::Mat* im, ImageBlob* data) { + for (const auto& name : RUN_ORDER) { + if (ops_.find(name) != ops_.end()) { + ops_[name]->Run(im, data); + } + } +} + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio) { + int crop_x1 = std::max(0, area[0]); + int crop_y1 = std::max(0, area[1]); + int crop_x2 = std::min(img.cols - 1, area[2]); + int crop_y2 = std::min(img.rows - 1, area[3]); + int center_x = (crop_x1 + crop_x2) / 2.; + int center_y = (crop_y1 + crop_y2) / 2.; + int half_h = (crop_y2 - crop_y1) / 2.; + int half_w = (crop_x2 - crop_x1) / 2.; + + // adjust h or w to keep image ratio, expand the shorter edge + if (half_h * 3 > half_w * 4) { + half_w = static_cast(half_h * 0.75); + } else { + half_h = static_cast(half_w * 4 / 3); + } + + crop_x1 = + std::max(0, center_x - static_cast(half_w * (1 + expandratio))); + crop_y1 = + std::max(0, center_y - static_cast(half_h * (1 + expandratio))); + crop_x2 = std::min(img.cols - 1, + static_cast(center_x + half_w * (1 + expandratio))); + crop_y2 = std::min(img.rows - 1, + static_cast(center_y + half_h * (1 + expandratio))); + crop_img = + img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); + + center.clear(); + center.emplace_back((crop_x1 + crop_x2) / 2); + center.emplace_back((crop_y1 + crop_y2) / 2); + + scale.clear(); + scale.emplace_back((crop_x2 - crop_x1)); + scale.emplace_back((crop_y2 - crop_y1)); +} + +bool CheckDynamicInput(const std::vector& imgs) { + if (imgs.size() == 1) return false; + + int h = imgs.at(0).rows; + int w = imgs.at(0).cols; + for (int i = 1; i < imgs.size(); ++i) { + int hi = imgs.at(i).rows; + int wi = imgs.at(i).cols; + if (hi != h || wi != w) { + return true; + } + } + return false; +} + +std::vector PadBatch(const std::vector& imgs) { + std::vector out_imgs; + int max_h = 0; + int max_w = 0; + int rh = 0; + int rw = 0; + // find max_h and max_w in batch + for (int i = 0; i < imgs.size(); ++i) { + rh = imgs.at(i).rows; + rw = imgs.at(i).cols; + if (rh > max_h) max_h = rh; + if (rw > max_w) max_w = rw; + } + for (int i = 0; i < imgs.size(); ++i) { + cv::Mat im = imgs.at(i); + cv::copyMakeBorder(im, + im, + 0, + max_h - imgs.at(i).rows, + 0, + max_w - imgs.at(i).cols, + cv::BORDER_CONSTANT, + cv::Scalar(0)); + out_imgs.push_back(im); + } + return out_imgs; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/tracker.cc b/deploy/cpp/src/tracker.cc new file mode 100644 index 0000000000000000000000000000000000000000..f40cb0dd699a4687f4f77714e4bc5ae5416141f6 --- /dev/null +++ b/deploy/cpp/src/tracker.cc @@ -0,0 +1,333 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The code is based on: +// https://github.com/CnybTseng/JDE/blob/master/platforms/common/jdetracker.cpp +// Ths copyright of CnybTseng/JDE is as follows: +// MIT License + +#include +#include +#include +#include + +#include "include/lapjv.h" +#include "include/tracker.h" + +#define mat2vec4f(m) cv::Vec4f(*m.ptr(0,0), *m.ptr(0,1), *m.ptr(0,2), *m.ptr(0,3)) + +namespace PaddleDetection { + +static std::map chi2inv95 = { + {1, 3.841459f}, + {2, 5.991465f}, + {3, 7.814728f}, + {4, 9.487729f}, + {5, 11.070498f}, + {6, 12.591587f}, + {7, 14.067140f}, + {8, 15.507313f}, + {9, 16.918978f} +}; + +JDETracker *JDETracker::me = new JDETracker; + +JDETracker *JDETracker::instance(void) +{ + return me; +} + +JDETracker::JDETracker(void) : timestamp(0), max_lost_time(30), lambda(0.98f), det_thresh(0.3f) +{ +} + +bool JDETracker::update(const cv::Mat &dets, const cv::Mat &emb, std::vector &tracks) +{ + ++timestamp; + TrajectoryPool candidates(dets.rows); + for (int i = 0; i < dets.rows; ++i) + { + float score = *dets.ptr(i, 1); + const cv::Mat <rb_ = dets(cv::Rect(2, i, 4, 1)); + cv::Vec4f ltrb = mat2vec4f(ltrb_); + const cv::Mat &embedding = emb(cv::Rect(0, i, emb.cols, 1)); + candidates[i] = Trajectory(ltrb, score, embedding); + } + + + TrajectoryPtrPool tracked_trajectories; + TrajectoryPtrPool unconfirmed_trajectories; + for (size_t i = 0; i < this->tracked_trajectories.size(); ++i) + { + if (this->tracked_trajectories[i].is_activated) + tracked_trajectories.push_back(&this->tracked_trajectories[i]); + else + unconfirmed_trajectories.push_back(&this->tracked_trajectories[i]); + } + + + TrajectoryPtrPool trajectory_pool = tracked_trajectories + this->lost_trajectories; + + for (size_t i = 0; i < trajectory_pool.size(); ++i) + trajectory_pool[i]->predict(); + + Match matches; + std::vector mismatch_row; + std::vector mismatch_col; + + cv::Mat cost = motion_distance(trajectory_pool, candidates); + linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col); + + MatchIterator miter; + TrajectoryPtrPool activated_trajectories; + TrajectoryPtrPool retrieved_trajectories; + + + for (miter = matches.begin(); miter != matches.end(); miter++) + { + Trajectory *pt = trajectory_pool[miter->first]; + Trajectory &ct = candidates[miter->second]; + if (pt->state == Tracked) + { + pt->update(ct, timestamp); + activated_trajectories.push_back(pt); + } + else + { + pt->reactivate(ct, timestamp); + retrieved_trajectories.push_back(pt); + } + } + + TrajectoryPtrPool next_candidates(mismatch_col.size()); + for (size_t i = 0; i < mismatch_col.size(); ++i) + next_candidates[i] = &candidates[mismatch_col[i]]; + + TrajectoryPtrPool next_trajectory_pool; + for (size_t i = 0; i < mismatch_row.size(); ++i) + { + int j = mismatch_row[i]; + if (trajectory_pool[j]->state == Tracked) + next_trajectory_pool.push_back(trajectory_pool[j]); + } + + cost = iou_distance(next_trajectory_pool, next_candidates); + linear_assignment(cost, 0.5f, matches, mismatch_row, 
mismatch_col); + + for (miter = matches.begin(); miter != matches.end(); miter++) + { + Trajectory *pt = next_trajectory_pool[miter->first]; + Trajectory *ct = next_candidates[miter->second]; + if (pt->state == Tracked) + { + pt->update(*ct, timestamp); + activated_trajectories.push_back(pt); + } + else + { + pt->reactivate(*ct, timestamp); + retrieved_trajectories.push_back(pt); + } + } + + TrajectoryPtrPool lost_trajectories; + for (size_t i = 0; i < mismatch_row.size(); ++i) + { + Trajectory *pt = next_trajectory_pool[mismatch_row[i]]; + if (pt->state != Lost) + { + pt->mark_lost(); + lost_trajectories.push_back(pt); + } + } + + TrajectoryPtrPool nnext_candidates(mismatch_col.size()); + for (size_t i = 0; i < mismatch_col.size(); ++i) + nnext_candidates[i] = next_candidates[mismatch_col[i]]; + cost = iou_distance(unconfirmed_trajectories, nnext_candidates); + linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col); + + for (miter = matches.begin(); miter != matches.end(); miter++) + { + unconfirmed_trajectories[miter->first]->update(*nnext_candidates[miter->second], timestamp); + activated_trajectories.push_back(unconfirmed_trajectories[miter->first]); + } + + TrajectoryPtrPool removed_trajectories; + + for (size_t i = 0; i < mismatch_row.size(); ++i) + { + unconfirmed_trajectories[mismatch_row[i]]->mark_removed(); + removed_trajectories.push_back(unconfirmed_trajectories[mismatch_row[i]]); + } + + for (size_t i = 0; i < mismatch_col.size(); ++i) + { + if (nnext_candidates[mismatch_col[i]]->score < det_thresh) continue; + nnext_candidates[mismatch_col[i]]->activate(timestamp); + activated_trajectories.push_back(nnext_candidates[mismatch_col[i]]); + } + + for (size_t i = 0; i < this->lost_trajectories.size(); ++i) + { + Trajectory < = this->lost_trajectories[i]; + if (timestamp - lt.timestamp > max_lost_time) + { + lt.mark_removed(); + removed_trajectories.push_back(<); + } + } + + TrajectoryPoolIterator piter; + for (piter = this->tracked_trajectories.begin(); piter != this->tracked_trajectories.end(); ) + { + if (piter->state != Tracked) + piter = this->tracked_trajectories.erase(piter); + else + ++piter; + } + + this->tracked_trajectories += activated_trajectories; + this->tracked_trajectories += retrieved_trajectories; + + this->lost_trajectories -= this->tracked_trajectories; + this->lost_trajectories += lost_trajectories; + this->lost_trajectories -= this->removed_trajectories; + this->removed_trajectories += removed_trajectories; + remove_duplicate_trajectory(this->tracked_trajectories, this->lost_trajectories); + + tracks.clear(); + for (size_t i = 0; i < this->tracked_trajectories.size(); ++i) + { + if (this->tracked_trajectories[i].is_activated) + { + Track track = { + .id = this->tracked_trajectories[i].id, + .score = this->tracked_trajectories[i].score, + .ltrb = this->tracked_trajectories[i].ltrb}; + tracks.push_back(track); + } + } + return 0; +} + + +cv::Mat JDETracker::motion_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b) +{ + if (0 == a.size() || 0 == b.size()) + return cv::Mat(a.size(), b.size(), CV_32F); + + cv::Mat edists = embedding_distance(a, b); + cv::Mat mdists = mahalanobis_distance(a, b); + cv::Mat fdists = lambda * edists + (1 - lambda) * mdists; + + const float gate_thresh = chi2inv95[4]; + for (int i = 0; i < fdists.rows; ++i) + { + for (int j = 0; j < fdists.cols; ++j) + { + if (*mdists.ptr(i, j) > gate_thresh) + *fdists.ptr(i, j) = FLT_MAX; + } + } + + return fdists; +} + +void JDETracker::linear_assignment(const cv::Mat &cost, 
float cost_limit, Match &matches, + std::vector &mismatch_row, std::vector &mismatch_col) +{ + matches.clear(); + mismatch_row.clear(); + mismatch_col.clear(); + if (cost.empty()) + { + for (int i = 0; i < cost.rows; ++i) + mismatch_row.push_back(i); + for (int i = 0; i < cost.cols; ++i) + mismatch_col.push_back(i); + return; + } + + float opt = 0; + cv::Mat x(cost.rows, 1, CV_32S); + cv::Mat y(cost.cols, 1, CV_32S); + + lapjv_internal(cost, true, cost_limit, + (int *)x.data, (int *)y.data); + + for (int i = 0; i < x.rows; ++i) + { + int j = *x.ptr(i); + if (j >= 0) + matches.insert({i, j}); + else + mismatch_row.push_back(i); + } + + for (int i = 0; i < y.rows; ++i) + { + int j = *y.ptr(i); + if (j < 0) + mismatch_col.push_back(i); + } + + return; +} + +void JDETracker::remove_duplicate_trajectory(TrajectoryPool &a, TrajectoryPool &b, float iou_thresh) +{ + if (0 == a.size() || 0 == b.size()) + return; + + cv::Mat dist = iou_distance(a, b); + cv::Mat mask = dist < iou_thresh; + std::vector idx; + cv::findNonZero(mask, idx); + + std::vector da; + std::vector db; + for (size_t i = 0; i < idx.size(); ++i) + { + int ta = a[idx[i].y].timestamp - a[idx[i].y].starttime; + int tb = b[idx[i].x].timestamp - b[idx[i].x].starttime; + if (ta > tb) + db.push_back(idx[i].x); + else + da.push_back(idx[i].y); + } + + int id = 0; + TrajectoryPoolIterator piter; + for (piter = a.begin(); piter != a.end(); ) + { + std::vector::iterator iter = find(da.begin(), da.end(), id++); + if (iter != da.end()) + piter = a.erase(piter); + else + ++piter; + } + + id = 0; + for (piter = b.begin(); piter != b.end(); ) + { + std::vector::iterator iter = find(db.begin(), db.end(), id++); + if (iter != db.end()) + piter = b.erase(piter); + else + ++piter; + } +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/trajectory.cc b/deploy/cpp/src/trajectory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e69b350fad4fcf43b2ef9cf350c97ce5f8cd884 --- /dev/null +++ b/deploy/cpp/src/trajectory.cc @@ -0,0 +1,584 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
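+// NOTE: trajectory.cc pairs with tracker.cc above: TKalmanFilter keeps an 8-state
+// (position + velocity) Kalman filter over the (x, y, aspect, height) measurement,
+// Trajectory manages the per-target lifecycle (activate / update / reactivate and the
+// smoothed appearance embedding), and the pool operators plus the embedding /
+// Mahalanobis / IoU distance helpers below supply the set arithmetic and cost
+// matrices consumed by JDETracker::update.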
+ +// The code is based on: +// https://github.com/CnybTseng/JDE/blob/master/platforms/common/trajectory.cpp +// Ths copyright of CnybTseng/JDE is as follows: +// MIT License + +#include + +#include "include/trajectory.h" + +namespace PaddleDetection { + +void TKalmanFilter::init(const cv::Mat &measurement) +{ + measurement.copyTo(statePost(cv::Rect(0, 0, 1, 4))); + statePost(cv::Rect(0, 4, 1, 4)).setTo(0); + statePost.copyTo(statePre); + + float varpos = 2 * std_weight_position * (*measurement.ptr(3)); + varpos *= varpos; + float varvel = 10 * std_weight_velocity * (*measurement.ptr(3)); + varvel *= varvel; + + errorCovPost.setTo(0); + *errorCovPost.ptr(0, 0) = varpos; + *errorCovPost.ptr(1, 1) = varpos; + *errorCovPost.ptr(2, 2) = 1e-4f; + *errorCovPost.ptr(3, 3) = varpos; + *errorCovPost.ptr(4, 4) = varvel; + *errorCovPost.ptr(5, 5) = varvel; + *errorCovPost.ptr(6, 6) = 1e-10f; + *errorCovPost.ptr(7, 7) = varvel; + errorCovPost.copyTo(errorCovPre); +} + +const cv::Mat &TKalmanFilter::predict() +{ + float varpos = std_weight_position * (*statePre.ptr(3)); + varpos *= varpos; + float varvel = std_weight_velocity * (*statePre.ptr(3)); + varvel *= varvel; + + processNoiseCov.setTo(0); + *processNoiseCov.ptr(0, 0) = varpos; + *processNoiseCov.ptr(1, 1) = varpos; + *processNoiseCov.ptr(2, 2) = 1e-4f; + *processNoiseCov.ptr(3, 3) = varpos; + *processNoiseCov.ptr(4, 4) = varvel; + *processNoiseCov.ptr(5, 5) = varvel; + *processNoiseCov.ptr(6, 6) = 1e-10f; + *processNoiseCov.ptr(7, 7) = varvel; + + return cv::KalmanFilter::predict(); +} + +const cv::Mat &TKalmanFilter::correct(const cv::Mat &measurement) +{ + float varpos = std_weight_position * (*measurement.ptr(3)); + varpos *= varpos; + + measurementNoiseCov.setTo(0); + *measurementNoiseCov.ptr(0, 0) = varpos; + *measurementNoiseCov.ptr(1, 1) = varpos; + *measurementNoiseCov.ptr(2, 2) = 1e-2f; + *measurementNoiseCov.ptr(3, 3) = varpos; + + return cv::KalmanFilter::correct(measurement); +} + +void TKalmanFilter::project(cv::Mat &mean, cv::Mat &covariance) const +{ + float varpos = std_weight_position * (*statePost.ptr(3)); + varpos *= varpos; + + cv::Mat measurementNoiseCov_ = cv::Mat::eye(4, 4, CV_32F); + *measurementNoiseCov_.ptr(0, 0) = varpos; + *measurementNoiseCov_.ptr(1, 1) = varpos; + *measurementNoiseCov_.ptr(2, 2) = 1e-2f; + *measurementNoiseCov_.ptr(3, 3) = varpos; + + mean = measurementMatrix * statePost; + cv::Mat temp = measurementMatrix * errorCovPost; + gemm(temp, measurementMatrix, 1, measurementNoiseCov_, 1, covariance, cv::GEMM_2_T); +} + +int Trajectory::count = 0; + +const cv::Mat &Trajectory::predict(void) +{ + if (state != Tracked) + *cv::KalmanFilter::statePost.ptr(7) = 0; + return TKalmanFilter::predict(); +} + +void Trajectory::update(Trajectory &traj, int timestamp_, bool update_embedding_) +{ + timestamp = timestamp_; + ++length; + ltrb = traj.ltrb; + xyah = traj.xyah; + TKalmanFilter::correct(cv::Mat(traj.xyah)); + state = Tracked; + is_activated = true; + score = traj.score; + if (update_embedding_) + update_embedding(traj.current_embedding); +} + +void Trajectory::activate(int timestamp_) +{ + id = next_id(); + TKalmanFilter::init(cv::Mat(xyah)); + length = 0; + state = Tracked; + if (timestamp_ == 1) { + is_activated = true; + } + timestamp = timestamp_; + starttime = timestamp_; +} + +void Trajectory::reactivate(Trajectory &traj, int timestamp_, bool newid) +{ + TKalmanFilter::correct(cv::Mat(traj.xyah)); + update_embedding(traj.current_embedding); + length = 0; + state = Tracked; + is_activated = true; + 
timestamp = timestamp_; + if (newid) + id = next_id(); +} + +void Trajectory::update_embedding(const cv::Mat &embedding) +{ + current_embedding = embedding / cv::norm(embedding); + if (smooth_embedding.empty()) + { + smooth_embedding = current_embedding; + } + else + { + smooth_embedding = eta * smooth_embedding + (1 - eta) * current_embedding; + } + smooth_embedding = smooth_embedding / cv::norm(smooth_embedding); +} + +TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPool &b) +{ + TrajectoryPool sum; + sum.insert(sum.end(), a.begin(), a.end()); + + std::vector ids(a.size()); + for (size_t i = 0; i < a.size(); ++i) + ids[i] = a[i].id; + + for (size_t i = 0; i < b.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), b[i].id); + if (iter == ids.end()) + { + sum.push_back(b[i]); + ids.push_back(b[i].id); + } + } + + return sum; +} + +TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPtrPool &b) +{ + TrajectoryPool sum; + sum.insert(sum.end(), a.begin(), a.end()); + + std::vector ids(a.size()); + for (size_t i = 0; i < a.size(); ++i) + ids[i] = a[i].id; + + for (size_t i = 0; i < b.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), b[i]->id); + if (iter == ids.end()) + { + sum.push_back(*b[i]); + ids.push_back(b[i]->id); + } + } + + return sum; +} + +TrajectoryPool &operator+=(TrajectoryPool &a, const TrajectoryPtrPool &b) +{ + std::vector ids(a.size()); + for (size_t i = 0; i < a.size(); ++i) + ids[i] = a[i].id; + + for (size_t i = 0; i < b.size(); ++i) + { + if (b[i]->smooth_embedding.empty()) + continue; + std::vector::iterator iter = find(ids.begin(), ids.end(), b[i]->id); + if (iter == ids.end()) + { + a.push_back(*b[i]); + ids.push_back(b[i]->id); + } + } + + return a; +} + +TrajectoryPool operator-(const TrajectoryPool &a, const TrajectoryPool &b) +{ + TrajectoryPool dif; + std::vector ids(b.size()); + for (size_t i = 0; i < b.size(); ++i) + ids[i] = b[i].id; + + for (size_t i = 0; i < a.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), a[i].id); + if (iter == ids.end()) + dif.push_back(a[i]); + } + + return dif; +} + +TrajectoryPool &operator-=(TrajectoryPool &a, const TrajectoryPool &b) +{ + std::vector ids(b.size()); + for (size_t i = 0; i < b.size(); ++i) + ids[i] = b[i].id; + + TrajectoryPoolIterator piter; + for (piter = a.begin(); piter != a.end(); ) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), piter->id); + if (iter == ids.end()) + ++piter; + else + piter = a.erase(piter); + } + + return a; +} + +TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b) +{ + TrajectoryPtrPool sum; + sum.insert(sum.end(), a.begin(), a.end()); + + std::vector ids(a.size()); + for (size_t i = 0; i < a.size(); ++i) + ids[i] = a[i]->id; + + for (size_t i = 0; i < b.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), b[i]->id); + if (iter == ids.end()) + { + sum.push_back(b[i]); + ids.push_back(b[i]->id); + } + } + + return sum; +} + +TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, TrajectoryPool &b) +{ + TrajectoryPtrPool sum; + sum.insert(sum.end(), a.begin(), a.end()); + + std::vector ids(a.size()); + for (size_t i = 0; i < a.size(); ++i) + ids[i] = a[i]->id; + + for (size_t i = 0; i < b.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), b[i].id); + if (iter == ids.end()) + { + sum.push_back(&b[i]); + ids.push_back(b[i].id); + } + } + + return sum; +} + +TrajectoryPtrPool operator-(const 
TrajectoryPtrPool &a, const TrajectoryPtrPool &b) +{ + TrajectoryPtrPool dif; + std::vector ids(b.size()); + for (size_t i = 0; i < b.size(); ++i) + ids[i] = b[i]->id; + + for (size_t i = 0; i < a.size(); ++i) + { + std::vector::iterator iter = find(ids.begin(), ids.end(), a[i]->id); + if (iter == ids.end()) + dif.push_back(a[i]); + } + + return dif; +} + +cv::Mat embedding_distance(const TrajectoryPool &a, const TrajectoryPool &b) +{ + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + cv::Mat u = a[i].smooth_embedding; + cv::Mat v = b[j].smooth_embedding; + double uv = u.dot(v); + double uu = u.dot(u); + double vv = v.dot(v); + double dist = std::abs(1. - uv / std::sqrt(uu * vv)); + //double dist = cv::norm(a[i].smooth_embedding, b[j].smooth_embedding, cv::NORM_L2); + distsi[j] = static_cast(std::max(std::min(dist, 2.), 0.)); + } + } + return dists; +} + +cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b) +{ + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + //double dist = cv::norm(a[i]->smooth_embedding, b[j]->smooth_embedding, cv::NORM_L2); + //distsi[j] = static_cast(dist); + cv::Mat u = a[i]->smooth_embedding; + cv::Mat v = b[j]->smooth_embedding; + double uv = u.dot(v); + double uu = u.dot(u); + double vv = v.dot(v); + double dist = std::abs(1. - uv / std::sqrt(uu * vv)); + distsi[j] = static_cast(std::max(std::min(dist, 2.), 0.)); + + } + } + + return dists; +} + +cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b) +{ + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + //double dist = cv::norm(a[i]->smooth_embedding, b[j].smooth_embedding, cv::NORM_L2); + //distsi[j] = static_cast(dist); + cv::Mat u = a[i]->smooth_embedding; + cv::Mat v = b[j].smooth_embedding; + double uv = u.dot(v); + double uu = u.dot(u); + double vv = v.dot(v); + double dist = std::abs(1. 
- uv / std::sqrt(uu * vv)); + distsi[j] = static_cast(std::max(std::min(dist, 2.), 0.)); + + } + } + + return dists; +} + +cv::Mat mahalanobis_distance(const TrajectoryPool &a, const TrajectoryPool &b) +{ + std::vector means(a.size()); + std::vector icovariances(a.size()); + for (size_t i = 0; i < a.size(); ++i) + { + cv::Mat covariance; + a[i].project(means[i], covariance); + cv::invert(covariance, icovariances[i]); + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Mat x(b[j].xyah); + float dist = static_cast(cv::Mahalanobis(x, means[i], icovariances[i])); + distsi[j] = dist * dist; + } + } + + return dists; +} + +cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b) +{ + std::vector means(a.size()); + std::vector icovariances(a.size()); + for (size_t i = 0; i < a.size(); ++i) + { + cv::Mat covariance; + a[i]->project(means[i], covariance); + cv::invert(covariance, icovariances[i]); + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Mat x(b[j]->xyah); + float dist = static_cast(cv::Mahalanobis(x, means[i], icovariances[i])); + distsi[j] = dist * dist; + } + } + + return dists; +} + +cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b) +{ + std::vector means(a.size()); + std::vector icovariances(a.size()); + + for (size_t i = 0; i < a.size(); ++i) + { + cv::Mat covariance; + a[i]->project(means[i], covariance); + cv::invert(covariance, icovariances[i]); + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Mat x(b[j].xyah); + float dist = static_cast(cv::Mahalanobis(x, means[i], icovariances[i])); + distsi[j] = dist * dist; + } + } + + return dists; +} + +static inline float calc_inter_area(const cv::Vec4f &a, const cv::Vec4f &b) +{ + if (a[2] < b[0] || a[0] > b[2] || a[3] < b[1] || a[1] > b[3]) + return 0.f; + + float w = std::min(a[2], b[2]) - std::max(a[0], b[0]); + float h = std::min(a[3], b[3]) - std::max(a[1], b[1]); + return w * h; +} + +cv::Mat iou_distance(const TrajectoryPool &a, const TrajectoryPool &b) +{ + std::vector areaa(a.size()); + for (size_t i = 0; i < a.size(); ++i) + { + float w = a[i].ltrb[2] - a[i].ltrb[0]; + float h = a[i].ltrb[3] - a[i].ltrb[1]; + areaa[i] = w * h; + } + + std::vector areab(b.size()); + for (size_t j = 0; j < b.size(); ++j) + { + float w = b[j].ltrb[2] - b[j].ltrb[0]; + float h = b[j].ltrb[3] - b[j].ltrb[1]; + areab[j] = w * h; + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + const cv::Vec4f &boxa = a[i].ltrb; + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Vec4f &boxb = b[j].ltrb; + float inters = calc_inter_area(boxa, boxb); + distsi[j] = 1.f - inters / (areaa[i] + areab[j] - inters); + } + } + + return dists; +} + +cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b) +{ + std::vector areaa(a.size()); + for (size_t i = 0; i < a.size(); ++i) + { + float w = a[i]->ltrb[2] - a[i]->ltrb[0]; + float h = a[i]->ltrb[3] - a[i]->ltrb[1]; + areaa[i] = w * h; + } + + std::vector areab(b.size()); + for (size_t j = 0; j < b.size(); ++j) + { + float w = b[j]->ltrb[2] - b[j]->ltrb[0]; + float h = 
b[j]->ltrb[3] - b[j]->ltrb[1]; + areab[j] = w * h; + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + const cv::Vec4f &boxa = a[i]->ltrb; + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Vec4f &boxb = b[j]->ltrb; + float inters = calc_inter_area(boxa, boxb); + distsi[j] = 1.f - inters / (areaa[i] + areab[j] - inters); + } + } + + return dists; +} + +cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b) +{ + std::vector areaa(a.size()); + for (size_t i = 0; i < a.size(); ++i) + { + float w = a[i]->ltrb[2] - a[i]->ltrb[0]; + float h = a[i]->ltrb[3] - a[i]->ltrb[1]; + areaa[i] = w * h; + } + + std::vector areab(b.size()); + for (size_t j = 0; j < b.size(); ++j) + { + float w = b[j].ltrb[2] - b[j].ltrb[0]; + float h = b[j].ltrb[3] - b[j].ltrb[1]; + areab[j] = w * h; + } + + cv::Mat dists(a.size(), b.size(), CV_32F); + for (size_t i = 0; i < a.size(); ++i) + { + const cv::Vec4f &boxa = a[i]->ltrb; + float *distsi = dists.ptr(i); + for (size_t j = 0; j < b.size(); ++j) + { + const cv::Vec4f &boxb = b[j].ltrb; + float inters = calc_inter_area(boxa, boxb); + distsi[j] = 1.f - inters / (areaa[i] + areab[j] - inters); + } + } + + return dists; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/utils.cc b/deploy/cpp/src/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b4731cd9e25b3536417ade20d3f9ce5089755fd --- /dev/null +++ b/deploy/cpp/src/utils.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
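+// NOTE: utils.cc implements class-agnostic greedy (hard) NMS: boxes are sorted by
+// confidence, and any box whose IoU with an already-kept, higher-scoring box reaches
+// nms_threshold is erased in place together with its cached area.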
+ +#include "include/utils.h" + +namespace PaddleDetection { + +void nms(std::vector &input_boxes, float nms_threshold) { + std::sort(input_boxes.begin(), + input_boxes.end(), + [](ObjectResult a, ObjectResult b) { return a.confidence > b.confidence; }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).rect[2] - input_boxes.at(i).rect[0] + 1) + * (input_boxes.at(i).rect[3] - input_boxes.at(i).rect[1] + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]); + float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]); + float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]); + float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= nms_threshold) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } + else { + j++; + } + } + } +} + +} // namespace PaddleDetection diff --git a/deploy/end2end_ppyoloe/README.md b/deploy/end2end_ppyoloe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d470dccffe7c9927eac6946d3ee47ea96c346a56 --- /dev/null +++ b/deploy/end2end_ppyoloe/README.md @@ -0,0 +1,99 @@ +# Export ONNX Model +## Download pretrain paddle models + +* [ppyoloe-s](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams) +* [ppyoloe-m](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_m_300e_coco.pdparams) +* [ppyoloe-l](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams) +* [ppyoloe-x](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_x_300e_coco.pdparams) +* [ppyoloe-s-400e](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_400e_coco.pdparams) + + +## Export paddle model for deploying + +```shell +python ./tools/export_model.py \ + -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \ + -o weights=ppyoloe_crn_s_300e_coco.pdparams \ + trt=True \ + exclude_nms=True \ + TestReader.inputs_def.image_shape=[3,640,640] \ + --output_dir ./ + +# if you want to try ppyoloe-s-400e model +python ./tools/export_model.py \ + -c configs/ppyoloe/ppyoloe_crn_s_400e_coco.yml \ + -o weights=ppyoloe_crn_s_400e_coco.pdparams \ + trt=True \ + exclude_nms=True \ + TestReader.inputs_def.image_shape=[3,640,640] \ + --output_dir ./ +``` + +## Check requirements +```shell +pip install onnx>=1.10.0 +pip install paddle2onnx +pip install onnx-simplifier +pip install onnx-graphsurgeon --index-url https://pypi.ngc.nvidia.com +# if use cuda-python infer, please install it +pip install cuda-python +# if use cupy infer, please install it +pip install cupy-cuda117 # cuda110-cuda117 are all available +``` + +## Export script +```shell +python ./deploy/end2end_ppyoloe/end2end.py \ + --model-dir ppyoloe_crn_s_300e_coco \ + --save-file ppyoloe_crn_s_300e_coco.onnx \ + --opset 11 \ + --batch-size 1 \ + --topk-all 100 \ + --iou-thres 0.6 \ + --conf-thres 0.4 +# if you want to try ppyoloe-s-400e model +python ./deploy/end2end_ppyoloe/end2end.py \ + --model-dir ppyoloe_crn_s_400e_coco \ + --save-file ppyoloe_crn_s_400e_coco.onnx \ + --opset 11 \ + --batch-size 1 \ + --topk-all 100 \ + --iou-thres 0.6 \ + --conf-thres 0.4 +``` +#### Description of all arguments + +- `--model-dir` : the path of ppyoloe 
export dir. +- `--save-file` : the path of export onnx. +- `--opset` : onnx opset version. +- `--img-size` : image size for exporting ppyoloe. +- `--batch-size` : batch size for exporting ppyoloe. +- `--topk-all` : topk objects for every image. +- `--iou-thres` : iou threshold for NMS algorithm. +- `--conf-thres` : confidence threshold for NMS algorithm. + +### TensorRT backend (TensorRT version>= 8.0.0) +#### TensorRT engine export +``` shell +/path/to/trtexec \ + --onnx=ppyoloe_crn_s_300e_coco.onnx \ + --saveEngine=ppyoloe_crn_s_300e_coco.engine \ + --fp16 # if export TensorRT fp16 model +# if you want to try ppyoloe-s-400e model +/path/to/trtexec \ + --onnx=ppyoloe_crn_s_400e_coco.onnx \ + --saveEngine=ppyoloe_crn_s_400e_coco.engine \ + --fp16 # if export TensorRT fp16 model +``` +#### TensorRT image infer + +``` shell +# cuda-python infer script +python ./deploy/end2end_ppyoloe/cuda-python.py ppyoloe_crn_s_300e_coco.engine +# cupy infer script +python ./deploy/end2end_ppyoloe/cupy-python.py ppyoloe_crn_s_300e_coco.engine +# if you want to try ppyoloe-s-400e model +python ./deploy/end2end_ppyoloe/cuda-python.py ppyoloe_crn_s_400e_coco.engine +# or +python ./deploy/end2end_ppyoloe/cuda-python.py ppyoloe_crn_s_400e_coco.engine +``` \ No newline at end of file diff --git a/deploy/end2end_ppyoloe/cuda-python.py b/deploy/end2end_ppyoloe/cuda-python.py new file mode 100644 index 0000000000000000000000000000000000000000..3c7bd7c84b3eeaa6bea55416d8a5eabd37ac4d33 --- /dev/null +++ b/deploy/end2end_ppyoloe/cuda-python.py @@ -0,0 +1,161 @@ +import sys +import requests +import cv2 +import random +import time +import numpy as np +import tensorrt as trt +from cuda import cudart +from pathlib import Path +from collections import OrderedDict, namedtuple + + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): + # Resize and pad image while meeting stride-multiple constraints + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return im, r, (dw, dh) + + +w = Path(sys.argv[1]) + +assert w.exists() and w.suffix in ('.engine', '.plan'), 'Wrong engine path' + +names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 
'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] +colors = {name: [random.randint(0, 255) for _ in range(3)] for i, name in enumerate(names)} + +url = 'https://oneflow-static.oss-cn-beijing.aliyuncs.com/tripleMu/image1.jpg' +file = requests.get(url) +img = cv2.imdecode(np.frombuffer(file.content, np.uint8), 1) + +_, stream = cudart.cudaStreamCreate() + +mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 3, 1, 1) +std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 3, 1, 1) + +# Infer TensorRT Engine +Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) +logger = trt.Logger(trt.Logger.ERROR) +trt.init_libnvinfer_plugins(logger, namespace="") +with open(w, 'rb') as f, trt.Runtime(logger) as runtime: + model = runtime.deserialize_cuda_engine(f.read()) +bindings = OrderedDict() +fp16 = False # default updated below +for index in range(model.num_bindings): + name = model.get_binding_name(index) + dtype = trt.nptype(model.get_binding_dtype(index)) + shape = tuple(model.get_binding_shape(index)) + data = np.empty(shape, dtype=np.dtype(dtype)) + _, data_ptr = cudart.cudaMallocAsync(data.nbytes, stream) + bindings[name] = Binding(name, dtype, shape, data, data_ptr) + if model.binding_is_input(index) and dtype == np.float16: + fp16 = True +binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items()) +context = model.create_execution_context() + +image = img.copy() +image, ratio, dwdh = letterbox(image, auto=False) +image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + +image_copy = image.copy() + +image = image.transpose((2, 0, 1)) +image = np.expand_dims(image, 0) +image = np.ascontiguousarray(image) + +im = image.astype(np.float32) +im /= 255 +im -= mean +im /= std + +_, image_ptr = cudart.cudaMallocAsync(im.nbytes, stream) +cudart.cudaMemcpyAsync(image_ptr, im.ctypes.data, im.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + +# warmup for 10 times +for _ in range(10): + tmp = np.random.randn(1, 3, 640, 640).astype(np.float32) + _, tmp_ptr = cudart.cudaMallocAsync(tmp.nbytes, stream) + binding_addrs['image'] = tmp_ptr + context.execute_v2(list(binding_addrs.values())) + +start = time.perf_counter() +binding_addrs['image'] = image_ptr +context.execute_v2(list(binding_addrs.values())) +print(f'Cost {(time.perf_counter() - start) * 1000}ms') + +nums = bindings['num_dets'].data +boxes = bindings['det_boxes'].data +scores = bindings['det_scores'].data +classes = bindings['det_classes'].data + +cudart.cudaMemcpyAsync(nums.ctypes.data, + bindings['num_dets'].ptr, + nums.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + stream) +cudart.cudaMemcpyAsync(boxes.ctypes.data, + bindings['det_boxes'].ptr, + boxes.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + stream) +cudart.cudaMemcpyAsync(scores.ctypes.data, + bindings['det_scores'].ptr, + scores.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + stream) +cudart.cudaMemcpyAsync(classes.ctypes.data, + bindings['det_classes'].ptr, + classes.data.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + stream) + +cudart.cudaStreamSynchronize(stream) +cudart.cudaStreamDestroy(stream) + +for i in binding_addrs.values(): + cudart.cudaFree(i) + +num = 
int(nums[0][0]) +box_img = boxes[0, :num].round().astype(np.int32) +score_img = scores[0, :num] +clss_img = classes[0, :num] +for i, (box, score, clss) in enumerate(zip(box_img, score_img, clss_img)): + name = names[int(clss)] + color = colors[name] + cv2.rectangle(image_copy, box[:2].tolist(), box[2:].tolist(), color, 2) + cv2.putText(image_copy, name, (int(box[0]), int(box[1]) - 2), cv2.FONT_HERSHEY_SIMPLEX, + 0.75, [225, 255, 255], thickness=2) + +cv2.imshow('Result', cv2.cvtColor(image_copy, cv2.COLOR_RGB2BGR)) +cv2.waitKey(0) diff --git a/deploy/end2end_ppyoloe/cupy-python.py b/deploy/end2end_ppyoloe/cupy-python.py new file mode 100644 index 0000000000000000000000000000000000000000..a66eb77ecf3aa4c76c143050764429a2a06e8ba1 --- /dev/null +++ b/deploy/end2end_ppyoloe/cupy-python.py @@ -0,0 +1,131 @@ +import sys +import requests +import cv2 +import random +import time +import numpy as np +import cupy as cp +import tensorrt as trt +from PIL import Image +from collections import OrderedDict, namedtuple +from pathlib import Path + + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): + # Resize and pad image while meeting stride-multiple constraints + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return im, r, (dw, dh) + + +names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] +colors = {name: [random.randint(0, 255) for _ in range(3)] for i, name in enumerate(names)} + +url = 'https://oneflow-static.oss-cn-beijing.aliyuncs.com/tripleMu/image1.jpg' +file = requests.get(url) +img = cv2.imdecode(np.frombuffer(file.content, np.uint8), 1) + +w = Path(sys.argv[1]) + +assert w.exists() and w.suffix in ('.engine', '.plan'), 'Wrong engine path' + +mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 3, 1, 1) +std = np.array([0.229, 
0.224, 0.225], dtype=np.float32).reshape(1, 3, 1, 1) + +mean = cp.asarray(mean) +std = cp.asarray(std) + +# Infer TensorRT Engine +Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) +logger = trt.Logger(trt.Logger.INFO) +trt.init_libnvinfer_plugins(logger, namespace="") +with open(w, 'rb') as f, trt.Runtime(logger) as runtime: + model = runtime.deserialize_cuda_engine(f.read()) +bindings = OrderedDict() +fp16 = False # default updated below +for index in range(model.num_bindings): + name = model.get_binding_name(index) + dtype = trt.nptype(model.get_binding_dtype(index)) + shape = tuple(model.get_binding_shape(index)) + data = cp.empty(shape, dtype=cp.dtype(dtype)) + bindings[name] = Binding(name, dtype, shape, data, int(data.data.ptr)) + if model.binding_is_input(index) and dtype == np.float16: + fp16 = True +binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items()) +context = model.create_execution_context() + +image = img.copy() +image, ratio, dwdh = letterbox(image, auto=False) +image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + +image_copy = image.copy() + +image = image.transpose((2, 0, 1)) +image = np.expand_dims(image, 0) +image = np.ascontiguousarray(image) + +im = cp.asarray(image) +im = im.astype(cp.float32) +im /= 255 +im -= mean +im /= std + +# warmup for 10 times +for _ in range(10): + tmp = cp.random.randn(1, 3, 640, 640).astype(cp.float32) + binding_addrs['image'] = int(tmp.data.ptr) + context.execute_v2(list(binding_addrs.values())) + +start = time.perf_counter() +binding_addrs['image'] = int(im.data.ptr) +context.execute_v2(list(binding_addrs.values())) +print(f'Cost {(time.perf_counter() - start) * 1000}ms') + +nums = bindings['num_dets'].data +boxes = bindings['det_boxes'].data +scores = bindings['det_scores'].data +classes = bindings['det_classes'].data + +num = int(nums[0][0]) +box_img = boxes[0, :num].round().astype(cp.int32) +score_img = scores[0, :num] +clss_img = classes[0, :num] +for i, (box, score, clss) in enumerate(zip(box_img, score_img, clss_img)): + name = names[int(clss)] + color = colors[name] + cv2.rectangle(image_copy, box[:2].tolist(), box[2:].tolist(), color, 2) + cv2.putText(image_copy, name, (int(box[0]), int(box[1]) - 2), cv2.FONT_HERSHEY_SIMPLEX, + 0.75, [225, 255, 255], thickness=2) + +cv2.imshow('Result', cv2.cvtColor(image_copy, cv2.COLOR_RGB2BGR)) +cv2.waitKey(0) diff --git a/deploy/end2end_ppyoloe/end2end.py b/deploy/end2end_ppyoloe/end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfbf019a5d5755768e7defd573203a20a020ef7 --- /dev/null +++ b/deploy/end2end_ppyoloe/end2end.py @@ -0,0 +1,97 @@ +import argparse +import onnx +import onnx_graphsurgeon as gs +import numpy as np + +from pathlib import Path +from paddle2onnx.legacy.command import program2onnx +from collections import OrderedDict + + +def main(opt): + model_dir = Path(opt.model_dir) + save_file = Path(opt.save_file) + assert model_dir.exists() and model_dir.is_dir() + if save_file.is_dir(): + save_file = (save_file / model_dir.stem).with_suffix('.onnx') + elif save_file.is_file() and save_file.suffix != '.onnx': + save_file = save_file.with_suffix('.onnx') + input_shape_dict = {'image': [opt.batch_size, 3, *opt.img_size], + 'scale_factor': [opt.batch_size, 2]} + program2onnx(str(model_dir), str(save_file), + 'model.pdmodel', 'model.pdiparams', + opt.opset, input_shape_dict=input_shape_dict) + onnx_model = onnx.load(save_file) + try: + import onnxsim + onnx_model, check = onnxsim.simplify(onnx_model) + assert check, 'assert 
check failed' + except Exception as e: + print(f'Simplifier failure: {e}') + onnx.checker.check_model(onnx_model) + graph = gs.import_onnx(onnx_model) + graph.fold_constants() + graph.cleanup().toposort() + mul = concat = None + for node in graph.nodes: + if node.op == 'Div' and node.i(0).op == 'Mul': + mul = node.i(0) + if node.op == 'Concat' and node.o().op == 'Reshape' and node.o().o().op == 'ReduceSum': + concat = node + + assert mul.outputs[0].shape[1] == concat.outputs[0].shape[2], 'Something wrong in outputs shape' + + anchors = mul.outputs[0].shape[1] + classes = concat.outputs[0].shape[1] + + scores = gs.Variable(name='scores', shape=[opt.batch_size, anchors, classes], dtype=np.float32) + graph.layer(op='Transpose', name='lastTranspose', + inputs=[concat.outputs[0]], + outputs=[scores], + attrs=OrderedDict(perm=[0, 2, 1])) + + graph.inputs = [graph.inputs[0]] + + attrs = OrderedDict( + plugin_version="1", + background_class=-1, + max_output_boxes=opt.topk_all, + score_threshold=opt.conf_thres, + iou_threshold=opt.iou_thres, + score_activation=False, + box_coding=0, ) + outputs = [gs.Variable("num_dets", np.int32, [opt.batch_size, 1]), + gs.Variable("det_boxes", np.float32, [opt.batch_size, opt.topk_all, 4]), + gs.Variable("det_scores", np.float32, [opt.batch_size, opt.topk_all]), + gs.Variable("det_classes", np.int32, [opt.batch_size, opt.topk_all])] + graph.layer(op='EfficientNMS_TRT', name="batched_nms", + inputs=[mul.outputs[0], scores], + outputs=outputs, + attrs=attrs) + graph.outputs = outputs + graph.cleanup().toposort() + onnx.save(gs.export_onnx(graph), save_file) + + +def parse_opt(): + parser = argparse.ArgumentParser() + parser.add_argument('--model-dir', type=str, + default=None, + help='paddle static model') + parser.add_argument('--save-file', type=str, + default=None, + help='onnx model save path') + parser.add_argument('--opset', type=int, default=11, help='opset version') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size') + parser.add_argument('--batch-size', type=int, default=1, help='batch size') + parser.add_argument('--topk-all', type=int, default=100, help='topk objects for every images') + parser.add_argument('--iou-thres', type=float, default=0.45, help='iou threshold for NMS') + parser.add_argument('--conf-thres', type=float, default=0.25, help='conf threshold for NMS') + opt = parser.parse_args() + opt.img_size *= 2 if len(opt.img_size) == 1 else 1 + return opt + + +if __name__ == '__main__': + opt = parse_opt() + main(opt) diff --git a/deploy/lite/Makefile b/deploy/lite/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9c439382acfafea440de93bb2f3fa91977ad3891 --- /dev/null +++ b/deploy/lite/Makefile @@ -0,0 +1,90 @@ +ARM_ABI = arm8#[arm7/arm8] +export ARM_ABI + +ifeq ($(ARM_ABI), arm8) + ARM_PLAT=arm64-v8a +else + ARM_PLAT=armeabi-v7a +endif +${info ARM_ABI: ${ARM_ABI}} +${info ARM_PLAT: ${ARM_PLAT}; option[arm7/arm8]} + +include ../Makefile.def + +LITE_ROOT=../../../ +${info LITE_ROOT: $(abspath ${LITE_ROOT})} + +THIRD_PARTY_DIR=third_party +${info THIRD_PARTY_DIR: $(abspath ${THIRD_PARTY_DIR})} + + +OPENCV_VERSION=opencv4.1.0 +OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgcodecs.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgproc.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_core.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtegra_hal.a \ + 
${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjpeg-turbo.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibwebp.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibpng.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjasper.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibtiff.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libIlmImf.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtbb.a \ + ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libcpufeatures.a + + +LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` +# LITE_LIBS = ${LITE_ROOT}/cxx/lib/libpaddle_api_light_bundled.a + +CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} $(SYSTEM_LIBS) + +LOCAL_DIRSRCS=$(wildcard src/*.cc) +LOCAL_SRCS=$(notdir $(LOCAL_DIRSRCS)) +LOCAL_OBJS=$(patsubst %.cpp, %.o, $(patsubst %.cc, %.o, $(LOCAL_SRCS))) + +JSON_OBJS = json_reader.o json_value.o json_writer.o + +main: $(LOCAL_OBJS) $(JSON_OBJS) fetch_opencv + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) $(LOCAL_OBJS) $(JSON_OBJS) -o main $(CXX_LIBS) $(LDFLAGS) + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +fetch_json_code: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz || \ + (echo "fetch jsoncpp_code.tar.gz" && \ + wget -P ${THIRD_PARTY_DIR} https://bj.bcebos.com/v1/paddledet/deploy/jsoncpp_code.tar.gz ) + @ test -d ${THIRD_PARTY_DIR}/jsoncpp_code || \ + tar -zxf ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz -C ${THIRD_PARTY_DIR} + +LOCAL_INCLUDES = -I./ -Iinclude +OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/include +JSON_INCLUDE = -I${THIRD_PARTY_DIR}/jsoncpp_code/include +CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include + + +$(LOCAL_OBJS): %.o: src/%.cc fetch_opencv fetch_json_code + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@ + +$(JSON_OBJS): %.o: ${THIRD_PARTY_DIR}/jsoncpp_code/%.cpp fetch_json_code + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@ + +.PHONY: clean fetch_opencv fetch_json_code +clean: + rm -rf $(LOCAL_OBJS) $(JSON_OBJS) + rm -f main diff --git a/deploy/lite/README.md b/deploy/lite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..30447460eb6c4ccdf5c1013d1ea2d631d9073fba --- /dev/null +++ b/deploy/lite/README.md @@ -0,0 +1,306 @@ +# Paddle-Lite端侧部署 + +[Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite)是飞桨轻量化推理引擎,为手机、IOT端提供高效推理能力,并广泛整合跨平台硬件,为端侧部署及应用落地问题提供轻量化的部署方案。 
+本目录提供了PaddleDetection中主要模型在Paddle-Lite上的端到端部署代码。用户可以通过本教程了解如何使用该部分代码,基于Paddle-Lite实现在移动端部署PaddleDetection模型。 + + +## 1. 准备环境 + +### 运行准备 +- 电脑(编译Paddle Lite) +- 安卓手机(armv7或armv8) + +### 1.1 准备交叉编译环境 +交叉编译环境用于编译 Paddle Lite 和 PaddleDetection 的C++ demo。 +支持多种开发环境,不同开发环境的编译流程请参考对应文档,请确保安装完成Java jdk、Android NDK(R17 < version < R21,其他版本以上未做测试)。 +设置NDK_ROOT命令: +```shell +export NDK_ROOT=[YOUR_NDK_PATH]/android-ndk-r17c +``` + + +1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker) +2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux) +3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os) + +### 1.2 准备预测库 + +预测库有两种获取方式: +1. [**建议**]直接从[Paddle-Lite Release](https://github.com/PaddlePaddle/Paddle-Lite/releases)中, 根据设备类型与架构选择对应的预编译库,请注意使用模型FP32/16版本需要与库相对应,库文件的说明请参考[官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)。 + +**注意**:(1) 如果是从 Paddle-Lite [官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)下载的预测库,注意选择`with_extra=ON,with_cv=ON`的下载链接。2. 目前只提供Android端demo,IOS端demo可以参考[Paddle-Lite IOS demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo) +(2)PP-PicoDet部署需要Paddle Lite 2.11以上版本。 + + +2. 编译Paddle-Lite得到预测库,Paddle-Lite的编译方式如下(Lite库在不断更新,如若下列命令无效,请以Lite官方repo为主): +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +# 如果使用编译方式,建议使用develop分支编译预测库 +git checkout develop +# FP32 +./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON +# FP16 +./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON --with_arm82_fp16=ON +``` + +**注意**:编译Paddle-Lite获得预测库时,需要打开`--with_cv=ON --with_extra=ON`两个选项,`--arch`表示`arm`版本,这里指定为armv8,更多编译命令介绍请参考[链接](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_options.html)。 + +直接下载预测库并解压后,可以得到`inference_lite_lib.android.armv8.clang.c++_static.with_extra.with_cv/`文件夹,通过编译Paddle-Lite得到的预测库位于`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`文件夹下。 +预测库的文件目录如下: + +``` +inference_lite_lib.android.armv8/ +|-- cxx C++ 预测库和头文件 +| |-- include C++ 头文件 +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib C++预测库 +| |-- libpaddle_api_light_bundled.a C++静态库 +| `-- libpaddle_light_api_shared.so C++动态库 +|-- java Java预测库 +| |-- jar +| | `-- PaddlePredictor.jar +| |-- so +| | `-- libpaddle_lite_jni.so +| `-- src +|-- demo C++和Java示例代码 +| |-- cxx C++ 预测库demo, 请将本文档目录下的PaddleDetection相关代码拷贝至该文件夹下执行交叉编译。 +| `-- java Java 预测库demo +``` + +## 2 开始运行 + +### 2.1 模型转换 + +Paddle-Lite 提供了多种策略来自动优化原始的模型,其中包括量化、子图融合、混合调度、Kernel优选等方法,使用Paddle-Lite的`opt`工具可以自动对inference模型进行优化,并转换为推理所使用的文件格式。目前支持两种优化方式,优化后的模型更轻量,模型运行速度更快。 + +**注意**:如果已经准备好了 `.nb` 结尾的模型文件,可以跳过此步骤。 + +#### 2.1.1 安装paddle_lite_opt工具 +安装`paddle_lite_opt`工具有如下两种方法, **请注意**,无论使用哪种方法,请尽量保证`paddle_lite_opt`工具和预测库的版本一致,以避免未知的Bug。 +1. [**建议**]pip安装paddlelite并进行转换 + ```shell + pip install paddlelite + ``` + +2. 
源码编译Paddle-Lite生成`paddle_lite_opt`工具 + + 模型优化需要Paddle-Lite的`opt`可执行文件,可以通过编译Paddle-Lite源码获得,编译步骤如下: + ```shell + # 如果准备环境时已经clone了Paddle-Lite,则不用重新clone Paddle-Lite + git clone https://github.com/PaddlePaddle/Paddle-Lite.git + cd Paddle-Lite + git checkout develop + # 启动编译 + ./lite/tools/build.sh build_optimize_tool + ``` + + 编译完成后,`opt`文件位于`build.opt/lite/api/`下,可通过如下方式查看`opt`的运行选项和使用方式; + ```shell + cd build.opt/lite/api/ + ./opt + ``` + + `opt`的使用方式与参数与上面的`paddle_lite_opt`完全一致。 + +之后使用`paddle_lite_opt`工具可以进行inference模型的转换。`paddle_lite_opt`的部分参数如下: + +|选项|说明| +|-|-| +|--model_file|待优化的PaddlePaddle模型(combined形式)的网络结构文件路径| +|--param_file|待优化的PaddlePaddle模型(combined形式)的权重文件路径| +|--optimize_out_type|输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现,默认为naive_buffer| +|--optimize_out|优化模型的输出路径| +|--valid_targets|指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm| +| --enable_fp16| true/false,是否使用fp16进行推理。如果开启,需要使用对应fp16的预测库| + +更详细的`paddle_lite_opt`工具使用说明请参考[使用opt转化模型文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html) + +`--model_file`表示inference模型的model文件地址,`--param_file`表示inference模型的param文件地址;`optimize_out`用于指定输出文件的名称(不需要添加`.nb`的后缀)。直接在命令行中运行`paddle_lite_opt`,也可以查看所有参数及其说明。 + + +#### 2.1.2 转换示例 + +下面以PaddleDetection中的 `PicoDet` 模型为例,介绍使用`paddle_lite_opt`完成预训练模型到inference模型,再到Paddle-Lite优化模型的转换。 + +```shell +# 进入PaddleDetection根目录 +cd PaddleDetection_root_path + +# 将预训练模型导出为inference模型 +python tools/export_model.py -c configs/picodet/picodet_s_320_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/picodet_s_320_coco.pdparams --output_dir=output_inference + +# 将inference模型转化为Paddle-Lite优化模型 +# FP32 +paddle_lite_opt --valid_targets=arm --model_file=output_inference/picodet_s_320_coco/model.pdmodel --param_file=output_inference/picodet_s_320_coco/model.pdiparams --optimize_out=output_inference/picodet_s_320_coco/model +# FP16 +paddle_lite_opt --valid_targets=arm --model_file=output_inference/picodet_s_320_coco/model.pdmodel --param_file=output_inference/picodet_s_320_coco/model.pdiparams --optimize_out=output_inference/picodet_s_320_coco/model --enable_fp16=true + +# 将inference模型配置转化为json格式 +python deploy/lite/convert_yml_to_json.py output_inference/picodet_s_320_coco/infer_cfg.yml +``` + +最终在output_inference/picodet_s_320_coco/文件夹下生成`model.nb` 和 `infer_cfg.json`的文件。 + +**注意**:`--optimize_out` 参数为优化后模型的保存路径,无需加后缀`.nb`;`--model_file` 参数为模型结构信息文件的路径,`--param_file` 参数为模型权重信息文件的路径,请注意文件名。 + +### 2.2 与手机联调 + +首先需要进行一些准备工作。 +1. 准备一台arm8的安卓手机,如果编译的预测库是armv7,则需要arm7的手机,并修改Makefile中`ARM_ABI=arm7`。 +2. 电脑上安装ADB工具,用于调试。 ADB安装方式如下: + + 2.1. MAC电脑安装ADB: + + ```shell + brew cask install android-platform-tools + ``` + 2.2. Linux安装ADB + ```shell + sudo apt update + sudo apt install -y wget adb + ``` + 2.3. Window安装ADB + + win上安装需要去谷歌的安卓平台下载ADB软件包进行安装:[链接](https://developer.android.com/studio) + +3. 手机连接电脑后,开启手机`USB调试`选项,选择`文件传输`模式,在电脑终端中输入: + +```shell +adb devices +``` +如果有device输出,则表示安装成功,如下所示: +``` +List of devices attached +744be294 device +``` + +4. 
编译lite部署代码生成移动端可执行文件 + +```shell +cd {PadddleDetection_Root} +cd deploy/lite/ + +inference_lite_path=/{lite prediction library path}/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/ +mkdir $inference_lite_path/demo/cxx/lite + +cp -r Makefile src/ include/ *runtime_config.json $inference_lite_path/demo/cxx/lite + +cd $inference_lite_path/demo/cxx/lite + +# 执行编译,等待完成后得到可执行文件main +make ARM_ABI=arm8 +#如果是arm7,则执行 make ARM_ABI = arm7 (或者在Makefile中修改该项) + +``` + +5. 准备优化后的模型、预测库文件、测试图像。 + +```shell +mkdir deploy +cp main *runtime_config.json deploy/ +cd deploy +mkdir model_det +mkdir model_keypoint + +# 将优化后的模型、预测库文件、测试图像放置在预测库中的demo/cxx/detection文件夹下 +cp {PadddleDetection_Root}/output_inference/picodet_s_320_coco/model.nb ./model_det/ +cp {PadddleDetection_Root}/output_inference/picodet_s_320_coco/infer_cfg.json ./model_det/ + +# 如果需要关键点模型,则只需操作: +cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/model.nb ./model_keypoint/ +cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/infer_cfg.json ./model_keypoint/ + +# 将测试图像复制到deploy文件夹中 +cp [your_test_img].jpg ./demo.jpg + +# 将C++预测动态库so文件复制到deploy文件夹中 +cp ../../../cxx/lib/libpaddle_light_api_shared.so ./ +``` + +执行完成后,deploy文件夹下将有如下文件格式: + +``` +deploy/ +|-- model_det/ +| |--model.nb 优化后的检测模型文件 +| |--infer_cfg.json 检测器模型配置文件 +|-- model_keypoint/ +| |--model.nb 优化后的关键点模型文件 +| |--infer_cfg.json 关键点模型配置文件 +|-- main 生成的移动端执行文件 +|-- det_runtime_config.json 目标检测执行时参数配置文件 +|-- keypoint_runtime_config.json 关键点检测执行时参数配置文件 +|-- libpaddle_light_api_shared.so Paddle-Lite库文件 +``` + +**注意:** +* `det_runtime_config.json` 包含了目标检测的超参数,请按需进行修改: + +```shell +{ + "model_dir_det": "./model_det/", #检测器模型路径 + "batch_size_det": 1, #检测预测时batchsize + "threshold_det": 0.5, #检测器输出阈值 + "image_file": "demo.jpg", #测试图片 + "image_dir": "", #测试图片文件夹 + "run_benchmark": true, #性能测试开关 + "cpu_threads": 4 #线程数 +} +``` + +* `keypoint_runtime_config.json` 同时包含了目标检测和关键点检测的超参数,支持Top-Down方案的推理流程,请按需进行修改: +```shell +{ + "model_dir_det": "./model_det/", #检测模型路径 + "batch_size_det": 1, #检测模型预测时batchsize, 存在关键点模型时只能为1 + "threshold_det": 0.5, #检测器输出阈值 + "model_dir_keypoint": "./model_keypoint/", #关键点模型路径(不使用需为空字符) + "batch_size_keypoint": 8, #关键点预测时batchsize + "threshold_keypoint": 0.5, #关键点输出阈值 + "image_file": "demo.jpg", #测试图片 + "image_dir": "", #测试图片文件夹 + "run_benchmark": true, #性能测试开关 + "cpu_threads": 4 #线程数 + "use_dark_decode": true #是否使用DARK解码关键点坐标 +} +``` + +6. 启动调试,上述步骤完成后就可以使用ADB将文件夹 `deploy/` push到手机上运行,步骤如下: + +```shell +# 将上述deploy文件夹push到手机上 +adb push deploy /data/local/tmp/ + +adb shell +cd /data/local/tmp/deploy +export LD_LIBRARY_PATH=/data/local/tmp/deploy:$LD_LIBRARY_PATH + +# 修改权限为可执行 +chmod 777 main +# 以检测为例,执行程序 +./main det_runtime_config.json +``` + +如果对代码做了修改,则需要重新编译并push到手机上。 + +运行效果如下: + +
+
+<!-- demo result visualization (screenshots omitted) -->
+
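+The example above runs detection only. To run the top-down detection + keypoint pipeline with the same binary, pass the keypoint runtime config instead; a minimal sketch, assuming `model_keypoint/` and `keypoint_runtime_config.json` were prepared as described in step 5:
+
+```shell
+# on the device, inside /data/local/tmp/deploy (after adb push and adb shell)
+export LD_LIBRARY_PATH=/data/local/tmp/deploy:$LD_LIBRARY_PATH
+chmod 777 main
+# detection followed by top-down keypoint estimation;
+# batch_size_det is forced to 1 when a keypoint model is configured
+./main keypoint_runtime_config.json
+```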
+ + +## FAQ +Q1:如果想更换模型怎么办,需要重新按照流程走一遍吗? +A1:如果已经走通了上述步骤,更换模型只需要替换 `.nb` 模型文件及其对应模型配置文件`infer_cfg.json`,同时要注意修改下配置文件中的 `.nb` 文件路径以及类别映射文件(如有必要)。 + +Q2:换一个图测试怎么做? +A2:替换 deploy 下的测试图像为你想要测试的图像,使用 ADB 再次 push 到手机上即可。 diff --git a/deploy/lite/convert_yml_to_json.py b/deploy/lite/convert_yml_to_json.py new file mode 100644 index 0000000000000000000000000000000000000000..6282c783050b26a9b07e7e96e87cac4711a9d20b --- /dev/null +++ b/deploy/lite/convert_yml_to_json.py @@ -0,0 +1,14 @@ +import yaml +import json +import sys + +yamlf = sys.argv[1] + +assert yamlf.endswith(".yml") + +with open(yamlf, 'r') as rf: + yaml_data = yaml.safe_load(rf) + +jsonf = yamlf[:-4] + ".json" +with open(jsonf, 'w') as wf: + json.dump(yaml_data, wf, indent=4) diff --git a/deploy/lite/include/config_parser.h b/deploy/lite/include/config_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..60d94c69e3b17aa9afea5dfb90e286f44d63f0bc --- /dev/null +++ b/deploy/lite/include/config_parser.h @@ -0,0 +1,104 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include + +#include "json/json.h" + +#ifdef _WIN32 +#define OS_PATH_SEP "\\" +#else +#define OS_PATH_SEP "/" +#endif + +namespace PaddleDetection { + +void load_jsonf(std::string jsonfile, Json::Value& jsondata); + +// Inference model configuration parser +class ConfigPaser { + public: + ConfigPaser() {} + + ~ConfigPaser() {} + + bool load_config(const std::string& model_dir, + const std::string& cfg = "infer_cfg") { + Json::Value config; + load_jsonf(model_dir + OS_PATH_SEP + cfg + ".json", config); + + // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face, PicoDet, HRNet + if (config.isMember("arch")) { + arch_ = config["arch"].as(); + } else { + std::cerr + << "Please set model arch," + << "support value : YOLO, SSD, RetinaNet, RCNN, Face, PicoDet, HRNet." + << std::endl; + return false; + } + + // Get draw_threshold for visualization + if (config.isMember("draw_threshold")) { + draw_threshold_ = config["draw_threshold"].as(); + } else { + std::cerr << "Please set draw_threshold." << std::endl; + return false; + } + // Get Preprocess for preprocessing + if (config.isMember("Preprocess")) { + preprocess_info_ = config["Preprocess"]; + } else { + std::cerr << "Please set Preprocess." << std::endl; + return false; + } + // Get label_list for visualization + if (config.isMember("label_list")) { + label_list_.clear(); + for (auto item : config["label_list"]) { + label_list_.emplace_back(item.as()); + } + } else { + std::cerr << "Please set label_list." 
<< std::endl; + return false; + } + + // Get NMS for postprocess + if (config.isMember("NMS")) { + nms_info_ = config["NMS"]; + } + // Get fpn_stride in PicoDet + if (config.isMember("fpn_stride")) { + fpn_stride_.clear(); + for (auto item : config["fpn_stride"]) { + fpn_stride_.emplace_back(item.as()); + } + } + + return true; + } + float draw_threshold_; + std::string arch_; + Json::Value preprocess_info_; + Json::Value nms_info_; + std::vector label_list_; + std::vector fpn_stride_; +}; + +} // namespace PaddleDetection diff --git a/deploy/lite/include/keypoint_detector.h b/deploy/lite/include/keypoint_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..d41ba0adde31b81c6a797a7c70cae7ec7fdac37d --- /dev/null +++ b/deploy/lite/include/keypoint_detector.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +#include "include/config_parser.h" +#include "include/keypoint_postprocess.h" +#include "include/preprocess_op.h" + +using namespace paddle::lite_api; // NOLINT + +namespace PaddleDetection { +// Object KeyPoint Result +struct KeyPointResult { + // Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf + std::vector keypoints; + int num_joints = -1; +}; + +// Visualiztion KeyPoint Result +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold = 0.2); + +class KeyPointDetector { + public: + explicit KeyPointDetector(const std::string& model_dir, + int cpu_threads = 1, + const int batch_size = 1, + bool use_dark = true) { + config_.load_config(model_dir); + threshold_ = config_.draw_threshold_; + use_dark_ = use_dark; + preprocessor_.Init(config_.preprocess_info_); + printf("before keypoint detector\n"); + LoadModel(model_dir, cpu_threads); + printf("create keypoint detector\n"); + } + + // Load Paddle inference model + void LoadModel(std::string model_file, int num_theads); + + // Run predictor + void Predict(const std::vector imgs, + std::vector>& center, + std::vector>& scale, + const int warmup = 0, + const int repeats = 1, + std::vector* result = nullptr, + std::vector* times = nullptr); + + // Get Model Label list + const std::vector& GetLabelList() const { + return config_.label_list_; + } + + bool use_dark(){return this->use_dark_;} + + inline float get_threshold() {return threshold_;}; + + private: + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat& image_mat); + // Postprocess result + void Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center, + std::vector>& scale); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + std::vector output_data_; + std::vector idx_data_; + float threshold_; 
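+  // Parsed infer_cfg.json of the keypoint model (preprocess ops, label list, draw_threshold, ...)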
+ ConfigPaser config_; + bool use_dark_; +}; + +} // namespace PaddleDetection diff --git a/deploy/lite/include/keypoint_postprocess.h b/deploy/lite/include/keypoint_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..4e0e54c2640104488ef85e733af1c16bdc2d86aa --- /dev/null +++ b/deploy/lite/include/keypoint_postprocess.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +std::vector get_3rd_point(std::vector& a, std::vector& b); +std::vector get_dir(float src_point_x, float src_point_y, float rot_rad); +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& x, int p, int num); +cv::Mat get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + int inv); +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine); +void box_to_center_scale(std::vector& box, + int width, + int height, + std::vector& center, + std::vector& scale); +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx); +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK = true); diff --git a/deploy/lite/include/object_detector.h b/deploy/lite/include/object_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..7874a9b8bba087f5731ac9d91ebd308a8e0d5ef2 --- /dev/null +++ b/deploy/lite/include/object_detector.h @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
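+// ObjectDetector loads a Paddle-Lite detection model (model.nb) exported from
+// PaddleDetection; preprocessing and postprocessing are driven by the
+// infer_cfg.json parsed in ConfigPaser (see include/config_parser.h).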
+ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +#include "include/config_parser.h" +#include "include/preprocess_op.h" +#include "include/utils.h" +#include "include/picodet_postprocess.h" + +using namespace paddle::lite_api; // NOLINT + +namespace PaddleDetection { + +// Generate visualization colormap for each class +std::vector GenerateColorMap(int num_class); + +// Visualiztion Detection Result +cv::Mat VisualizeResult(const cv::Mat& img, + const std::vector& results, + const std::vector& lables, + const std::vector& colormap, + const bool is_rbox); + +class ObjectDetector { + public: + explicit ObjectDetector(const std::string& model_dir, + int cpu_threads = 1, + const int batch_size = 1) { + config_.load_config(model_dir); + printf("config created\n"); + threshold_ = config_.draw_threshold_; + preprocessor_.Init(config_.preprocess_info_); + printf("before object detector\n"); + LoadModel(model_dir, cpu_threads); + printf("create object detector\n"); + } + + // Load Paddle inference model + void LoadModel(std::string model_file, int num_theads); + + // Run predictor + void Predict(const std::vector& imgs, + const double threshold = 0.5, + const int warmup = 0, + const int repeats = 1, + std::vector* result = nullptr, + std::vector* bbox_num = nullptr, + std::vector* times = nullptr); + + // Get Model Label list + const std::vector& GetLabelList() const { + return config_.label_list_; + } + + private: + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat& image_mat); + // Postprocess result + void Postprocess(const std::vector mats, + std::vector* result, + std::vector bbox_num, + bool is_rbox); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + std::vector output_data_; + std::vector out_bbox_num_data_; + float threshold_; + ConfigPaser config_; + +}; + +} // namespace PaddleDetection diff --git a/deploy/lite/include/picodet_postprocess.h b/deploy/lite/include/picodet_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..ac33e92ba167cb8a9c3bfaae9991522c358d6d0c --- /dev/null +++ b/deploy/lite/include/picodet_postprocess.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
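+// PicoDet post-processing: decodes the per-FPN-level classification scores and
+// box-distribution regression (reg_max + 1 bins per box edge) into ObjectResult
+// boxes, then applies NMS with nms_threshold.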
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "include/utils.h" + +namespace PaddleDetection { + +void PicoDetPostProcess(std::vector* results, + std::vector outs, + std::vector fpn_stride, + std::vector im_shape, + std::vector scale_factor, + float score_threshold = 0.3, + float nms_threshold = 0.5, + int num_class = 80, + int reg_max = 7); + +} // namespace PaddleDetection diff --git a/deploy/lite/include/preprocess_op.h b/deploy/lite/include/preprocess_op.h new file mode 100644 index 0000000000000000000000000000000000000000..86bb56c80a6f24afdf8a0e139639fe032f170ba3 --- /dev/null +++ b/deploy/lite/include/preprocess_op.h @@ -0,0 +1,188 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "json/json.h" + +namespace PaddleDetection { + +// Object for storing all preprocessed data +class ImageBlob { + public: + // image width and height + std::vector im_shape_; + // Buffer for image data after preprocessing + std::vector im_data_; + // in net data shape(after pad) + std::vector in_net_shape_; + // Evaluation image width and height + // std::vector eval_im_size_f_; + // Scale factor for image size to origin image size + std::vector scale_factor_; +}; + +// Abstraction of preprocessing opration class +class PreprocessOp { + public: + virtual void Init(const Json::Value& item) = 0; + virtual void Run(cv::Mat* im, ImageBlob* data) = 0; +}; + +class InitInfo : public PreprocessOp { + public: + virtual void Init(const Json::Value& item) {} + virtual void Run(cv::Mat* im, ImageBlob* data); +}; + +class NormalizeImage : public PreprocessOp { + public: + virtual void Init(const Json::Value& item) { + mean_.clear(); + scale_.clear(); + for (auto tmp : item["mean"]) { + mean_.emplace_back(tmp.as()); + } + for (auto tmp : item["std"]) { + scale_.emplace_back(tmp.as()); + } + is_scale_ = item["is_scale"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + // CHW or HWC + std::vector mean_; + std::vector scale_; + bool is_scale_; +}; + +class Permute : public PreprocessOp { + public: + virtual void Init(const Json::Value& item) {} + virtual void Run(cv::Mat* im, ImageBlob* data); +}; + +class Resize : public PreprocessOp { + public: + virtual void Init(const Json::Value& item) { + interp_ = item["interp"].as(); + // max_size_ = item["target_size"].as(); + keep_ratio_ = item["keep_ratio"].as(); + target_size_.clear(); + for (auto tmp : item["target_size"]) { + target_size_.emplace_back(tmp.as()); + } + } + + // Compute best resize scale for x-dimension, y-dimension + std::pair GenerateScale(const cv::Mat& im); + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int interp_; + bool keep_ratio_; + std::vector target_size_; + std::vector in_net_shape_; +}; + +// Models with FPN need input shape % stride == 0 +class PadStride : 
public PreprocessOp { + public: + virtual void Init(const Json::Value& item) { + stride_ = item["stride"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int stride_; +}; + +class TopDownEvalAffine : public PreprocessOp { + public: + virtual void Init(const Json::Value& item) { + trainsize_.clear(); + for (auto tmp : item["trainsize"]) { + trainsize_.emplace_back(tmp.as()); + } + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int interp_ = 1; + std::vector trainsize_; +}; + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio = 0.15); + +class Preprocessor { + public: + void Init(const Json::Value& config_node) { + // initialize image info at first + ops_["InitInfo"] = std::make_shared(); + for (const auto& item : config_node) { + auto op_name = item["type"].as(); + + ops_[op_name] = CreateOp(op_name); + ops_[op_name]->Init(item); + } + } + + std::shared_ptr CreateOp(const std::string& name) { + if (name == "Resize") { + return std::make_shared(); + } else if (name == "Permute") { + return std::make_shared(); + } else if (name == "NormalizeImage") { + return std::make_shared(); + } else if (name == "PadStride") { + // use PadStride instead of PadBatch + return std::make_shared(); + } else if (name == "TopDownEvalAffine") { + return std::make_shared(); + } + std::cerr << "can not find function of OP: " << name + << " and return: nullptr" << std::endl; + return nullptr; + } + + void Run(cv::Mat* im, ImageBlob* data); + + public: + static const std::vector RUN_ORDER; + + private: + std::unordered_map> ops_; +}; + +} // namespace PaddleDetection diff --git a/deploy/lite/include/utils.h b/deploy/lite/include/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3802e1267176a050402d1fdf742e54a79f33ffb9 --- /dev/null +++ b/deploy/lite/include/utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace PaddleDetection { + +// Object Detection Result +struct ObjectResult { + // Rectangle coordinates of detected object: left, right, top, down + std::vector rect; + // Class id of detected object + int class_id; + // Confidence of detected object + float confidence; +}; + +void nms(std::vector &input_boxes, float nms_threshold); + +} // namespace PaddleDetection \ No newline at end of file diff --git a/deploy/lite/src/config_parser.cc b/deploy/lite/src/config_parser.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c43e76c2c85d2917eb1c3384304260c591b85c --- /dev/null +++ b/deploy/lite/src/config_parser.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/config_parser.h" + +namespace PaddleDetection { + +void load_jsonf(std::string jsonfile, Json::Value &jsondata) { + std::ifstream ifs; + ifs.open(jsonfile); + + Json::CharReaderBuilder builder; + builder["collectComments"] = true; + JSONCPP_STRING errs; + if (!parseFromStream(builder, ifs, &jsondata, &errs)) { + std::cout << errs << std::endl; + return; + } +} + +} // namespace PaddleDetection diff --git a/deploy/lite/src/keypoint_detector.cc b/deploy/lite/src/keypoint_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..2be7471779355614457f52292443bf05ec73d21c --- /dev/null +++ b/deploy/lite/src/keypoint_detector.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include +#include "include/keypoint_detector.h" + +namespace PaddleDetection { + +// Load Model and create model predictor +void KeyPointDetector::LoadModel(std::string model_file, int num_theads) { + MobileConfig config; + config.set_threads(num_theads); + config.set_model_from_file(model_file + "/model.nb"); + config.set_power_mode(LITE_POWER_HIGH); + + predictor_ = std::move(CreatePaddlePredictor(config)); +} + +// Visualiztion MaskDetector results +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold) { + const int edge[][2] = {{0, 1}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 5}, + {4, 6}, + {5, 7}, + {6, 8}, + {7, 9}, + {8, 10}, + {5, 11}, + {6, 12}, + {11, 13}, + {12, 14}, + {13, 15}, + {14, 16}, + {11, 12}}; + cv::Mat vis_img = img.clone(); + for (int batchid = 0; batchid < results.size(); batchid++) { + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[i * 3] > threshold) { + int x_coord = int(results[batchid].keypoints[i * 3 + 1]); + int y_coord = int(results[batchid].keypoints[i * 3 + 2]); + cv::circle(vis_img, + cv::Point2d(x_coord, y_coord), + 1, + cv::Scalar(0, 0, 255), + 2); + } + } + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[edge[i][0] * 3] > threshold && + results[batchid].keypoints[edge[i][1] * 3] > threshold) { + int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]); + int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]); + int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]); + int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]); + cv::line(vis_img, + cv::Point2d(x_start, y_start), + cv::Point2d(x_end, y_end), + 
colormap[i], + 1); + } + } + } + return vis_img; +} + +void KeyPointDetector::Preprocess(const cv::Mat& ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + preprocessor_.Run(&im, &inputs_); +} + +void KeyPointDetector::Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center_bs, + std::vector>& scale_bs) { + std::vector preds(output_shape[1] * 3, 0); + + for (int batchid = 0; batchid < output_shape[0]; batchid++) { + get_final_preds(output, + output_shape, + idxout, + idx_shape, + center_bs[batchid], + scale_bs[batchid], + preds, + batchid, + this->use_dark()); + KeyPointResult result_item; + result_item.num_joints = output_shape[1]; + result_item.keypoints.clear(); + for (int i = 0; i < output_shape[1]; i++) { + result_item.keypoints.emplace_back(preds[i * 3]); + result_item.keypoints.emplace_back(preds[i * 3 + 1]); + result_item.keypoints.emplace_back(preds[i * 3 + 2]); + } + result->push_back(result_item); + } +} + +void KeyPointDetector::Predict(const std::vector imgs, + std::vector>& center_bs, + std::vector>& scale_bs, + const int warmup, + const int repeats, + std::vector* result, + std::vector* times) { + auto preprocess_start = std::chrono::steady_clock::now(); + int batch_size = imgs.size(); + + // in_data_batch + std::vector in_data_all; + + // Preprocess image + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + Preprocess(im); + + // TODO: reduce cost time + in_data_all.insert( + in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end()); + } + + // Prepare input tensor + + auto input_names = predictor_->GetInputNames(); + for (const auto& tensor_name : input_names) { + auto in_tensor = predictor_->GetInputByName(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Resize({batch_size, 3, rh, rw}); + auto* inptr = in_tensor->mutable_data(); + std::copy_n(in_data_all.data(), in_data_all.size(), inptr); + } + } + + auto preprocess_end = std::chrono::steady_clock::now(); + std::vector output_shape, idx_shape; + // Run predictor + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetTensor(output_names[0]); + auto idx_tensor = predictor_->GetTensor(output_names[1]); + } + + auto inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetTensor(output_names[0]); + output_shape = out_tensor->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + if (output_size < 6) { + std::cerr << "[WARNING] No object detected." 
<< std::endl; + } + output_data_.resize(output_size); + std::copy_n( + out_tensor->mutable_data(), output_size, output_data_.data()); + + auto idx_tensor = predictor_->GetTensor(output_names[1]); + idx_shape = idx_tensor->shape(); + // Calculate output length + output_size = 1; + for (int j = 0; j < idx_shape.size(); ++j) { + output_size *= idx_shape[j]; + } + idx_data_.resize(output_size); + std::copy_n( + idx_tensor->mutable_data(), output_size, idx_data_.data()); + } + auto inference_end = std::chrono::steady_clock::now(); + auto postprocess_start = std::chrono::steady_clock::now(); + // Postprocessing result + Postprocess(output_data_, + output_shape, + idx_data_, + idx_shape, + result, + center_bs, + scale_bs); + auto postprocess_end = std::chrono::steady_clock::now(); + + std::chrono::duration preprocess_diff = + preprocess_end - preprocess_start; + times->push_back(double(preprocess_diff.count() * 1000)); + std::chrono::duration inference_diff = inference_end - inference_start; + times->push_back(double(inference_diff.count() / repeats * 1000)); + std::chrono::duration postprocess_diff = + postprocess_end - postprocess_start; + times->push_back(double(postprocess_diff.count() * 1000)); +} + +} // namespace PaddleDetection diff --git a/deploy/lite/src/keypoint_postprocess.cc b/deploy/lite/src/keypoint_postprocess.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c75ece87c2c8f743f0f112ab6bd23fdcc96a270 --- /dev/null +++ b/deploy/lite/src/keypoint_postprocess.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
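+// Top-down keypoint post-processing: maps heatmap peak locations back to the
+// original image space using the detection box center/scale, with optional
+// DARK sub-pixel refinement (dark_parse) of the peak coordinates.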
+ +#include "include/keypoint_postprocess.h" +#define PI 3.1415926535 +#define HALF_CIRCLE_DEGREE 180 + +cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) { + cv::Point2f direct{a.x - b.x, a.y - b.y}; + return cv::Point2f(a.x - direct.y, a.y + direct.x); +} + +std::vector get_dir(float src_point_x, + float src_point_y, + float rot_rad) { + float sn = sin(rot_rad); + float cs = cos(rot_rad); + std::vector src_result{0.0, 0.0}; + src_result[0] = src_point_x * cs - src_point_y * sn; + src_result[1] = src_point_x * sn + src_point_y * cs; + return src_result; +} + +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& preds, int p) { + double new1[3] = {pt_x, pt_y, 1.0}; + cv::Mat new_pt(3, 1, trans.type(), new1); + cv::Mat w = trans * new_pt; + preds[p * 3 + 1] = static_cast(w.at(0, 0)); + preds[p * 3 + 2] = static_cast(w.at(1, 0)); +} + +void get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + cv::Mat& trans, + int inv) { + float src_w = scale[0]; + float dst_w = static_cast(output_size[0]); + float dst_h = static_cast(output_size[1]); + float rot_rad = rot * PI / HALF_CIRCLE_DEGREE; + std::vector src_dir = get_dir(-0.5 * src_w, 0, rot_rad); + std::vector dst_dir{static_cast(-0.5) * dst_w, 0.0}; + cv::Point2f srcPoint2f[3], dstPoint2f[3]; + srcPoint2f[0] = cv::Point2f(center[0], center[1]); + srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]); + srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]); + + dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5); + dstPoint2f[1] = + cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]); + dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]); + if (inv == 0) { + trans = cv::getAffineTransform(srcPoint2f, dstPoint2f); + } else { + trans = cv::getAffineTransform(dstPoint2f, srcPoint2f); + } +} + +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine=false) { + if (affine) { + cv::Mat trans(2, 3, CV_64FC1); + get_affine_transform(center, scale, 0, output_size, trans, 1); + for (int p = 0; p < dim[1]; ++p) { + affine_tranform( + coords[p * 2], coords[p * 2 + 1], trans, target_coords, p); + } + } else { + float heat_w = static_cast(output_size[0]); + float heat_h = static_cast(output_size[1]); + float x_scale = scale[0] / heat_w; + float y_scale = scale[1] / heat_h; + float offset_x = center[0] - scale[0] / 2.; + float offset_y = center[1] - scale[1] / 2.; + for (int i = 0; i < dim[1]; i++) { + target_coords[i * 3 + 1] = x_scale * coords[i * 2] + offset_x; + target_coords[i * 3 + 2] = y_scale * coords[i * 2 + 1] + offset_y; + } + } +} + +// only for batchsize == 1 +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx) { + int num_joints = dim[1]; + int width = dim[3]; + std::vector idx; + idx.resize(num_joints * 2); + + for (int j = 0; j < dim[1]; j++) { + float* index = &( + heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]); + float* end = index + dim[2] * dim[3]; + float* max_dis = std::max_element(index, end); + auto max_id = std::distance(index, max_dis); + maxvals[j] = *max_dis; + if (*max_dis > 0) { + preds[j * 2] = static_cast(max_id % width); + preds[j * 2 + 1] = static_cast(max_id / width); + } + } +} + + +void dark_parse(std::vector& heatmap, + std::vector& dim, + std::vector& coords, + 
int px, + int py, + int index, + int ch){ + /*DARK postpocessing, Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + 1) offset = - hassian.inv() * derivative + 2) dx = (heatmap[x+1] - heatmap[x-1])/2. + 3) dxx = (dx[x+1] - dx[x-1])/2. + 4) derivative = Mat([dx, dy]) + 5) hassian = Mat([[dxx, dxy], [dxy, dyy]]) + */ + std::vector::const_iterator first1 = heatmap.begin() + index; + std::vector::const_iterator last1 = heatmap.begin() + index + dim[2] * dim[3]; + std::vector heatmap_ch(first1, last1); + cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0,dim[2]); + heatmap_mat.convertTo(heatmap_mat, CV_32FC1); + cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0); + heatmap_mat = heatmap_mat.reshape(1,1); + heatmap_ch = std::vector(heatmap_mat.reshape(1,1)); + + float epsilon = 1e-10; + //sample heatmap to get values in around target location + float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon)); + float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon)); + float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon)); + + float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon)); + float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon)); + float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon)); + float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon)); + float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon)); + float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon)); + float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon)); + float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon)); + float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon)); + float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon)); + + //compute dx/dy and dxx/dyy with sampled values + float dx = 0.5 * (xr - xl); + float dy = 0.5 * (yu - yd); + float dxx = 0.25 * (xr2 - 2*xy + xl2); + float dxy = 0.25 * (xryu - xryd - xlyu + xlyd); + float dyy = 0.25 * (yu2 - 2*xy + yd2); + + //finally get offset by derivative and hassian, which combined by dx/dy and dxx/dyy + if(dxx * dyy - dxy*dxy != 0){ + float M[2][2] = {dxx, dxy, dxy, dyy}; + float D[2] = {dx, dy}; + cv::Mat hassian(2,2,CV_32F,M); + cv::Mat derivative(2,1,CV_32F,D); + cv::Mat offset = - hassian.inv() * derivative; + coords[ch * 2] += offset.at(0,0); + coords[ch * 2 + 1] += offset.at(1,0); + } +} + +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK) { + std::vector coords; + coords.resize(dim[1] * 2); + int heatmap_height = dim[2]; + int heatmap_width = dim[3]; + + for (int j = 0; j < dim[1]; ++j) { + int index = (batchid * dim[1] + j) * dim[2] * dim[3]; + + int idx = idxout[batchid * dim[1] + j]; + preds[j * 3] = heatmap[index + idx]; + coords[j * 2] = idx % heatmap_width; + coords[j * 2 + 1] = idx / heatmap_width; + + int px = int(coords[j * 2] + 0.5); + int py = int(coords[j * 2 + 1] + 0.5); + + if(DARK && px > 1 && px < heatmap_width - 2){ + dark_parse(heatmap, dim, coords, px, py, index, j); + } + else{ + if (px > 0 && px < heatmap_width - 1) { + float diff_x = heatmap[index + py * dim[3] + px + 1] - + heatmap[index + py * dim[3] + px - 1]; + coords[j * 2] += diff_x > 0 ? 
1 : -1 * 0.25; + } + if (py > 0 && py < heatmap_height - 1) { + float diff_y = heatmap[index + (py + 1) * dim[3] + px] - + heatmap[index + (py - 1) * dim[3] + px]; + coords[j * 2 + 1] += diff_y > 0 ? 1 : -1 * 0.25; + } + } + } + + std::vector img_size{heatmap_width, heatmap_height}; + transform_preds(coords, center, scale, img_size, dim, preds); +} diff --git a/deploy/lite/src/main.cc b/deploy/lite/src/main.cc new file mode 100644 index 0000000000000000000000000000000000000000..51f3b338064a90e7b7fd411f964d08ce72f4441e --- /dev/null +++ b/deploy/lite/src/main.cc @@ -0,0 +1,388 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/config_parser.h" +#include "include/keypoint_detector.h" +#include "include/object_detector.h" +#include "include/preprocess_op.h" +#include "json/json.h" + +Json::Value RT_Config; + +void PrintBenchmarkLog(std::vector det_time, int img_num) { + std::cout << "----------------------- Config info -----------------------" + << std::endl; + std::cout << "num_threads: " << RT_Config["cpu_threads"].as() + << std::endl; + std::cout << "----------------------- Data info -----------------------" + << std::endl; + std::cout << "batch_size_det: " << RT_Config["batch_size_det"].as() + << std::endl; + std::cout << "----------------------- Model info -----------------------" + << std::endl; + RT_Config["model_dir_det"].as().erase( + RT_Config["model_dir_det"].as().find_last_not_of("/") + 1); + std::cout << "detection model_name: " + << RT_Config["model_dir_det"].as() << std::endl; + std::cout << "----------------------- Perf info ------------------------" + << std::endl; + std::cout << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0.) + << std::endl; + img_num = std::max(1, img_num); + std::cout << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num << std::endl; +} + +void PrintKptsBenchmarkLog(std::vector det_time, int img_num) { + std::cout << "----------------------- Data info -----------------------" + << std::endl; + std::cout << "batch_size_keypoint: " + << RT_Config["batch_size_keypoint"].as() << std::endl; + std::cout << "----------------------- Model info -----------------------" + << std::endl; + RT_Config["model_dir_keypoint"].as().erase( + RT_Config["model_dir_keypoint"].as().find_last_not_of("/") + + 1); + std::cout << "keypoint model_name: " + << RT_Config["model_dir_keypoint"].as() << std::endl; + std::cout << "----------------------- Perf info ------------------------" + << std::endl; + std::cout << "Total number of predicted data: " << img_num + << " and total time spent(ms): " + << std::accumulate(det_time.begin(), det_time.end(), 0.) 
+ << std::endl; + img_num = std::max(1, img_num); + std::cout << "Average time cost per person:" << std::endl + << "preproce_time(ms): " << det_time[0] / img_num + << ", inference_time(ms): " << det_time[1] / img_num + << ", postprocess_time(ms): " << det_time[2] / img_num << std::endl; +} + +void PrintTotalIimeLog(double det_time, + double keypoint_time, + double crop_time) { + std::cout << "----------------------- Time info ------------------------" + << std::endl; + std::cout << "Total Pipeline time(ms) per image: " + << det_time + keypoint_time + crop_time << std::endl; + std::cout << "Average det time(ms) per image: " << det_time + << ", average keypoint time(ms) per image: " << keypoint_time + << ", average crop time(ms) per image: " << crop_time << std::endl; +} + +static std::string DirName(const std::string& filepath) { + auto pos = filepath.rfind(OS_PATH_SEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static bool PathExists(const std::string& path) { + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +} + +static void MkDir(const std::string& path) { + if (PathExists(path)) return; + int ret = 0; + ret = mkdir(path.c_str(), 0755); + if (ret != 0) { + std::string path_error(path); + path_error += " mkdir failed!"; + throw std::runtime_error(path_error); + } +} + +static void MkDirs(const std::string& path) { + if (path.empty()) return; + if (PathExists(path)) return; + + MkDirs(DirName(path)); + MkDir(path); +} + +void PredictImage(const std::vector all_img_paths, + const int batch_size_det, + const double threshold_det, + const bool run_benchmark, + PaddleDetection::ObjectDetector* det, + PaddleDetection::KeyPointDetector* keypoint, + const std::string& output_dir = "output") { + std::vector det_t = {0, 0, 0}; + int steps = ceil(static_cast(all_img_paths.size()) / batch_size_det); + int kpts_imgs = 0; + std::vector keypoint_t = {0, 0, 0}; + double midtimecost = 0; + for (int idx = 0; idx < steps; idx++) { + std::vector batch_imgs; + int left_image_cnt = all_img_paths.size() - idx * batch_size_det; + if (left_image_cnt > batch_size_det) { + left_image_cnt = batch_size_det; + } + for (int bs = 0; bs < left_image_cnt; bs++) { + std::string image_file_path = all_img_paths.at(idx * batch_size_det + bs); + cv::Mat im = cv::imread(image_file_path, 1); + batch_imgs.insert(batch_imgs.end(), im); + } + // Store all detected result + std::vector result; + std::vector bbox_num; + std::vector det_times; + + // Store keypoint results + std::vector result_kpts; + std::vector imgs_kpts; + std::vector> center_bs; + std::vector> scale_bs; + std::vector colormap_kpts = PaddleDetection::GenerateColorMap(20); + bool is_rbox = false; + if (run_benchmark) { + det->Predict( + batch_imgs, threshold_det, 50, 50, &result, &bbox_num, &det_times); + } else { + det->Predict( + batch_imgs, threshold_det, 0, 1, &result, &bbox_num, &det_times); + } + + // get labels and colormap + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + int item_start_idx = 0; + for (int i = 0; i < left_image_cnt; i++) { + cv::Mat im = batch_imgs[i]; + std::vector im_result; + int detect_num = 0; + for (int j = 0; j < bbox_num[i]; j++) { + PaddleDetection::ObjectResult item = result[item_start_idx + j]; + if (item.confidence < threshold_det || item.class_id == -1) { + continue; + } + detect_num += 1; + im_result.push_back(item); + if (item.rect.size() > 6) { + is_rbox = true; + printf("class=%d confidence=%.4f 
rect=[%d %d %d %d %d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3], + item.rect[4], + item.rect[5], + item.rect[6], + item.rect[7]); + } else { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + } + std::cout << all_img_paths.at(idx * batch_size_det + i) + << " The number of detected box: " << detect_num << std::endl; + item_start_idx = item_start_idx + bbox_num[i]; + + std::vector compression_params; + compression_params.push_back(cv::IMWRITE_JPEG_QUALITY); + compression_params.push_back(95); + std::string output_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + output_path += OS_PATH_SEP; + } + std::string image_file_path = all_img_paths.at(idx * batch_size_det + i); + if (keypoint) { + int imsize = im_result.size(); + for (int i = 0; i < imsize; i++) { + auto keypoint_start_time = std::chrono::steady_clock::now(); + auto item = im_result[i]; + cv::Mat crop_img; + std::vector keypoint_times; + std::vector rect = { + item.rect[0], item.rect[1], item.rect[2], item.rect[3]}; + std::vector center; + std::vector scale; + if (item.class_id == 0) { + PaddleDetection::CropImg(im, crop_img, rect, center, scale); + center_bs.emplace_back(center); + scale_bs.emplace_back(scale); + imgs_kpts.emplace_back(crop_img); + kpts_imgs += 1; + } + auto keypoint_crop_time = std::chrono::steady_clock::now(); + + std::chrono::duration midtimediff = + keypoint_crop_time - keypoint_start_time; + midtimecost += static_cast(midtimediff.count() * 1000); + + if (imgs_kpts.size() == RT_Config["batch_size_keypoint"].as() || + ((i == imsize - 1) && !imgs_kpts.empty())) { + if (run_benchmark) { + keypoint->Predict(imgs_kpts, + center_bs, + scale_bs, + 10, + 10, + &result_kpts, + &keypoint_times); + } else { + keypoint->Predict(imgs_kpts, + center_bs, + scale_bs, + 0, + 1, + &result_kpts, + &keypoint_times); + } + imgs_kpts.clear(); + center_bs.clear(); + scale_bs.clear(); + keypoint_t[0] += keypoint_times[0]; + keypoint_t[1] += keypoint_times[1]; + keypoint_t[2] += keypoint_times[2]; + } + } + std::string kpts_savepath = + output_path + "keypoint_" + + image_file_path.substr(image_file_path.find_last_of('/') + 1); + cv::Mat kpts_vis_img = VisualizeKptsResult( + im, result_kpts, colormap_kpts, keypoint->get_threshold()); + cv::imwrite(kpts_savepath, kpts_vis_img, compression_params); + printf("Visualized output saved as %s\n", kpts_savepath.c_str()); + } else { + // Visualization result + cv::Mat vis_img = PaddleDetection::VisualizeResult( + im, im_result, labels, colormap, is_rbox); + std::string det_savepath = + output_path + "result_" + + image_file_path.substr(image_file_path.find_last_of('/') + 1); + cv::imwrite(det_savepath, vis_img, compression_params); + printf("Visualized output saved as %s\n", det_savepath.c_str()); + } + } + + det_t[0] += det_times[0]; + det_t[1] += det_times[1]; + det_t[2] += det_times[2]; + } + PrintBenchmarkLog(det_t, all_img_paths.size()); + if (keypoint) { + PrintKptsBenchmarkLog(keypoint_t, kpts_imgs); + PrintTotalIimeLog( + (det_t[0] + det_t[1] + det_t[2]) / all_img_paths.size(), + (keypoint_t[0] + keypoint_t[1] + keypoint_t[2]) / all_img_paths.size(), + midtimecost / all_img_paths.size()); + } +} + +int main(int argc, char** argv) { + std::cout << "Usage: " << argv[0] << " [config_path] [image_dir](option)\n"; + if (argc < 2) { + std::cout << "Usage: ./main 
det_runtime_config.json" << std::endl; + return -1; + } + std::string config_path = argv[1]; + std::string img_path = ""; + + if (argc >= 3) { + img_path = argv[2]; + } + // Parsing command-line + PaddleDetection::load_jsonf(config_path, RT_Config); + if (RT_Config["model_dir_det"].as().empty()) { + std::cout << "Please set [model_det_dir] in " << config_path << std::endl; + return -1; + } + if (RT_Config["image_file"].as().empty() && + RT_Config["image_dir"].as().empty() && img_path.empty()) { + std::cout << "Please set [image_file] or [image_dir] in " << config_path + << " Or use command: <" << argv[0] << " [image_dir]>" + << std::endl; + return -1; + } + if (!img_path.empty()) { + std::cout << "Use image_dir in command line overide the path in config file" + << std::endl; + RT_Config["image_dir"] = img_path; + RT_Config["image_file"] = ""; + } + // Load model and create a object detector + PaddleDetection::ObjectDetector det( + RT_Config["model_dir_det"].as(), + RT_Config["cpu_threads"].as(), + RT_Config["batch_size_det"].as()); + + PaddleDetection::KeyPointDetector* keypoint = nullptr; + if (!RT_Config["model_dir_keypoint"].as().empty()) { + keypoint = new PaddleDetection::KeyPointDetector( + RT_Config["model_dir_keypoint"].as(), + RT_Config["cpu_threads"].as(), + RT_Config["batch_size_keypoint"].as(), + RT_Config["use_dark_decode"].as()); + RT_Config["batch_size_det"] = 1; + printf( + "batchsize of detection forced to be 1 while keypoint model is not " + "empty()"); + } + // Do inference on input image + + if (!RT_Config["image_file"].as().empty() || + !RT_Config["image_dir"].as().empty()) { + if (!PathExists(RT_Config["output_dir"].as())) { + MkDirs(RT_Config["output_dir"].as()); + } + std::vector all_img_paths; + std::vector cv_all_img_paths; + if (!RT_Config["image_file"].as().empty()) { + all_img_paths.push_back(RT_Config["image_file"].as()); + if (RT_Config["batch_size_det"].as() > 1) { + std::cout << "batch_size_det should be 1, when set `image_file`." + << std::endl; + return -1; + } + } else { + cv::glob(RT_Config["image_dir"].as(), cv_all_img_paths); + for (const auto& img_path : cv_all_img_paths) { + all_img_paths.push_back(img_path); + } + } + PredictImage(all_img_paths, + RT_Config["batch_size_det"].as(), + RT_Config["threshold_det"].as(), + RT_Config["run_benchmark"].as(), + &det, + keypoint, + RT_Config["output_dir"].as()); + } + delete keypoint; + keypoint = nullptr; + return 0; +} diff --git a/deploy/lite/src/object_detector.cc b/deploy/lite/src/object_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..0909bd9194679485fd2a8b735ff6f7ffdb0bb2c9 --- /dev/null +++ b/deploy/lite/src/object_detector.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +// for setprecision +#include +#include +#include "include/object_detector.h" + +namespace PaddleDetection { + +// Load Model and create model predictor +void ObjectDetector::LoadModel(std::string model_file, int num_theads) { + MobileConfig config; + config.set_threads(num_theads); + config.set_model_from_file(model_file + "/model.nb"); + config.set_power_mode(LITE_POWER_HIGH); + + predictor_ = CreatePaddlePredictor(config); +} + +// Visualiztion MaskDetector results +cv::Mat VisualizeResult(const cv::Mat& img, + const std::vector& results, + const std::vector& lables, + const std::vector& colormap, + const bool is_rbox = false) { + cv::Mat vis_img = img.clone(); + for (int i = 0; i < results.size(); ++i) { + // Configure color and text size + std::ostringstream oss; + oss << std::setiosflags(std::ios::fixed) << std::setprecision(4); + oss << lables[results[i].class_id] << " "; + oss << results[i].confidence; + std::string text = oss.str(); + int c1 = colormap[3 * results[i].class_id + 0]; + int c2 = colormap[3 * results[i].class_id + 1]; + int c3 = colormap[3 * results[i].class_id + 2]; + cv::Scalar roi_color = cv::Scalar(c1, c2, c3); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 0.5f; + float thickness = 0.5; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + cv::Point origin; + + if (is_rbox) { + // Draw object, text, and background + for (int k = 0; k < 4; k++) { + cv::Point pt1 = cv::Point(results[i].rect[(k * 2) % 8], + results[i].rect[(k * 2 + 1) % 8]); + cv::Point pt2 = cv::Point(results[i].rect[(k * 2 + 2) % 8], + results[i].rect[(k * 2 + 3) % 8]); + cv::line(vis_img, pt1, pt2, roi_color, 2); + } + } else { + int w = results[i].rect[2] - results[i].rect[0]; + int h = results[i].rect[3] - results[i].rect[1]; + cv::Rect roi = cv::Rect(results[i].rect[0], results[i].rect[1], w, h); + // Draw roi object, text, and background + cv::rectangle(vis_img, roi, roi_color, 2); + } + + origin.x = results[i].rect[0]; + origin.y = results[i].rect[1]; + + // Configure text background + cv::Rect text_back = cv::Rect(results[i].rect[0], + results[i].rect[1] - text_size.height, + text_size.width, + text_size.height); + // Draw text, and background + cv::rectangle(vis_img, text_back, roi_color, -1); + cv::putText(vis_img, + text, + origin, + font_face, + font_scale, + cv::Scalar(255, 255, 255), + thickness); + } + return vis_img; +} + +void ObjectDetector::Preprocess(const cv::Mat& ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + preprocessor_.Run(&im, &inputs_); +} + +void ObjectDetector::Postprocess(const std::vector mats, + std::vector* result, + std::vector bbox_num, + bool is_rbox = false) { + result->clear(); + int start_idx = 0; + for (int im_id = 0; im_id < mats.size(); im_id++) { + cv::Mat raw_mat = mats[im_id]; + int rh = 1; + int rw = 1; + if (config_.arch_ == "Face") { + rh = raw_mat.rows; + rw = raw_mat.cols; + } + for (int j = start_idx; j < start_idx + bbox_num[im_id]; j++) { + if (is_rbox) { + // Class id + int class_id = static_cast(round(output_data_[0 + j * 10])); + // Confidence score + float score = output_data_[1 + j * 10]; + int x1 = (output_data_[2 + j * 10] * rw); + int y1 = (output_data_[3 + j * 10] * rh); + int x2 = (output_data_[4 + j * 10] * rw); + int y2 = (output_data_[5 + j * 10] * rh); + int x3 = (output_data_[6 + j * 10] * rw); + int y3 = (output_data_[7 + j * 10] * rh); + int x4 = 
(output_data_[8 + j * 10] * rw); + int y4 = (output_data_[9 + j * 10] * rh); + + PaddleDetection::ObjectResult result_item; + result_item.rect = {x1, y1, x2, y2, x3, y3, x4, y4}; + result_item.class_id = class_id; + result_item.confidence = score; + result->push_back(result_item); + } else { + // Class id + int class_id = static_cast(round(output_data_[0 + j * 6])); + // Confidence score + float score = output_data_[1 + j * 6]; + int xmin = (output_data_[2 + j * 6] * rw); + int ymin = (output_data_[3 + j * 6] * rh); + int xmax = (output_data_[4 + j * 6] * rw); + int ymax = (output_data_[5 + j * 6] * rh); + int wd = xmax - xmin; + int hd = ymax - ymin; + + PaddleDetection::ObjectResult result_item; + result_item.rect = {xmin, ymin, xmax, ymax}; + result_item.class_id = class_id; + result_item.confidence = score; + result->push_back(result_item); + } + } + start_idx += bbox_num[im_id]; + } +} + +void ObjectDetector::Predict(const std::vector& imgs, + const double threshold, + const int warmup, + const int repeats, + std::vector* result, + std::vector* bbox_num, + std::vector* times) { + auto preprocess_start = std::chrono::steady_clock::now(); + int batch_size = imgs.size(); + + // in_data_batch + std::vector in_data_all; + std::vector im_shape_all(batch_size * 2); + std::vector scale_factor_all(batch_size * 2); + // Preprocess image + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + Preprocess(im); + im_shape_all[bs_idx * 2] = inputs_.im_shape_[0]; + im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1]; + + scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0]; + scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1]; + + // TODO: reduce cost time + in_data_all.insert( + in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end()); + } + auto preprocess_end = std::chrono::steady_clock::now(); + std::vector output_data_list_; + // Prepare input tensor + + auto input_names = predictor_->GetInputNames(); + for (const auto& tensor_name : input_names) { + auto in_tensor = predictor_->GetInputByName(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Resize({batch_size, 3, rh, rw}); + auto* inptr = in_tensor->mutable_data(); + std::copy_n(in_data_all.data(), in_data_all.size(), inptr); + } else if (tensor_name == "im_shape") { + in_tensor->Resize({batch_size, 2}); + auto* inptr = in_tensor->mutable_data(); + std::copy_n(im_shape_all.data(), im_shape_all.size(), inptr); + } else if (tensor_name == "scale_factor") { + in_tensor->Resize({batch_size, 2}); + auto* inptr = in_tensor->mutable_data(); + std::copy_n(scale_factor_all.data(), scale_factor_all.size(), inptr); + } + } + + // Run predictor + // warmup + for (int i = 0; i < warmup; i++) { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + if (config_.arch_ == "PicoDet") { + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = predictor_->GetTensor(output_names[j]); + const float* outptr = output_tensor->data(); + std::vector output_shape = output_tensor->shape(); + output_data_list_.push_back(outptr); + } + } else { + auto out_tensor = predictor_->GetTensor(output_names[0]); + auto out_bbox_num = predictor_->GetTensor(output_names[1]); + } + } + + bool is_rbox = false; + auto inference_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) { + predictor_->Run(); + } + auto inference_end = std::chrono::steady_clock::now(); + 
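+  // Timing note: the preprocess/postprocess durations recorded further below are per
+  // batch, while the inference duration is divided by `repeats` before being reported.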
auto postprocess_start = std::chrono::steady_clock::now(); + // Get output tensor + output_data_list_.clear(); + int num_class = 80; + int reg_max = 7; + auto output_names = predictor_->GetOutputNames(); + // TODO: Unified model output. + if (config_.arch_ == "PicoDet") { + for (int i = 0; i < output_names.size(); i++) { + auto output_tensor = predictor_->GetTensor(output_names[i]); + const float* outptr = output_tensor->data(); + std::vector output_shape = output_tensor->shape(); + if (i == 0) { + num_class = output_shape[2]; + } + if (i == config_.fpn_stride_.size()) { + reg_max = output_shape[2] / 4 - 1; + } + output_data_list_.push_back(outptr); + } + } else { + auto output_tensor = predictor_->GetTensor(output_names[0]); + auto output_shape = output_tensor->shape(); + auto out_bbox_num = predictor_->GetTensor(output_names[1]); + auto out_bbox_num_shape = out_bbox_num->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + is_rbox = output_shape[output_shape.size() - 1] % 10 == 0; + + if (output_size < 6) { + std::cerr << "[WARNING] No object detected." << std::endl; + } + output_data_.resize(output_size); + std::copy_n( + output_tensor->mutable_data(), output_size, output_data_.data()); + + int out_bbox_num_size = 1; + for (int j = 0; j < out_bbox_num_shape.size(); ++j) { + out_bbox_num_size *= out_bbox_num_shape[j]; + } + out_bbox_num_data_.resize(out_bbox_num_size); + std::copy_n(out_bbox_num->mutable_data(), + out_bbox_num_size, + out_bbox_num_data_.data()); + } + // Postprocessing result + result->clear(); + if (config_.arch_ == "PicoDet") { + PaddleDetection::PicoDetPostProcess( + result, output_data_list_, config_.fpn_stride_, + inputs_.im_shape_, inputs_.scale_factor_, + config_.nms_info_["score_threshold"].as(), + config_.nms_info_["nms_threshold"].as(), num_class, reg_max); + bbox_num->push_back(result->size()); + } else { + Postprocess(imgs, result, out_bbox_num_data_, is_rbox); + bbox_num->clear(); + for (int k = 0; k < out_bbox_num_data_.size(); k++) { + int tmp = out_bbox_num_data_[k]; + bbox_num->push_back(tmp); + } + } + auto postprocess_end = std::chrono::steady_clock::now(); + + std::chrono::duration preprocess_diff = + preprocess_end - preprocess_start; + times->push_back(double(preprocess_diff.count() * 1000)); + std::chrono::duration inference_diff = inference_end - inference_start; + times->push_back(double(inference_diff.count() / repeats * 1000)); + std::chrono::duration postprocess_diff = + postprocess_end - postprocess_start; + times->push_back(double(postprocess_diff.count() * 1000)); +} + +std::vector GenerateColorMap(int num_class) { + auto colormap = std::vector(3 * num_class, 0); + for (int i = 0; i < num_class; ++i) { + int j = 0; + int lab = i; + while (lab) { + colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j)); + colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)); + colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)); + ++j; + lab >>= 3; + } + } + return colormap; +} + +} // namespace PaddleDetection diff --git a/deploy/lite/src/picodet_postprocess.cc b/deploy/lite/src/picodet_postprocess.cc new file mode 100644 index 0000000000000000000000000000000000000000..32625249fabf04745ea239a6ec924df244426c86 --- /dev/null +++ b/deploy/lite/src/picodet_postprocess.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// The code is based on: +// https://github.com/RangiLyu/nanodet/blob/main/demo_mnn/nanodet_mnn.cpp + +#include "include/picodet_postprocess.h" + +namespace PaddleDetection { + +float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} + +// PicoDet decode +PaddleDetection::ObjectResult +disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y, + int stride, std::vector im_shape, int reg_max) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[reg_max + 1]; + activation_function_softmax(dfl_det + i * (reg_max + 1), dis_after_sm, + reg_max + 1); + for (int j = 0; j < reg_max + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + int xmin = (int)(std::max)(ct_x - dis_pred[0], .0f); + int ymin = (int)(std::max)(ct_y - dis_pred[1], .0f); + int xmax = (int)(std::min)(ct_x + dis_pred[2], (float)im_shape[0]); + int ymax = (int)(std::min)(ct_y + dis_pred[3], (float)im_shape[1]); + + PaddleDetection::ObjectResult result_item; + result_item.rect = {xmin, ymin, xmax, ymax}; + result_item.class_id = label; + result_item.confidence = score; + + return result_item; +} + +void PicoDetPostProcess(std::vector *results, + std::vector outs, + std::vector fpn_stride, + std::vector im_shape, + std::vector scale_factor, float score_threshold, + float nms_threshold, int num_class, int reg_max) { + std::vector> bbox_results; + bbox_results.resize(num_class); + int in_h = im_shape[0], in_w = im_shape[1]; + for (int i = 0; i < fpn_stride.size(); ++i) { + int feature_h = ceil((float)in_h / fpn_stride[i]); + int feature_w = ceil((float)in_w / fpn_stride[i]); + for (int idx = 0; idx < feature_h * feature_w; idx++) { + const float *scores = outs[i] + (idx * num_class); + + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + for (int label = 0; label < num_class; label++) { + if (scores[label] > score) { + score = scores[label]; + cur_label = label; + } + } + if (score > score_threshold) { + const float *bbox_pred = + outs[i + fpn_stride.size()] + (idx * 4 * (reg_max + 1)); + bbox_results[cur_label].push_back( + disPred2Bbox(bbox_pred, cur_label, score, col, row, fpn_stride[i], + im_shape, reg_max)); + } + } + } + for (int i = 0; i < (int)bbox_results.size(); i++) { + PaddleDetection::nms(bbox_results[i], nms_threshold); + + for (auto box : bbox_results[i]) { + 
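+      // Map each box from the resized network input back to the original image:
+      // x coordinates are divided by scale_factor[1] and y coordinates by
+      // scale_factor[0] (the width and height ratios set in Resize::Run).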
box.rect[0] = box.rect[0] / scale_factor[1]; + box.rect[2] = box.rect[2] / scale_factor[1]; + box.rect[1] = box.rect[1] / scale_factor[0]; + box.rect[3] = box.rect[3] / scale_factor[0]; + results->push_back(box); + } + } +} + +} // namespace PaddleDetection diff --git a/deploy/lite/src/preprocess_op.cc b/deploy/lite/src/preprocess_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbbc5adb1d431c800b0624107d8c281f4b53c9cd --- /dev/null +++ b/deploy/lite/src/preprocess_op.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "include/preprocess_op.h" + +namespace PaddleDetection { + +void InitInfo::Run(cv::Mat* im, ImageBlob* data) { + data->im_shape_ = {static_cast(im->rows), + static_cast(im->cols)}; + data->scale_factor_ = {1., 1.}; + data->in_net_shape_ = {static_cast(im->rows), + static_cast(im->cols)}; +} + +void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) { + double e = 1.0; + if (is_scale_) { + e *= 1./255.0; + } + (*im).convertTo(*im, CV_32FC3, e); + for (int h = 0; h < im->rows; h++) { + for (int w = 0; w < im->cols; w++) { + im->at(h, w)[0] = + (im->at(h, w)[0] - mean_[0]) / scale_[0]; + im->at(h, w)[1] = + (im->at(h, w)[1] - mean_[1]) / scale_[1]; + im->at(h, w)[2] = + (im->at(h, w)[2] - mean_[2]) / scale_[2]; + } + } +} + +void Permute::Run(cv::Mat* im, ImageBlob* data) { + (*im).convertTo(*im, CV_32FC3); + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + (data->im_data_).resize(rc * rh * rw); + float* base = (data->im_data_).data(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i); + } +} + +void Resize::Run(cv::Mat* im, ImageBlob* data) { + auto resize_scale = GenerateScale(*im); + data->im_shape_ = {static_cast(im->cols * resize_scale.first), + static_cast(im->rows * resize_scale.second)}; + data->in_net_shape_ = {static_cast(im->cols * resize_scale.first), + static_cast(im->rows * resize_scale.second)}; + cv::resize( + *im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_); + data->im_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; + data->scale_factor_ = { + resize_scale.second, resize_scale.first, + }; +} + +std::pair Resize::GenerateScale(const cv::Mat& im) { + std::pair resize_scale; + int origin_w = im.cols; + int origin_h = im.rows; + + if (keep_ratio_) { + int im_size_max = std::max(origin_w, origin_h); + int im_size_min = std::min(origin_w, origin_h); + int target_size_max = + *std::max_element(target_size_.begin(), target_size_.end()); + int target_size_min = + *std::min_element(target_size_.begin(), target_size_.end()); + float scale_min = + static_cast(target_size_min) / static_cast(im_size_min); + float scale_max = + static_cast(target_size_max) / static_cast(im_size_max); + float scale_ratio = std::min(scale_min, scale_max); + resize_scale = {scale_ratio, scale_ratio}; + } else { + 
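+    // keep_ratio_ == false: width and height are scaled independently,
+    // width to target_size_[1] and height to target_size_[0].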
resize_scale.first = + static_cast(target_size_[1]) / static_cast(origin_w); + resize_scale.second = + static_cast(target_size_[0]) / static_cast(origin_h); + } + return resize_scale; +} + +void PadStride::Run(cv::Mat* im, ImageBlob* data) { + if (stride_ <= 0) { + return; + } + int rc = im->channels(); + int rh = im->rows; + int rw = im->cols; + int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_; + int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_; + cv::copyMakeBorder( + *im, *im, 0, nh - rh, 0, nw - rw, cv::BORDER_CONSTANT, cv::Scalar(0)); + data->in_net_shape_ = { + static_cast(im->rows), static_cast(im->cols), + }; +} + +void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) { + cv::resize(*im, *im, cv::Size(trainsize_[0], trainsize_[1]), 0, 0, interp_); + // todo: Simd::ResizeBilinear(); + data->in_net_shape_ = { + static_cast(trainsize_[1]), static_cast(trainsize_[0]), + }; +} + +// Preprocessor op running order +const std::vector Preprocessor::RUN_ORDER = {"InitInfo", + "TopDownEvalAffine", + "Resize", + "NormalizeImage", + "PadStride", + "Permute"}; + +void Preprocessor::Run(cv::Mat* im, ImageBlob* data) { + for (const auto& name : RUN_ORDER) { + if (ops_.find(name) != ops_.end()) { + ops_[name]->Run(im, data); + } + } +} + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio) { + int crop_x1 = std::max(0, area[0]); + int crop_y1 = std::max(0, area[1]); + int crop_x2 = std::min(img.cols - 1, area[2]); + int crop_y2 = std::min(img.rows - 1, area[3]); + + int center_x = (crop_x1 + crop_x2) / 2.; + int center_y = (crop_y1 + crop_y2) / 2.; + int half_h = (crop_y2 - crop_y1) / 2.; + int half_w = (crop_x2 - crop_x1) / 2.; + + if (half_h * 3 > half_w * 4) { + half_w = static_cast(half_h * 0.75); + } else { + half_h = static_cast(half_w * 4 / 3); + } + + crop_x1 = + std::max(0, center_x - static_cast(half_w * (1 + expandratio))); + crop_y1 = + std::max(0, center_y - static_cast(half_h * (1 + expandratio))); + crop_x2 = std::min(img.cols - 1, + static_cast(center_x + half_w * (1 + expandratio))); + crop_y2 = std::min(img.rows - 1, + static_cast(center_y + half_h * (1 + expandratio))); + crop_img = + img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); + + center.clear(); + center.emplace_back((crop_x1 + crop_x2) / 2); + center.emplace_back((crop_y1 + crop_y2) / 2); + scale.clear(); + scale.emplace_back((crop_x2 - crop_x1)); + scale.emplace_back((crop_y2 - crop_y1)); +} + +} // namespace PaddleDetection diff --git a/deploy/lite/src/utils.cc b/deploy/lite/src/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b4731cd9e25b3536417ade20d3f9ce5089755fd --- /dev/null +++ b/deploy/lite/src/utils.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
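+// Greedy hard-NMS over ObjectResult boxes: sort by confidence (descending), then drop
+// any box whose overlap with an already-kept box reaches nms_threshold. Areas and the
+// intersection use the pixel-inclusive (+1) convention, with
+// IoU = inter / (area_i + area_j - inter).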
+ +#include "include/utils.h" + +namespace PaddleDetection { + +void nms(std::vector &input_boxes, float nms_threshold) { + std::sort(input_boxes.begin(), + input_boxes.end(), + [](ObjectResult a, ObjectResult b) { return a.confidence > b.confidence; }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).rect[2] - input_boxes.at(i).rect[0] + 1) + * (input_boxes.at(i).rect[3] - input_boxes.at(i).rect[1] + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]); + float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]); + float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]); + float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= nms_threshold) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } + else { + j++; + } + } + } +} + +} // namespace PaddleDetection diff --git a/deploy/python/README.md b/deploy/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..052fc1cf1c2de746a7deef0037d422fe86f28fce --- /dev/null +++ b/deploy/python/README.md @@ -0,0 +1,61 @@ +# Python端预测部署 + +在PaddlePaddle中预测引擎和训练引擎底层有着不同的优化方法, 预测引擎使用了AnalysisPredictor,专门针对推理进行了优化,是基于[C++预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/native_infer.html)的Python接口,该引擎可以对模型进行多项图优化,减少不必要的内存拷贝。如果用户在部署已训练模型的过程中对性能有较高的要求,我们提供了独立于PaddleDetection的预测脚本,方便用户直接集成部署。 + + +Python端预测部署主要包含两个步骤: +- 导出预测模型 +- 基于Python进行预测 + +## 1. 导出预测模型 + +PaddleDetection在训练过程包括网络的前向和优化器相关参数,而在部署过程中,我们只需要前向参数,具体参考:[导出模型](../EXPORT_MODEL.md),例如 + +```bash +# 导出YOLOv3检测模型 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams +``` + +导出后目录下,包括`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`四个文件。 + + +## 2. 
基于Python的预测 + +### 2.1 通用检测 +在终端输入以下命令进行预测: +```bash +python deploy/python/infer.py --model_dir=./output_inference/yolov3_darknet53_270e_coco --image_file=./demo/000000014439.jpg --device=GPU +``` + +# 参数说明 + +参数说明如下: + +| 参数 | 是否必须| 含义 | +|-------|-------|---------------------------------------------------------------------------------------------| +| --model_dir | Yes| 上述导出的模型路径 | +| --image_file | Option | 需要预测的图片 | +| --image_dir | Option | 要预测的图片文件夹路径 | +| --video_file | Option | 需要预测的视频 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测,可设置为:0 - (摄像头数目-1) ),预测过程中在可视化界面按`q`退出输出预测结果到:output/output.mp4 | +| --device | Option | 运行时的设备,可选择`CPU/GPU/XPU`,默认为`CPU` | +| --run_mode | Option | 使用GPU时,默认为paddle, 可选(paddle/trt_fp32/trt_fp16/trt_int8) | +| --batch_size | Option | 预测时的batch size,在指定`image_dir`时有效,默认为1 | +| --threshold | Option| 预测得分的阈值,默认为0.5 | +| --output_dir | Option| 可视化结果保存的根目录,默认为output/ | +| --run_benchmark | Option| 是否运行benchmark,同时需指定`--image_file`或`--image_dir`,默认为False | +| --enable_mkldnn | Option | CPU预测中是否开启MKLDNN加速,默认为False | +| --cpu_threads | Option| 设置cpu线程数,默认为1 | +| --trt_calib_mode | Option| TensorRT是否使用校准功能,默认为False。使用TensorRT的int8功能时,需设置为True,使用PaddleSlim量化后的模型时需要设置为False | +| --save_images | Option| 是否保存可视化结果 | +| --save_results | Option| 是否在文件夹下将图片的预测结果以JSON的形式保存 | + + +说明: + +- 参数优先级顺序:`camera_id` > `video_file` > `image_dir` > `image_file`。 +- run_mode:paddle代表使用AnalysisPredictor,精度float32来推理,其他参数指用AnalysisPredictor,TensorRT不同精度来推理。 +- 如果安装的PaddlePaddle不支持基于TensorRT进行预测,需要自行编译,详细可参考[预测库编译教程](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html)。 +- --run_benchmark如果设置为True,则需要安装依赖`pip install pynvml psutil GPUtil`。 +- 如果需要使用导出模型在coco数据集上进行评估,请在推理时添加`--save_results`和`--use_coco_category`参数用以保存coco评估所需要的json文件 diff --git a/deploy/python/benchmark_utils.py b/deploy/python/benchmark_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..adf36217955ed71103ad46a7e7ae5cb488e93d96 --- /dev/null +++ b/deploy/python/benchmark_utils.py @@ -0,0 +1,289 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging + +import paddle +import paddle.inference as paddle_infer + +from pathlib import Path + +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +LOG_PATH_ROOT = f"{CUR_DIR}/../../output" + + +class PaddleInferBenchmark(object): + def __init__(self, + config, + model_info: dict={}, + data_info: dict={}, + perf_info: dict={}, + resource_info: dict={}, + **kwargs): + """ + Construct PaddleInferBenchmark Class to format logs. 
+ args: + config(paddle.inference.Config): paddle inference config + model_info(dict): basic model info + {'model_name': 'resnet50' + 'precision': 'fp32'} + data_info(dict): input data info + {'batch_size': 1 + 'shape': '3,224,224' + 'data_num': 1000} + perf_info(dict): performance result + {'preprocess_time_s': 1.0 + 'inference_time_s': 2.0 + 'postprocess_time_s': 1.0 + 'total_time_s': 4.0} + resource_info(dict): + cpu and gpu resources + {'cpu_rss': 100 + 'gpu_rss': 100 + 'gpu_util': 60} + """ + # PaddleInferBenchmark Log Version + self.log_version = "1.0.3" + + # Paddle Version + self.paddle_version = paddle.__version__ + self.paddle_commit = paddle.__git_commit__ + paddle_infer_info = paddle_infer.get_version() + self.paddle_branch = paddle_infer_info.strip().split(': ')[-1] + + # model info + self.model_info = model_info + + # data info + self.data_info = data_info + + # perf info + self.perf_info = perf_info + + try: + # required value + self.model_name = model_info['model_name'] + self.precision = model_info['precision'] + + self.batch_size = data_info['batch_size'] + self.shape = data_info['shape'] + self.data_num = data_info['data_num'] + + self.inference_time_s = round(perf_info['inference_time_s'], 4) + except: + self.print_help() + raise ValueError( + "Set argument wrong, please check input argument and its type") + + self.preprocess_time_s = perf_info.get('preprocess_time_s', 0) + self.postprocess_time_s = perf_info.get('postprocess_time_s', 0) + self.with_tracker = True if 'tracking_time_s' in perf_info else False + self.tracking_time_s = perf_info.get('tracking_time_s', 0) + self.total_time_s = perf_info.get('total_time_s', 0) + + self.inference_time_s_90 = perf_info.get("inference_time_s_90", "") + self.inference_time_s_99 = perf_info.get("inference_time_s_99", "") + self.succ_rate = perf_info.get("succ_rate", "") + self.qps = perf_info.get("qps", "") + + # conf info + self.config_status = self.parse_config(config) + + # mem info + if isinstance(resource_info, dict): + self.cpu_rss_mb = int(resource_info.get('cpu_rss_mb', 0)) + self.cpu_vms_mb = int(resource_info.get('cpu_vms_mb', 0)) + self.cpu_shared_mb = int(resource_info.get('cpu_shared_mb', 0)) + self.cpu_dirty_mb = int(resource_info.get('cpu_dirty_mb', 0)) + self.cpu_util = round(resource_info.get('cpu_util', 0), 2) + + self.gpu_rss_mb = int(resource_info.get('gpu_rss_mb', 0)) + self.gpu_util = round(resource_info.get('gpu_util', 0), 2) + self.gpu_mem_util = round(resource_info.get('gpu_mem_util', 0), 2) + else: + self.cpu_rss_mb = 0 + self.cpu_vms_mb = 0 + self.cpu_shared_mb = 0 + self.cpu_dirty_mb = 0 + self.cpu_util = 0 + + self.gpu_rss_mb = 0 + self.gpu_util = 0 + self.gpu_mem_util = 0 + + # init benchmark logger + self.benchmark_logger() + + def benchmark_logger(self): + """ + benchmark logger + """ + # remove other logging handler + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + # Init logger + FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + log_output = f"{LOG_PATH_ROOT}/{self.model_name}.log" + Path(f"{LOG_PATH_ROOT}").mkdir(parents=True, exist_ok=True) + logging.basicConfig( + level=logging.INFO, + format=FORMAT, + handlers=[ + logging.FileHandler( + filename=log_output, mode='w'), + logging.StreamHandler(), + ]) + self.logger = logging.getLogger(__name__) + self.logger.info( + f"Paddle Inference benchmark log will be saved to {log_output}") + + def parse_config(self, config) -> dict: + """ + parse paddle predictor config + args: + 
config(paddle.inference.Config): paddle inference config + return: + config_status(dict): dict style config info + """ + if isinstance(config, paddle_infer.Config): + config_status = {} + config_status['runtime_device'] = "gpu" if config.use_gpu( + ) else "cpu" + config_status['ir_optim'] = config.ir_optim() + config_status['enable_tensorrt'] = config.tensorrt_engine_enabled() + config_status['precision'] = self.precision + config_status['enable_mkldnn'] = config.mkldnn_enabled() + config_status[ + 'cpu_math_library_num_threads'] = config.cpu_math_library_num_threads( + ) + elif isinstance(config, dict): + config_status['runtime_device'] = config.get('runtime_device', "") + config_status['ir_optim'] = config.get('ir_optim', "") + config_status['enable_tensorrt'] = config.get('enable_tensorrt', "") + config_status['precision'] = config.get('precision', "") + config_status['enable_mkldnn'] = config.get('enable_mkldnn', "") + config_status['cpu_math_library_num_threads'] = config.get( + 'cpu_math_library_num_threads', "") + else: + self.print_help() + raise ValueError( + "Set argument config wrong, please check input argument and its type" + ) + return config_status + + def report(self, identifier=None): + """ + print log report + args: + identifier(string): identify log + """ + if identifier: + identifier = f"[{identifier}]" + else: + identifier = "" + + self.logger.info("\n") + self.logger.info( + "---------------------- Paddle info ----------------------") + self.logger.info(f"{identifier} paddle_version: {self.paddle_version}") + self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}") + self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}") + self.logger.info(f"{identifier} log_api_version: {self.log_version}") + self.logger.info( + "----------------------- Conf info -----------------------") + self.logger.info( + f"{identifier} runtime_device: {self.config_status['runtime_device']}" + ) + self.logger.info( + f"{identifier} ir_optim: {self.config_status['ir_optim']}") + self.logger.info(f"{identifier} enable_memory_optim: {True}") + self.logger.info( + f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}" + ) + self.logger.info( + f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") + self.logger.info( + f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}" + ) + self.logger.info( + "----------------------- Model info ----------------------") + self.logger.info(f"{identifier} model_name: {self.model_name}") + self.logger.info(f"{identifier} precision: {self.precision}") + self.logger.info( + "----------------------- Data info -----------------------") + self.logger.info(f"{identifier} batch_size: {self.batch_size}") + self.logger.info(f"{identifier} input_shape: {self.shape}") + self.logger.info(f"{identifier} data_num: {self.data_num}") + self.logger.info( + "----------------------- Perf info -----------------------") + self.logger.info( + f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%" + ) + self.logger.info( + f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%" + ) + self.logger.info( + f"{identifier} total time spent(s): {self.total_time_s}") + + if self.with_tracker: + self.logger.info( + f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " + f"inference_time(ms): 
{round(self.inference_time_s*1000, 1)}, " + f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, " + f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}") + else: + self.logger.info( + f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " + f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " + f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}" + ) + if self.inference_time_s_90: + self.looger.info( + f"{identifier} 90%_cost: {self.inference_time_s_90}, 99%_cost: {self.inference_time_s_99}, succ_rate: {self.succ_rate}" + ) + if self.qps: + self.logger.info(f"{identifier} QPS: {self.qps}") + + def print_help(self): + """ + print function help + """ + print("""Usage: + ==== Print inference benchmark logs. ==== + config = paddle.inference.Config() + model_info = {'model_name': 'resnet50' + 'precision': 'fp32'} + data_info = {'batch_size': 1 + 'shape': '3,224,224' + 'data_num': 1000} + perf_info = {'preprocess_time_s': 1.0 + 'inference_time_s': 2.0 + 'postprocess_time_s': 1.0 + 'total_time_s': 4.0} + resource_info = {'cpu_rss_mb': 100 + 'gpu_rss_mb': 100 + 'gpu_util': 60} + log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info) + log('Test') + """) + + def __call__(self, identifier=None): + """ + __call__ + args: + identifier(string): identify log + """ + self.report(identifier) diff --git a/deploy/python/infer.py b/deploy/python/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..86fc30a73d41be50b74b96a8ce139c521cb5d139 --- /dev/null +++ b/deploy/python/infer.py @@ -0,0 +1,886 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
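+# infer.py is the Python deployment entry point: it reads an exported model directory
+# (infer_cfg.yml plus model.pdmodel/model.pdiparams), builds a paddle.inference
+# predictor via load_predictor(), and runs image/video prediction through Detector.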
+ +import os +import yaml +import glob +import json +from pathlib import Path +from functools import reduce + +import cv2 +import numpy as np +import math +import paddle +from paddle.inference import Config +from paddle.inference import create_predictor + +import sys +# add deploy path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..']))) +sys.path.insert(0, parent_path) + +from benchmark_utils import PaddleInferBenchmark +from preprocess import preprocess, Resize, NormalizeImage, Permute, Pad, decode_image +from visualize import visualize_box_mask +from utils import argsparser, Timer, get_current_memory_mb, multiclass_nms, coco_clsid2catid + +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOF', 'YOLOv5', 'RTMDet', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'DETR' +} + + +def bench_log(detector, img_list, model_info, batch_size=1, name=None): + mems = { + 'cpu_rss_mb': detector.cpu_mem / len(img_list), + 'gpu_rss_mb': detector.gpu_mem / len(img_list), + 'gpu_util': detector.gpu_util * 100 / len(img_list) + } + perf_info = detector.det_times.report(average=True) + data_info = { + 'batch_size': batch_size, + 'shape': "dynamic_shape", + 'data_num': perf_info['img_num'] + } + log = PaddleInferBenchmark(detector.config, model_info, data_info, + perf_info, mems) + log(name) + + +class Detector(object): + """ + Args: + pred_config (object): config of model, defined by `Config(model_dir)` + model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml + device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU + run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) + batch_size (int): size of pre batch in inference + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + trt_calib_mode (bool): If the model is produced by TRT offline quantitative + calibration, trt_calib_mode need to set True + cpu_threads (int): cpu threads + enable_mkldnn (bool): whether to open MKLDNN + enable_mkldnn_bfloat16 (bool): whether to turn on mkldnn bfloat16 + output_dir (str): The path of output + threshold (float): The threshold of score for visualization + delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT. + Used by action model. 
+ """ + + def __init__(self, + model_dir, + device='CPU', + run_mode='paddle', + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir='output', + threshold=0.5, + delete_shuffle_pass=False): + self.pred_config = self.set_config(model_dir) + self.predictor, self.config = load_predictor( + model_dir, + self.pred_config.arch, + run_mode=run_mode, + batch_size=batch_size, + min_subgraph_size=self.pred_config.min_subgraph_size, + device=device, + use_dynamic_shape=self.pred_config.use_dynamic_shape, + trt_min_shape=trt_min_shape, + trt_max_shape=trt_max_shape, + trt_opt_shape=trt_opt_shape, + trt_calib_mode=trt_calib_mode, + cpu_threads=cpu_threads, + enable_mkldnn=enable_mkldnn, + enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, + delete_shuffle_pass=delete_shuffle_pass) + self.det_times = Timer() + self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 + self.batch_size = batch_size + self.output_dir = output_dir + self.threshold = threshold + + def set_config(self, model_dir): + return PredictConfig(model_dir) + + def preprocess(self, image_list): + preprocess_ops = [] + for op_info in self.pred_config.preprocess_infos: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + preprocess_ops.append(eval(op_type)(**new_op_info)) + + input_im_lst = [] + input_im_info_lst = [] + for im_path in image_list: + im, im_info = preprocess(im_path, preprocess_ops) + input_im_lst.append(im) + input_im_info_lst.append(im_info) + inputs = create_inputs(input_im_lst, input_im_info_lst) + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle(input_names[i]) + if input_names[i] == 'x': + input_tensor.copy_from_cpu(inputs['image']) + else: + input_tensor.copy_from_cpu(inputs[input_names[i]]) + + return inputs + + def postprocess(self, inputs, result): + # postprocess output of predictor + np_boxes_num = result['boxes_num'] + assert isinstance(np_boxes_num, np.ndarray), \ + '`np_boxes_num` should be a `numpy.ndarray`' + + result = {k: v for k, v in result.items() if v is not None} + return result + + def filter_box(self, result, threshold): + np_boxes_num = result['boxes_num'] + boxes = result['boxes'] + start_idx = 0 + filter_boxes = [] + filter_num = [] + for i in range(len(np_boxes_num)): + boxes_num = np_boxes_num[i] + boxes_i = boxes[start_idx:start_idx + boxes_num, :] + idx = boxes_i[:, 1] > threshold + filter_boxes_i = boxes_i[idx, :] + filter_boxes.append(filter_boxes_i) + filter_num.append(filter_boxes_i.shape[0]) + start_idx += boxes_num + boxes = np.concatenate(filter_boxes) + filter_num = np.array(filter_num) + filter_res = {'boxes': boxes, 'boxes_num': filter_num} + return filter_res + + def predict(self, repeats=1, run_benchmark=False): + ''' + Args: + repeats (int): repeats number for prediction + Returns: + result (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + MaskRCNN's result include 'masks': np.ndarray: + shape: [N, im_h, im_w] + ''' + # model prediction + np_boxes_num, np_boxes, np_masks = np.array([0]), None, None + + if run_benchmark: + for i in range(repeats): + self.predictor.run() + paddle.device.cuda.synchronize() + result = dict( + boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) + return result + + for i in range(repeats): + self.predictor.run() + output_names = 
self.predictor.get_output_names() + boxes_tensor = self.predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if len(output_names) == 1: + # some exported model can not get tensor 'bbox_num' + np_boxes_num = np.array([len(np_boxes)]) + else: + boxes_num = self.predictor.get_output_handle(output_names[1]) + np_boxes_num = boxes_num.copy_to_cpu() + if self.pred_config.mask: + masks_tensor = self.predictor.get_output_handle(output_names[2]) + np_masks = masks_tensor.copy_to_cpu() + result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) + return result + + def merge_batch_result(self, batch_result): + if len(batch_result) == 1: + return batch_result[0] + res_key = batch_result[0].keys() + results = {k: [] for k in res_key} + for res in batch_result: + for k, v in res.items(): + results[k].append(v) + for k, v in results.items(): + if k not in ['masks', 'segm']: + results[k] = np.concatenate(v) + return results + + def get_timer(self): + return self.det_times + + def predict_image_slice(self, + img_list, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='ios', + run_benchmark=False, + repeats=1, + visual=True, + save_results=False): + # slice infer only support bs=1 + results = [] + try: + import sahi + from sahi.slicing import slice_image + except Exception as e: + print( + 'sahi not found, plaese install sahi. ' + 'for example: `pip install sahi`, see https://github.com/obss/sahi.' + ) + raise e + num_classes = len(self.pred_config.labels) + for i in range(len(img_list)): + ori_image = img_list[i] + slice_image_result = sahi.slicing.slice_image( + image=ori_image, + slice_height=slice_size[0], + slice_width=slice_size[1], + overlap_height_ratio=overlap_ratio[0], + overlap_width_ratio=overlap_ratio[1]) + sub_img_num = len(slice_image_result) + merged_bboxs = [] + print('slice to {} sub_samples.', sub_img_num) + + batch_image_list = [ + slice_image_result.images[_ind] for _ind in range(sub_img_num) + ] + if run_benchmark: + # preprocess + inputs = self.preprocess(batch_image_list) # warmup + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + result = self.predict(repeats=50, run_benchmark=True) # warmup + self.det_times.inference_time_s.start() + result = self.predict(repeats=repeats, run_benchmark=True) + self.det_times.inference_time_s.end(repeats=repeats) + + # postprocess + result_warmup = self.postprocess(inputs, result) # warmup + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += 1 + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + self.gpu_util += gu + else: + # preprocess + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + # postprocess + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += 1 + + st, ed = 0, result['boxes_num'][0] # start_index, end_index + for _ind in range(sub_img_num): + boxes_num = result['boxes_num'][_ind] + ed = st + boxes_num + shift_amount = slice_image_result.starting_pixels[_ind] + 
result['boxes'][st:ed][:, 2:4] = result['boxes'][ + st:ed][:, 2:4] + shift_amount + result['boxes'][st:ed][:, 4:6] = result['boxes'][ + st:ed][:, 4:6] + shift_amount + merged_bboxs.append(result['boxes'][st:ed]) + st = ed + + merged_results = {'boxes': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), num_classes, match_threshold, + match_metric) + merged_results['boxes'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['boxes'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." + ) + merged_results['boxes_num'] = np.array( + [len(merged_results['boxes'])], dtype=np.int32) + + if visual: + visualize( + [ori_image], # should be list + merged_results, + self.pred_config.labels, + output_dir=self.output_dir, + threshold=self.threshold) + + results.append(merged_results) + print('Test iter {}'.format(i)) + + results = self.merge_batch_result(results) + if save_results: + Path(self.output_dir).mkdir(exist_ok=True) + self.save_coco_results( + img_list, results, use_coco_category=FLAGS.use_coco_category) + return results + + def predict_image(self, + image_list, + run_benchmark=False, + repeats=1, + visual=True, + save_results=False): + batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) + results = [] + for i in range(batch_loop_cnt): + start_index = i * self.batch_size + end_index = min((i + 1) * self.batch_size, len(image_list)) + batch_image_list = image_list[start_index:end_index] + if run_benchmark: + # preprocess + inputs = self.preprocess(batch_image_list) # warmup + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + result = self.predict(repeats=50, run_benchmark=True) # warmup + self.det_times.inference_time_s.start() + result = self.predict(repeats=repeats, run_benchmark=True) + self.det_times.inference_time_s.end(repeats=repeats) + + # postprocess + result_warmup = self.postprocess(inputs, result) # warmup + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + self.gpu_util += gu + else: + # preprocess + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + # postprocess + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + if visual: + visualize( + batch_image_list, + result, + self.pred_config.labels, + output_dir=self.output_dir, + threshold=self.threshold) + results.append(result) + print('Test iter {}'.format(i)) + results = self.merge_batch_result(results) + if save_results: + Path(self.output_dir).mkdir(exist_ok=True) + self.save_coco_results( + image_list, results, use_coco_category=FLAGS.use_coco_category) + return results + + def predict_video(self, video_file, camera_id): + video_out_name = 'output.mp4' + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) + else: + capture = cv2.VideoCapture(video_file) + video_out_name = os.path.split(video_file)[-1] + # Get 
Video info : resolution, fps, frame count + width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(capture.get(cv2.CAP_PROP_FPS)) + frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + print("fps: %d, frame_count: %d" % (fps, frame_count)) + + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + out_path = os.path.join(self.output_dir, video_out_name) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + index = 1 + while (1): + ret, frame = capture.read() + if not ret: + break + print('detect frame: %d' % (index)) + index += 1 + results = self.predict_image([frame[:, :, ::-1]], visual=False) + + im = visualize_box_mask( + frame, + results, + self.pred_config.labels, + threshold=self.threshold) + im = np.array(im) + writer.write(im) + if camera_id != -1: + cv2.imshow('Mask Detection', im) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + writer.release() + + def save_coco_results(self, image_list, results, use_coco_category=False): + bbox_results = [] + mask_results = [] + idx = 0 + print("Start saving coco json files...") + for i, box_num in enumerate(results['boxes_num']): + file_name = os.path.split(image_list[i])[-1] + if use_coco_category: + img_id = int(os.path.splitext(file_name)[0]) + else: + img_id = i + + if 'boxes' in results: + boxes = results['boxes'][idx:idx + box_num].tolist() + bbox_results.extend([{ + 'image_id': img_id, + 'category_id': coco_clsid2catid[int(box[0])] \ + if use_coco_category else int(box[0]), + 'file_name': file_name, + 'bbox': [box[2], box[3], box[4] - box[2], + box[5] - box[3]], # xyxy -> xywh + 'score': box[1]} for box in boxes]) + + if 'masks' in results: + import pycocotools.mask as mask_util + + boxes = results['boxes'][idx:idx + box_num].tolist() + masks = results['masks'][i][:box_num].astype(np.uint8) + seg_res = [] + for box, mask in zip(boxes, masks): + rle = mask_util.encode( + np.array( + mask[:, :, None], dtype=np.uint8, order="F"))[0] + if 'counts' in rle: + rle['counts'] = rle['counts'].decode("utf8") + seg_res.append({ + 'image_id': img_id, + 'category_id': coco_clsid2catid[int(box[0])] \ + if use_coco_category else int(box[0]), + 'file_name': file_name, + 'segmentation': rle, + 'score': box[1]}) + mask_results.extend(seg_res) + + idx += box_num + + if bbox_results: + bbox_file = os.path.join(self.output_dir, "bbox.json") + with open(bbox_file, 'w') as f: + json.dump(bbox_results, f) + print(f"The bbox result is saved to {bbox_file}") + if mask_results: + mask_file = os.path.join(self.output_dir, "mask.json") + with open(mask_file, 'w') as f: + json.dump(mask_results, f) + print(f"The mask result is saved to {mask_file}") + + +def create_inputs(imgs, im_info): + """generate input for different model type + Args: + imgs (list(numpy)): list of images (np.ndarray) + im_info (list(dict)): list of image info + Returns: + inputs (dict): input of model + """ + inputs = {} + + im_shape = [] + scale_factor = [] + if len(imgs) == 1: + inputs['image'] = np.array((imgs[0], )).astype('float32') + inputs['im_shape'] = np.array( + (im_info[0]['im_shape'], )).astype('float32') + inputs['scale_factor'] = np.array( + (im_info[0]['scale_factor'], )).astype('float32') + return inputs + + for e in im_info: + im_shape.append(np.array((e['im_shape'], )).astype('float32')) + scale_factor.append(np.array((e['scale_factor'], )).astype('float32')) + + inputs['im_shape'] = np.concatenate(im_shape, axis=0) + 
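+    # More than one image in the batch: each image is zero-padded below to the largest
+    # (H, W) in the batch before being stacked into inputs['image'].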
inputs['scale_factor'] = np.concatenate(scale_factor, axis=0) + + imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs] + max_shape_h = max([e[0] for e in imgs_shape]) + max_shape_w = max([e[1] for e in imgs_shape]) + padding_imgs = [] + for img in imgs: + im_c, im_h, im_w = img.shape[:] + padding_im = np.zeros( + (im_c, max_shape_h, max_shape_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = img + padding_imgs.append(padding_im) + inputs['image'] = np.stack(padding_imgs, axis=0) + return inputs + + +class PredictConfig(): + """set config of preprocess, postprocess and visualize + Args: + model_dir (str): root path of model.yml + """ + + def __init__(self, model_dir): + # parsing Yaml config for Preprocess + deploy_file = os.path.join(model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.labels = yml_conf['label_list'] + self.mask = False + self.use_dynamic_shape = yml_conf['use_dynamic_shape'] + if 'mask' in yml_conf: + self.mask = yml_conf['mask'] + self.tracker = None + if 'tracker' in yml_conf: + self.tracker = yml_conf['tracker'] + if 'NMS' in yml_conf: + self.nms = yml_conf['NMS'] + if 'fpn_stride' in yml_conf: + self.fpn_stride = yml_conf['fpn_stride'] + if self.arch == 'RCNN' and yml_conf.get('export_onnx', False): + print( + 'The RCNN export model is used for ONNX and it only supports batch_size = 1' + ) + self.print_config() + + def check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +def load_predictor(model_dir, + arch, + run_mode='paddle', + batch_size=1, + device='CPU', + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + delete_shuffle_pass=False): + """set AnalysisConfig, generate AnalysisPredictor + Args: + model_dir (str): root path of __model__ and __params__ + device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU + run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + trt_calib_mode (bool): If the model is produced by TRT offline quantitative + calibration, trt_calib_mode need to set True + delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT. + Used by action model. + Returns: + predictor (PaddlePredictor): AnalysisPredictor + Raises: + ValueError: predict by TensorRT need device == 'GPU'. 
+ """ + if device != 'GPU' and run_mode != 'paddle': + raise ValueError( + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}" + .format(run_mode, device)) + infer_model = os.path.join(model_dir, 'model.pdmodel') + infer_params = os.path.join(model_dir, 'model.pdiparams') + if not os.path.exists(infer_model): + infer_model = os.path.join(model_dir, 'inference.pdmodel') + infer_params = os.path.join(model_dir, 'inference.pdiparams') + if not os.path.exists(infer_model): + raise ValueError( + "Cannot find any inference model in dir: {},".format(model_dir)) + config = Config(infer_model, infer_params) + if device == 'GPU': + # initial GPU memory(M), device ID + config.enable_use_gpu(200, 0) + # optimize graph and fuse op + config.switch_ir_optim(True) + elif device == 'XPU': + if config.lite_engine_enabled(): + config.enable_lite_engine() + config.enable_xpu(10 * 1024 * 1024) + elif device == 'NPU': + if config.lite_engine_enabled(): + config.enable_lite_engine() + config.enable_custom_device('npu') + else: + config.disable_gpu() + config.set_cpu_math_library_num_threads(cpu_threads) + if enable_mkldnn: + try: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if enable_mkldnn_bfloat16: + config.enable_mkldnn_bfloat16() + except Exception as e: + print( + "The current environment does not support `mkldnn`, so disable mkldnn." + ) + pass + + precision_map = { + 'trt_int8': Config.Precision.Int8, + 'trt_fp32': Config.Precision.Float32, + 'trt_fp16': Config.Precision.Half + } + if run_mode in precision_map.keys(): + config.enable_tensorrt_engine( + workspace_size=(1 << 25) * batch_size, + max_batch_size=batch_size, + min_subgraph_size=min_subgraph_size, + precision_mode=precision_map[run_mode], + use_static=False, + use_calib_mode=trt_calib_mode) + if FLAGS.collect_trt_shape_info: + config.collect_shape_range_info(FLAGS.tuned_trt_shape_file) + elif os.path.exists(FLAGS.tuned_trt_shape_file): + print(f'Use dynamic shape file: ' + f'{FLAGS.tuned_trt_shape_file} for TRT...') + config.enable_tuned_tensorrt_dynamic_shape( + FLAGS.tuned_trt_shape_file, True) + + if use_dynamic_shape: + min_input_shape = { + 'image': [batch_size, 3, trt_min_shape, trt_min_shape], + 'scale_factor': [batch_size, 2] + } + max_input_shape = { + 'image': [batch_size, 3, trt_max_shape, trt_max_shape], + 'scale_factor': [batch_size, 2] + } + opt_input_shape = { + 'image': [batch_size, 3, trt_opt_shape, trt_opt_shape], + 'scale_factor': [batch_size, 2] + } + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, + opt_input_shape) + print('trt set dynamic shape done!') + + # disable print log when predict + config.disable_glog_info() + # enable shared memory + config.enable_memory_optim() + # disable feed, fetch OP, needed by zero_copy_run + config.switch_use_feed_fetch_ops(False) + if delete_shuffle_pass: + config.delete_pass("shuffle_channel_detect_pass") + predictor = create_predictor(config) + return predictor, config + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + 
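+    # Otherwise scan infer_dir for images with the supported extensions
+    # (lower- and upper-case variants).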
+ images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +def visualize(image_list, result, labels, output_dir='output/', threshold=0.5): + # visualize the predict result + start_idx = 0 + for idx, image_file in enumerate(image_list): + im_bboxes_num = result['boxes_num'][idx] + im_results = {} + if 'boxes' in result: + im_results['boxes'] = result['boxes'][start_idx:start_idx + + im_bboxes_num, :] + if 'masks' in result: + im_results['masks'] = result['masks'][start_idx:start_idx + + im_bboxes_num, :] + if 'segm' in result: + im_results['segm'] = result['segm'][start_idx:start_idx + + im_bboxes_num, :] + if 'label' in result: + im_results['label'] = result['label'][start_idx:start_idx + + im_bboxes_num] + if 'score' in result: + im_results['score'] = result['score'][start_idx:start_idx + + im_bboxes_num] + + start_idx += im_bboxes_num + im = visualize_box_mask( + image_file, im_results, labels, threshold=threshold) + img_name = os.path.split(image_file)[-1] + if not os.path.exists(output_dir): + os.makedirs(output_dir) + out_path = os.path.join(output_dir, img_name) + im.save(out_path, quality=95) + print("save result to: " + out_path) + + +def print_arguments(args): + print('----------- Running Arguments -----------') + for arg, value in sorted(vars(args).items()): + print('%s: %s' % (arg, value)) + print('------------------------------------------') + + +def main(): + deploy_file = os.path.join(FLAGS.model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + arch = yml_conf['arch'] + detector_func = 'Detector' + + detector = eval(detector_func)( + FLAGS.model_dir, + device=FLAGS.device, + run_mode=FLAGS.run_mode, + batch_size=FLAGS.batch_size, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape, + trt_calib_mode=FLAGS.trt_calib_mode, + cpu_threads=FLAGS.cpu_threads, + enable_mkldnn=FLAGS.enable_mkldnn, + enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16, + threshold=FLAGS.threshold, + output_dir=FLAGS.output_dir) + + # predict from video file or camera video stream + if FLAGS.video_file is not None or FLAGS.camera_id != -1: + detector.predict_video(FLAGS.video_file, FLAGS.camera_id) + else: + # predict from image + if FLAGS.image_dir is None and FLAGS.image_file is not None: + assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None" + img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) + if FLAGS.slice_infer: + detector.predict_image_slice( + img_list, + FLAGS.slice_size, + FLAGS.overlap_ratio, + FLAGS.combine_method, + FLAGS.match_threshold, + FLAGS.match_metric, + visual=FLAGS.save_images, + save_results=FLAGS.save_results) + else: + detector.predict_image( + img_list, + FLAGS.run_benchmark, + repeats=100, + visual=FLAGS.save_images, + save_results=FLAGS.save_results) + if not FLAGS.run_benchmark: + detector.det_times.info(average=True) + else: + mode = FLAGS.run_mode + model_dir = FLAGS.model_dir + model_info = { + 'model_name': model_dir.strip('/').split('/')[-1], + 'precision': mode.split('_')[-1] + } + bench_log(detector, 
img_list, model_info, name='DET') + + +if __name__ == '__main__': + paddle.enable_static() + parser = argsparser() + FLAGS = parser.parse_args() + print_arguments(FLAGS) + FLAGS.device = FLAGS.device.upper() + assert FLAGS.device in ['CPU', 'GPU', 'XPU', 'NPU' + ], "device should be CPU, GPU, XPU or NPU" + assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device" + + assert not ( + FLAGS.enable_mkldnn == False and FLAGS.enable_mkldnn_bfloat16 == True + ), 'To enable mkldnn bfloat, please turn on both enable_mkldnn and enable_mkldnn_bfloat16' + + main() diff --git a/deploy/python/preprocess.py b/deploy/python/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..d31afdede719e1e64a69c2b31fabb516ca830cee --- /dev/null +++ b/deploy/python/preprocess.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +from PIL import Image + + +def decode_image(im_file, im_info): + """read rgb image + Args: + im_file (str|np.ndarray): input can be image path or np.ndarray + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + if isinstance(im_file, str): + with open(im_file, 'rb') as f: + im_read = f.read() + data = np.frombuffer(im_read, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + else: + im = im_file + im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return im, im_info + + +class Resize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = 
im.shape[2]
+        if self.keep_ratio:
+            im_size_min = np.min(origin_shape)
+            im_size_max = np.max(origin_shape)
+            target_size_min = np.min(self.target_size)
+            target_size_max = np.max(self.target_size)
+            im_scale = float(target_size_min) / float(im_size_min)
+            if np.round(im_scale * im_size_max) > target_size_max:
+                im_scale = float(target_size_max) / float(im_size_max)
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = self.target_size
+            im_scale_y = resize_h / float(origin_shape[0])
+            im_scale_x = resize_w / float(origin_shape[1])
+        return im_scale_y, im_scale_x
+
+
+class NormalizeImage(object):
+    """normalize image
+    Args:
+        mean (list): im - mean
+        std (list): im / std
+        is_scale (bool): whether to scale the image by 1 / 255
+        norm_type (str): type in ['mean_std', 'none']
+    """
+
+    def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
+        self.mean = mean
+        self.std = std
+        self.is_scale = is_scale
+        self.norm_type = norm_type
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        im = im.astype(np.float32, copy=False)
+        if self.is_scale:
+            scale = 1.0 / 255.0
+            im *= scale
+
+        if self.norm_type == 'mean_std':
+            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
+            std = np.array(self.std)[np.newaxis, np.newaxis, :]
+            im -= mean
+            im /= std
+        return im, im_info
+
+
+class Permute(object):
+    """permute image from HWC layout to CHW layout; takes no arguments
+    """
+
+    def __init__(self, ):
+        super(Permute, self).__init__()
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        im = im.transpose((2, 0, 1)).copy()
+        return im, im_info
+
+
+class PadStride(object):
+    """ pad the image so that its height and width are multiples of `stride`,
+    replacing PadBatch(pad_to_stride) in the original training config
+    Args:
+        stride (int): models with FPN need image shape % stride == 0
+    """
+
+    def __init__(self, stride=0):
+        self.coarsest_stride = stride
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        coarsest_stride = self.coarsest_stride
+        if coarsest_stride <= 0:
+            return im, im_info
+        im_c, im_h, im_w = im.shape
+        pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
+        pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
+        padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
+        padding_im[:, :im_h, :im_w] = im
+        return padding_im, im_info
+
+
+class LetterBoxResize(object):
+    def __init__(self, target_size):
+        """
+        Letterbox resize: scale the image with unchanged aspect ratio to fit
+        target_size, then pad the remaining area with gray borders.
+        Args:
+            target_size (int|list): image target size.
+ """ + super(LetterBoxResize, self).__init__() + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + + def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): + # letterbox: resize a rectangular image to a padded rectangular + shape = img.shape[:2] # [height, width] + ratio_h = float(height) / shape[0] + ratio_w = float(width) / shape[1] + ratio = min(ratio_h, ratio_w) + new_shape = (round(shape[1] * ratio), + round(shape[0] * ratio)) # [width, height] + padw = (width - new_shape[0]) / 2 + padh = (height - new_shape[1]) / 2 + top, bottom = round(padh - 0.1), round(padh + 0.1) + left, right = round(padw - 0.1), round(padw + 0.1) + + img = cv2.resize( + img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=color) # padded rectangular + return img, ratio, padw, padh + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + height, width = self.target_size + h, w = im.shape[:2] + im, ratio, padw, padh = self.letterbox(im, height=height, width=width) + + new_shape = [round(h * ratio), round(w * ratio)] + im_info['im_shape'] = np.array(new_shape, dtype=np.float32) + im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32) + return im, im_info + + +class Pad(object): + def __init__(self, size, fill_value=[114.0, 114.0, 114.0]): + """ + Pad image to a specified size. + Args: + size (list[int]): image target size + fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0) + """ + super(Pad, self).__init__() + if isinstance(size, int): + size = [size, size] + self.size = size + self.fill_value = fill_value + + def __call__(self, im, im_info): + im_h, im_w = im.shape[:2] + h, w = self.size + if h == im_h and w == im_w: + im = im.astype(np.float32) + return im, im_info + + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = im.astype(np.float32) + im = canvas + return im, im_info + + +def preprocess(im, preprocess_ops): + # process image by preprocess_ops + im_info = { + 'scale_factor': np.array( + [1., 1.], dtype=np.float32), + 'im_shape': None, + } + im, im_info = decode_image(im, im_info) + for operator in preprocess_ops: + im, im_info = operator(im, im_info) + return im, im_info diff --git a/deploy/python/utils.py b/deploy/python/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6168dc9ea7f6f682d8e9cac54c7eb005220f8785 --- /dev/null +++ b/deploy/python/utils.py @@ -0,0 +1,441 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
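+# Shared deployment utilities: command-line argument parser, timing helpers,
+# CPU/GPU memory probing, multi-class NMS and the COCO class-id to
+# category-id mapping.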
+ +import time +import os +import ast +import argparse +import numpy as np + + +def argsparser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--model_dir", + type=str, + default=None, + help=("Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py."), + required=True) + parser.add_argument( + "--image_file", type=str, default=None, help="Path of image file.") + parser.add_argument( + "--image_dir", + type=str, + default=None, + help="Dir of image file, `image_file` has a higher priority.") + parser.add_argument( + "--batch_size", type=int, default=1, help="batch_size for inference.") + parser.add_argument( + "--video_file", + type=str, + default=None, + help="Path of video file, `video_file` or `camera_id` has a highest priority." + ) + parser.add_argument( + "--camera_id", + type=int, + default=-1, + help="device id of camera to predict.") + parser.add_argument( + "--threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory of output visualization files.") + parser.add_argument( + "--run_mode", + type=str, + default='paddle', + help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Choose the device you want to run, it can be: CPU/GPU/XPU/NPU, default is CPU." + ) + parser.add_argument( + "--use_gpu", + type=ast.literal_eval, + default=False, + help="Deprecated, please use `--device`.") + parser.add_argument( + "--run_benchmark", + type=ast.literal_eval, + default=False, + help="Whether to predict a image_file repeatedly for benchmark") + parser.add_argument( + "--enable_mkldnn", + type=ast.literal_eval, + default=False, + help="Whether use mkldnn with CPU.") + parser.add_argument( + "--enable_mkldnn_bfloat16", + type=ast.literal_eval, + default=False, + help="Whether use mkldnn bfloat16 inference with CPU.") + parser.add_argument( + "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") + parser.add_argument( + "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument( + "--trt_max_shape", + type=int, + default=1280, + help="max_shape for TensorRT.") + parser.add_argument( + "--trt_opt_shape", + type=int, + default=640, + help="opt_shape for TensorRT.") + parser.add_argument( + "--trt_calib_mode", + type=bool, + default=False, + help="If the model is produced by TRT offline quantitative " + "calibration, trt_calib_mode need to set True.") + parser.add_argument( + '--save_images', + type=ast.literal_eval, + default=True, + help='Save visualization image results.') + parser.add_argument( + "--save_results", + action='store_true', + default=False, + help="Whether save detection result to file using coco format") + parser.add_argument( + '--use_coco_category', + action='store_true', + default=False, + help='Whether to use the coco format dictionary `clsid2catid`') + parser.add_argument( + "--slice_infer", + action='store_true', + help="Whether to slice the image and merge the inference results for small object detection." 
+    )
+    parser.add_argument(
+        '--slice_size',
+        nargs='+',
+        type=int,
+        default=[640, 640],
+        help="Height and width of the sliced sub-images.")
+    parser.add_argument(
+        "--overlap_ratio",
+        nargs='+',
+        type=float,
+        default=[0.25, 0.25],
+        help="Overlap ratio (height and width) between adjacent slices.")
+    parser.add_argument(
+        "--combine_method",
+        type=str,
+        default='nms',
+        help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']."
+    )
+    parser.add_argument(
+        "--match_threshold",
+        type=float,
+        default=0.6,
+        help="Combine method matching threshold.")
+    parser.add_argument(
+        "--match_metric",
+        type=str,
+        default='ios',
+        help="Combine method matching metric, choose in ['iou', 'ios'].")
+    parser.add_argument(
+        "--collect_trt_shape_info",
+        action='store_true',
+        default=False,
+        help="Whether to collect dynamic shape info before using TensorRT.")
+    parser.add_argument(
+        "--tuned_trt_shape_file",
+        type=str,
+        default="shape_range_info.pbtxt",
+        help="Path of the dynamic shape file for TensorRT.")
+    return parser
+
+
+class Times(object):
+    def __init__(self):
+        self.time = 0.
+        # start time
+        self.st = 0.
+        # end time
+        self.et = 0.
+
+    def start(self):
+        self.st = time.time()
+
+    def end(self, repeats=1, accumulative=True):
+        self.et = time.time()
+        if accumulative:
+            self.time += (self.et - self.st) / repeats
+        else:
+            self.time = (self.et - self.st) / repeats
+
+    def reset(self):
+        self.time = 0.
+        self.st = 0.
+        self.et = 0.
+
+    def value(self):
+        return round(self.time, 4)
+
+
+class Timer(Times):
+    def __init__(self, with_tracker=False):
+        super(Timer, self).__init__()
+        self.with_tracker = with_tracker
+        self.preprocess_time_s = Times()
+        self.inference_time_s = Times()
+        self.postprocess_time_s = Times()
+        self.tracking_time_s = Times()
+        self.img_num = 0
+
+    def info(self, average=False):
+        pre_time = self.preprocess_time_s.value()
+        infer_time = self.inference_time_s.value()
+        post_time = self.postprocess_time_s.value()
+        track_time = self.tracking_time_s.value()
+
+        total_time = pre_time + infer_time + post_time
+        if self.with_tracker:
+            total_time = total_time + track_time
+        total_time = round(total_time, 4)
+        print("------------------ Inference Time Info ----------------------")
+        print("total_time(ms): {}, img_num: {}".format(total_time * 1000,
+                                                       self.img_num))
+        preprocess_time = round(pre_time / max(1, self.img_num),
+                                4) if average else pre_time
+        postprocess_time = round(post_time / max(1, self.img_num),
+                                 4) if average else post_time
+        inference_time = round(infer_time / max(1, self.img_num),
+                               4) if average else infer_time
+        tracking_time = round(track_time / max(1, self.img_num),
+                              4) if average else track_time
+
+        average_latency = total_time / max(1, self.img_num)
+        qps = 0
+        if total_time > 0:
+            qps = 1 / average_latency
+        print("average latency time(ms): {:.2f}, QPS: {:.2f}".format(
+            average_latency * 1000, qps))
+        if self.with_tracker:
+            print(
+                "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".
+                format(preprocess_time * 1000, inference_time * 1000,
+                       postprocess_time * 1000, tracking_time * 1000))
+        else:
+            print(
+                "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".
+                format(preprocess_time * 1000, inference_time * 1000,
+                       postprocess_time * 1000))
+
+    def report(self, average=False):
+        dic = {}
+        pre_time = self.preprocess_time_s.value()
+        infer_time = self.inference_time_s.value()
+        post_time = self.postprocess_time_s.value()
+        track_time = self.tracking_time_s.value()
+
+        dic['preprocess_time_s'] = round(pre_time / max(1, self.img_num),
+                                         4) if average else pre_time
+        dic['inference_time_s'] = round(infer_time / max(1, self.img_num),
+                                        4) if average else infer_time
+        dic['postprocess_time_s'] = round(post_time / max(1, self.img_num),
+                                          4) if average else post_time
+        dic['img_num'] = self.img_num
+        total_time = pre_time + infer_time + post_time
+        if self.with_tracker:
+            dic['tracking_time_s'] = round(track_time / max(1, self.img_num),
+                                           4) if average else track_time
+            total_time = total_time + track_time
+        dic['total_time_s'] = round(total_time, 4)
+        return dic
+
+
+def get_current_memory_mb():
+    """
+    Obtain the CPU memory, GPU memory and GPU utilization of the current
+    process while the program is running. Note that this function is
+    relatively time-consuming.
+    """
+    import pynvml
+    import psutil
+    import GPUtil
+    gpu_id = int(os.environ.get('CUDA_VISIBLE_DEVICES', 0))
+
+    pid = os.getpid()
+    p = psutil.Process(pid)
+    info = p.memory_full_info()
+    cpu_mem = info.uss / 1024. / 1024.
+    gpu_mem = 0
+    gpu_percent = 0
+    gpus = GPUtil.getGPUs()
+    if gpu_id is not None and len(gpus) > 0:
+        gpu_percent = gpus[gpu_id].load
+        pynvml.nvmlInit()
+        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        gpu_mem = meminfo.used / 1024. / 1024.
+    return round(cpu_mem, 4), round(gpu_mem, 4), round(gpu_percent, 4)
+
+
+def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
+    final_boxes = []
+    for c in range(num_classes):
+        idxs = bboxs[:, 0] == c
+        if np.count_nonzero(idxs) == 0: continue
+        r = nms(bboxs[idxs, 1:], match_threshold, match_metric)
+        final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1))
+    return final_boxes
+
+
+def nms(dets, match_threshold=0.6, match_metric='iou'):
+    """ Apply NMS to avoid detecting too many overlapping bounding boxes.
+        Args:
+            dets: shape [N, 5], [score, x1, y1, x2, y2]
+            match_metric: 'iou' or 'ios'
+            match_threshold: overlap threshold for the match metric.
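+        Returns:
+            dets (np.ndarray): the kept boxes after suppression, shape [M, 5].
+        Example (illustrative):
+            >>> dets = np.array([[0.9, 0., 0., 10., 10.], [0.8, 1., 1., 11., 11.]])
+            >>> kept = nms(dets, match_threshold=0.5)  # the lower-score overlapping box is suppressed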
+ """ + if dets.shape[0] == 0: + return dets[[], :] + scores = dets[:, 0] + x1 = dets[:, 1] + y1 = dets[:, 2] + x2 = dets[:, 3] + y2 = dets[:, 4] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int32) + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + if match_metric == 'iou': + union = iarea + areas[j] - inter + match_value = inter / union + elif match_metric == 'ios': + smaller = min(iarea, areas[j]) + match_value = inter / smaller + else: + raise ValueError() + if match_value >= match_threshold: + suppressed[j] = 1 + keep = np.where(suppressed == 0)[0] + dets = dets[keep, :] + return dets + + +coco_clsid2catid = { + 0: 1, + 1: 2, + 2: 3, + 3: 4, + 4: 5, + 5: 6, + 6: 7, + 7: 8, + 8: 9, + 9: 10, + 10: 11, + 11: 13, + 12: 14, + 13: 15, + 14: 16, + 15: 17, + 16: 18, + 17: 19, + 18: 20, + 19: 21, + 20: 22, + 21: 23, + 22: 24, + 23: 25, + 24: 27, + 25: 28, + 26: 31, + 27: 32, + 28: 33, + 29: 34, + 30: 35, + 31: 36, + 32: 37, + 33: 38, + 34: 39, + 35: 40, + 36: 41, + 37: 42, + 38: 43, + 39: 44, + 40: 46, + 41: 47, + 42: 48, + 43: 49, + 44: 50, + 45: 51, + 46: 52, + 47: 53, + 48: 54, + 49: 55, + 50: 56, + 51: 57, + 52: 58, + 53: 59, + 54: 60, + 55: 61, + 56: 62, + 57: 63, + 58: 64, + 59: 65, + 60: 67, + 61: 70, + 62: 72, + 63: 73, + 64: 74, + 65: 75, + 66: 76, + 67: 77, + 68: 78, + 69: 79, + 70: 80, + 71: 81, + 72: 82, + 73: 84, + 74: 85, + 75: 86, + 76: 87, + 77: 88, + 78: 89, + 79: 90 +} diff --git a/deploy/python/visualize.py b/deploy/python/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..932be07100c299cd68d5b81f11419fc3673d6f59 --- /dev/null +++ b/deploy/python/visualize.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import os +import cv2 +import numpy as np +from PIL import Image, ImageDraw, ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True +import math + + +def visualize_box_mask(im, results, labels, threshold=0.5): + """ + Args: + im (str/np.ndarray): path of image/np.ndarray read by cv2 + results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + MaskRCNN's results include 'masks': np.ndarray: + shape:[N, im_h, im_w] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): Threshold of score. 
+ Returns: + im (PIL.Image.Image): visualized image + """ + if isinstance(im, str): + im = Image.open(im).convert('RGB') + elif isinstance(im, np.ndarray): + im = Image.fromarray(im) + if 'masks' in results and 'boxes' in results and len(results['boxes']) > 0: + im = draw_mask( + im, results['boxes'], results['masks'], labels, threshold=threshold) + if 'boxes' in results and len(results['boxes']) > 0: + im = draw_box(im, results['boxes'], labels, threshold=threshold) + if 'segm' in results: + im = draw_segm( + im, + results['segm'], + results['label'], + results['score'], + labels, + threshold=threshold) + return im + + +def get_color_map_list(num_classes): + """ + Args: + num_classes (int): number of class + Returns: + color_map (list): RGB color list + """ + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + +def draw_mask(im, np_boxes, np_masks, labels, threshold=0.5): + """ + Args: + im (PIL.Image.Image): PIL image + np_boxes (np.ndarray): shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + np_masks (np.ndarray): shape:[N, im_h, im_w] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): threshold of mask + Returns: + im (PIL.Image.Image): visualized image + """ + color_list = get_color_map_list(len(labels)) + w_ratio = 0.4 + alpha = 0.7 + im = np.array(im).astype('float32') + clsid2color = {} + expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) + np_boxes = np_boxes[expect_boxes, :] + np_masks = np_masks[expect_boxes, :, :] + im_h, im_w = im.shape[:2] + np_masks = np_masks[:, :im_h, :im_w] + for i in range(len(np_masks)): + clsid, score = int(np_boxes[i][0]), np_boxes[i][1] + mask = np_masks[i] + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color_mask = clsid2color[clsid] + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + color_mask = np.array(color_mask) + im[idx[0], idx[1], :] *= 1.0 - alpha + im[idx[0], idx[1], :] += alpha * color_mask + return Image.fromarray(im.astype('uint8')) + + +def draw_box(im, np_boxes, labels, threshold=0.5): + """ + Args: + im (PIL.Image.Image): PIL image + np_boxes (np.ndarray): shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): threshold of box + Returns: + im (PIL.Image.Image): visualized image + """ + draw_thickness = min(im.size) // 320 + draw = ImageDraw.Draw(im) + clsid2color = {} + color_list = get_color_map_list(len(labels)) + expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) + np_boxes = np_boxes[expect_boxes, :] + + for dt in np_boxes: + clsid, bbox, score = int(dt[0]), dt[2:], dt[1] + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color = tuple(clsid2color[clsid]) + + if len(bbox) == 4: + xmin, ymin, xmax, ymax = bbox + print('class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],' + 'right_bottom:[{:.2f},{:.2f}]'.format( + int(clsid), score, xmin, ymin, xmax, ymax)) + # draw bbox + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=draw_thickness, + fill=color) 
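+        # Eight values: a quadrilateral (e.g. rotated) box given by its
+        # four corner points.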
+ elif len(bbox) == 8: + x1, y1, x2, y2, x3, y3, x4, y4 = bbox + draw.line( + [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], + width=2, + fill=color) + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + + # draw label + text = "{} {:.4f}".format(labels[clsid], score) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + return im + + +def draw_segm(im, + np_segms, + np_label, + np_score, + labels, + threshold=0.5, + alpha=0.7): + """ + Draw segmentation on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = get_color_map_list(len(labels)) + im = np.array(im).astype('float32') + clsid2color = {} + np_segms = np_segms.astype(np.uint8) + for i in range(np_segms.shape[0]): + mask, score, clsid = np_segms[i], np_score[i], np_label[i] + if score < threshold: + continue + + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color_mask = clsid2color[clsid] + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + color_mask = np.array(color_mask) + idx0 = np.minimum(idx[0], im.shape[0] - 1) + idx1 = np.minimum(idx[1], im.shape[1] - 1) + im[idx0, idx1, :] *= 1.0 - alpha + im[idx0, idx1, :] += alpha * color_mask + sum_x = np.sum(mask, axis=0) + x = np.where(sum_x > 0.5)[0] + sum_y = np.sum(mask, axis=1) + y = np.where(sum_y > 0.5)[0] + x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] + cv2.rectangle(im, (x0, y0), (x1, y1), + tuple(color_mask.astype('int32').tolist()), 1) + bbox_text = '%s %.2f' % (labels[clsid], score) + t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] + cv2.rectangle(im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), + tuple(color_mask.astype('int32').tolist()), -1) + cv2.putText( + im, + bbox_text, (x0, y0 - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.3, (0, 0, 0), + 1, + lineType=cv2.LINE_AA) + return Image.fromarray(im.astype('uint8')) diff --git a/deploy/serving/README.md b/deploy/serving/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5fb6a033f461ccfca88216508275e7eb006e5017 --- /dev/null +++ b/deploy/serving/README.md @@ -0,0 +1,188 @@ +# 服务端预测部署 + +`PaddleDetection`训练出来的模型可以使用[Serving](https://github.com/PaddlePaddle/Serving) 部署在服务端。 +本教程以在COCO数据集上用`configs/yolov3/yolov3_darknet53_270e_coco.yml`算法训练的模型进行部署。 +预训练模型权重文件为[yolov3_darknet53_270e_coco.pdparams](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) 。 + +## 1. 首先验证模型 +``` +python tools/infer.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --infer_img=demo/000000014439.jpg -o use_gpu=True weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +## 2. 安装 paddle serving +请参考[PaddleServing](https://github.com/PaddlePaddle/Serving/tree/v0.7.0) 中安装教程安装(版本>=0.7.0)。 + +## 3. 
导出模型 +PaddleDetection在训练过程包括网络的前向和优化器相关参数,而在部署过程中,我们只需要前向参数,具体参考:[导出模型](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/EXPORT_MODEL.md) + +``` +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams --export_serving_model=True +``` + +以上命令会在`output_inference/`文件夹下生成一个`yolov3_darknet53_270e_coco`文件夹: +``` +output_inference +│ ├── yolov3_darknet53_270e_coco +│ │ ├── infer_cfg.yml +│ │ ├── model.pdiparams +│ │ ├── model.pdiparams.info +│ │ ├── model.pdmodel +│ │ ├── serving_client +│ │ │ ├── serving_client_conf.prototxt +│ │ │ ├── serving_client_conf.stream.prototxt +│ │ ├── serving_server +│ │ │ ├── __model__ +│ │ │ ├── __params__ +│ │ │ ├── serving_server_conf.prototxt +│ │ │ ├── serving_server_conf.stream.prototxt +│ │ │ ├── ... +``` + +`serving_client`文件夹下`serving_client_conf.prototxt`详细说明了模型输入输出信息 +`serving_client_conf.prototxt`文件内容为: +``` +feed_var { + name: "im_shape" + alias_name: "im_shape" + is_lod_tensor: false + feed_type: 1 + shape: 2 +} +feed_var { + name: "image" + alias_name: "image" + is_lod_tensor: false + feed_type: 1 + shape: 3 + shape: 608 + shape: 608 +} +feed_var { + name: "scale_factor" + alias_name: "scale_factor" + is_lod_tensor: false + feed_type: 1 + shape: 2 +} +fetch_var { + name: "multiclass_nms3_0.tmp_0" + alias_name: "multiclass_nms3_0.tmp_0" + is_lod_tensor: true + fetch_type: 1 + shape: -1 +} +fetch_var { + name: "multiclass_nms3_0.tmp_2" + alias_name: "multiclass_nms3_0.tmp_2" + is_lod_tensor: false + fetch_type: 2 +``` + +## 4. 启动PaddleServing服务 + +``` +cd output_inference/yolov3_darknet53_270e_coco/ + +# GPU +python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 + +# CPU +python -m paddle_serving_server.serve --model serving_server --port 9393 +``` + +## 5. 测试部署的服务 +准备`label_list.txt`文件,示例`label_list.txt`文件内容为 +``` +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush +``` + +设置`prototxt`文件路径为`serving_client/serving_client_conf.prototxt` +设置`fetch`为`fetch=["multiclass_nms3_0.tmp_0"])` + +测试 +``` +# 进入目录 +cd output_inference/yolov3_darknet53_270e_coco/ + +# 测试代码 test_client.py 会自动创建output文件夹,并在output下生成`bbox.json`和`000000014439.jpg`两个文件 +python ../../deploy/serving/test_client.py ../../deploy/serving/label_list.txt ../../demo/000000014439.jpg +``` diff --git a/deploy/serving/cpp/README.md b/deploy/serving/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2e6d4af0c01c0c295bc384dca76de490cc222fba --- /dev/null +++ b/deploy/serving/cpp/README.md @@ -0,0 +1,74 @@ +# C++ Serving预测部署 + +## 1. 
简介 +Paddle Serving是飞桨开源的服务化部署框架,提供了C++ Serving和Python Pipeline两套框架, +C++ Serving框架更倾向于追求极致性能,Python Pipeline框架倾向于二次开发的便捷性。 +旨在帮助深度学习开发者和企业提供高性能、灵活易用的工业级在线推理服务,助力人工智能落地应用。 + +更多关于Paddle Serving的介绍,可以参考[Paddle Serving官网repo](https://github.com/PaddlePaddle/Serving)。 + +本文档主要介绍利用C++ Serving框架实现模型(以yolov3_darknet53_270e_coco为例)的服务化部署。 + +## 2. C++ Serving预测部署 + +#### 2.1 C++ 服务化部署样例程序介绍 +服务化部署的样例程序的目录地址为:`deploy/serving/cpp` +```shell +deploy/ +├── serving/ +│ ├── python/ # Python 服务化部署样例程序目录 +│ │ ├──config.yml # 服务端模型预测相关配置文件 +│ │ ├──pipeline_http_client.py # 客户端代码 +│ │ ├──postprocess_ops.py # 用户自定义后处理代码 +│ │ ├──preprocess_ops.py # 用户自定义预处理代码 +│ │ ├──README.md # 说明文档 +│ │ ├──web_service.py # 服务端代码 +│ ├── cpp/ # C++ 服务化部署样例程序目录 +│ │ ├──preprocess/ # C++ 自定义OP +│ │ ├──build_server.sh # C++ Serving 编译脚本 +│ │ ├──serving_client.py # 客户端代码 +│ │ └── ... +│ └── ... +└── ... +``` + +### 2.2 环境准备 +安装Paddle Serving三个安装包的最新版本, +分别是:paddle-serving-client, paddle-serving-app和paddlepaddle(CPU/GPU版本二选一)。 +```commandline +pip install paddle-serving-client +# pip install paddle-serving-server # CPU +pip install paddle-serving-server-gpu # GPU 默认 CUDA10.2 + TensorRT6,其他环境需手动指定版本号 +pip install paddle-serving-app +# pip install paddlepaddle # CPU +pip install paddlepaddle-gpu +``` +您可能需要使用国内镜像源(例如百度源, 在pip命令中添加`-i https://mirror.baidu.com/pypi/simple`)来加速下载。 +Paddle Serving Server更多不同运行环境的whl包下载地址,请参考:[下载页面](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Latest_Packages_CN.md) +PaddlePaddle更多版本请参考[官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) + +### 2.3 服务化部署模型导出 +导出步骤参考文档[PaddleDetection部署模型导出教程](../../EXPORT_MODEL.md), +导出服务化部署模型需要添加`--export_serving_model True`参数,导出示例如下: +```commandline +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml \ + --export_serving_model True \ + -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams +``` + +### 2.4 编译C++ Serving & 启动服务端模型预测服务 +可使用一键编译脚本`deploy/serving/cpp/build_server.sh`进行编译 +```commandline +bash deploy/serving/cpp/build_server.sh +``` +当完成以上编译安装和模型导出后,可以按如下命令启动模型预测服务: +```commandline +python -m paddle_serving_server.serve --model output_inference/yolov3_darknet53_270e_coco/serving_server --op yolov3_darknet53_270e_coco --port 9997 & +``` +如果需要自定义开发OP,请参考[文档](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/C%2B%2B_Serving/2%2B_model.md)进行开发 + +### 2.5 启动客户端访问 +当成功启动了模型预测服务,可以按如下命令启动客户端访问服务: +```commandline +python deploy/serving/python/serving_client.py --serving_client output_inference/yolov3_darknet53_270e_coco/serving_client --image_file demo/000000014439.jpg --http_port 9997 +``` diff --git a/deploy/serving/cpp/build_server.sh b/deploy/serving/cpp/build_server.sh new file mode 100644 index 0000000000000000000000000000000000000000..803dce07c1cdb9c6a77f063b7b01391f3109667c --- /dev/null +++ b/deploy/serving/cpp/build_server.sh @@ -0,0 +1,70 @@ +#使用镜像: +#registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 + +#编译Serving Server: + +#client和app可以直接使用release版本 + +#server因为加入了自定义OP,需要重新编译 + +apt-get update +apt install -y libcurl4-openssl-dev libbz2-dev +wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && tar xf centos_ssl.tar && rm -rf centos_ssl.tar && mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && ln -sf 
/usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so + +# 安装go依赖 +rm -rf /usr/local/go +wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | tar -xz -C /usr/local +export GOROOT=/usr/local/go +export GOPATH=/root/gopath +export PATH=$PATH:$GOPATH/bin:$GOROOT/bin +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.cn,direct +go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 +go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 +go install github.com/golang/protobuf/protoc-gen-go@v1.4.3 +go install google.golang.org/grpc@v1.33.0 +go env -w GO111MODULE=auto + +# 下载opencv库 +wget https://paddle-qa.bj.bcebos.com/PaddleServing/opencv3.tar.gz && tar -xvf opencv3.tar.gz && rm -rf opencv3.tar.gz +export OPENCV_DIR=$PWD/opencv3 + +# clone Serving +git clone https://github.com/PaddlePaddle/Serving.git -b develop --depth=1 +cd Serving +export Serving_repo_path=$PWD +git submodule update --init --recursive +python -m pip install -r python/requirements.txt + +# set env +export PYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") +export PYTHON_LIBRARIES=$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))") +export PYTHON_EXECUTABLE=`which python` + +export CUDA_PATH='/usr/local/cuda' +export CUDNN_LIBRARY='/usr/local/cuda/lib64/' +export CUDA_CUDART_LIBRARY='/usr/local/cuda/lib64/' +export TENSORRT_LIBRARY_PATH='/usr/local/TensorRT6-cuda10.1-cudnn7/targets/x86_64-linux-gnu/' + +# cp 自定义OP代码 +\cp ../deploy/serving/cpp/preprocess/*.h ${Serving_repo_path}/core/general-server/op +\cp ../deploy/serving/cpp/preprocess/*.cpp ${Serving_repo_path}/core/general-server/op + +# 编译Server, export SERVING_BIN +mkdir server-build-gpu-opencv && cd server-build-gpu-opencv +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \ + -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \ + -DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \ + -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DWITH_OPENCV=ON \ + -DSERVER=ON \ + -DWITH_GPU=ON .. +make -j32 + +python -m pip install python/dist/paddle* +export SERVING_BIN=$PWD/core/general-server/serving +cd ../../ diff --git a/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.cpp b/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b60eb24ce288e349eec73a4bf6c6b7ce8983e7fe --- /dev/null +++ b/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.cpp @@ -0,0 +1,309 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
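+// Custom C++ Serving preprocess OP for mask_rcnn_r50_fpn_1x_coco: decodes a
+// base64-encoded image, applies Resize / Normalize / PadStride / Permute and
+// feeds the im_shape, image and scale_factor tensors to the inference engine.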
+ +#include "core/general-server/op/mask_rcnn_r50_fpn_1x_coco.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int mask_rcnn_r50_fpn_1x_coco::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + Resize(&img, scale_factor_h, scale_factor_w, im_shape_h, im_shape_w); + Normalize(&img, mean_, scale_, is_scale_); + PadStride(&img, 32); + int input_shape_h = img.rows; + int input_shape_w = img.cols; + std::vector input(1 * 3 * input_shape_h * input_shape_w, 0.0f); + Permute(img, input.data()); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // im_shape + std::vector im_shape{static_cast(im_shape_h), + static_cast(im_shape_w)}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, im_shape.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_0(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_0; + tensor_in_0.name = "im_shape"; + tensor_in_0.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_0.shape = {1, 2}; + tensor_in_0.lod = in->at(0).lod; + tensor_in_0.data = paddleBuf_0; + 
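+  // Queue the im_shape tensor first; the image and scale_factor tensors are
+  // built and appended below in the same way.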
real_in->push_back(tensor_in_0); + + // image + in_num = 1 * 3 * input_shape_h * input_shape_w; + databuf_size = in_num * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_1(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_1; + tensor_in_1.name = "image"; + tensor_in_1.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_1.shape = {1, 3, input_shape_h, input_shape_w}; + tensor_in_1.lod = in->at(0).lod; + tensor_in_1.data = paddleBuf_1; + real_in->push_back(tensor_in_1); + + // scale_factor + std::vector scale_factor{scale_factor_h, scale_factor_w}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, scale_factor.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_2; + tensor_in_2.name = "scale_factor"; + tensor_in_2.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_2.shape = {1, 2}; + tensor_in_2.lod = in->at(0).lod; + tensor_in_2.data = paddleBuf_2; + real_in->push_back(tensor_in_2); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void mask_rcnn_r50_fpn_1x_coco::Resize(cv::Mat *img, float &scale_factor_h, + float &scale_factor_w, int &im_shape_h, + int &im_shape_w) { + // keep_ratio + int im_size_max = std::max(img->rows, img->cols); + int im_size_min = std::min(img->rows, img->cols); + int target_size_max = std::max(im_shape_h, im_shape_w); + int target_size_min = std::min(im_shape_h, im_shape_w); + float scale_min = + static_cast(target_size_min) / static_cast(im_size_min); + float scale_max = + static_cast(target_size_max) / static_cast(im_size_max); + float scale_ratio = std::min(scale_min, scale_max); + + // scale_factor + scale_factor_h = scale_ratio; + scale_factor_w = scale_ratio; + + // Resize + cv::resize(*img, *img, cv::Size(), scale_ratio, scale_ratio, 2); + im_shape_h = img->rows; + im_shape_w = img->cols; +} + +void mask_rcnn_r50_fpn_1x_coco::Normalize(cv::Mat *img, + const std::vector &mean, + const std::vector &scale, + const bool is_scale) { + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + (*img).convertTo(*img, CV_32FC3, e); + for (int h = 0; h < img->rows; h++) { + for (int w = 0; w < img->cols; w++) { + img->at(h, w)[0] = + (img->at(h, w)[0] - mean[0]) / scale[0]; + img->at(h, w)[1] = + (img->at(h, w)[1] - mean[1]) / scale[1]; + img->at(h, w)[2] = + (img->at(h, w)[2] - mean[2]) / scale[2]; + } + } +} + +void mask_rcnn_r50_fpn_1x_coco::PadStride(cv::Mat *img, int stride_) { + // PadStride + if (stride_ <= 0) + return; + int rh = img->rows; + int rw = img->cols; + int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_; + int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_; + cv::copyMakeBorder(*img, *img, 0, nh - rh, 0, nw - rw, 
cv::BORDER_CONSTANT, + cv::Scalar(0)); +} + +void mask_rcnn_r50_fpn_1x_coco::Permute(const cv::Mat &img, float *data) { + // Permute + int rh = img.rows; + int rw = img.cols; + int rc = img.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), i); + } +} + +cv::Mat mask_rcnn_r50_fpn_1x_coco::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string mask_rcnn_r50_fpn_1x_coco::base64Decode(const char *Data, + int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + i++; + } + } + return strDecode; +} + +DEFINE_OP(mask_rcnn_r50_fpn_1x_coco); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.h b/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.h new file mode 100644 index 0000000000000000000000000000000000000000..5b2b8377a88b0cbcc313a3dd8a96c35dd9f57f91 --- /dev/null +++ b/deploy/serving/cpp/preprocess/mask_rcnn_r50_fpn_1x_coco.h @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
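+// Declaration of the mask_rcnn_r50_fpn_1x_coco custom Serving OP and its
+// image preprocessing helpers.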
+ +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class mask_rcnn_r50_fpn_1x_coco + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(mask_rcnn_r50_fpn_1x_coco); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 1333; + int im_shape_w = 800; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + + void Resize(cv::Mat *img, float &scale_factor_h, float &scale_factor_w, + int &im_shape_h, int &im_shape_w); + void Normalize(cv::Mat *img, const std::vector &mean, + const std::vector &scale, const bool is_scale); + void PadStride(cv::Mat *img, int stride_ = -1); + void Permute(const cv::Mat &img, float *data); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.cpp b/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66bfeaef21189e395c2f15d716468723465c24b6 --- /dev/null +++ b/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.cpp @@ -0,0 +1,258 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
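+
+// Custom Paddle Serving C++ op for picodet_lcnet_1_5x_416_coco.
+// inference() decodes the base64 request image, converts it from BGR to RGB,
+// resizes it to the fixed 416x416 network input while recording
+// scale_factor_h/scale_factor_w, normalizes and permutes it to CHW, and feeds
+// the "image" and "scale_factor" tensors to the inference engine.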
+ +#include "core/general-server/op/picodet_lcnet_1_5x_416_coco.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int picodet_lcnet_1_5x_416_coco::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + std::vector input(1 * 3 * im_shape_h * im_shape_w, 0.0f); + preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h, + im_shape_w, mean_, scale_, is_scale_); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // image + in_num = 1 * 3 * im_shape_h * im_shape_w; + databuf_size = in_num * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in; + tensor_in.name = "image"; + tensor_in.dtype = paddle::PaddleDType::FLOAT32; + tensor_in.shape = {1, 3, im_shape_h, im_shape_w}; + tensor_in.lod = in->at(0).lod; + tensor_in.data = paddleBuf; + real_in->push_back(tensor_in); + + // scale_factor + std::vector scale_factor{scale_factor_h, scale_factor_w}; + databuf_size = 2 * sizeof(float); + + databuf_data = 
MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, scale_factor.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_2; + tensor_in_2.name = "scale_factor"; + tensor_in_2.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_2.shape = {1, 2}; + tensor_in_2.lod = in->at(0).lod; + tensor_in_2.data = paddleBuf_2; + real_in->push_back(tensor_in_2); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void picodet_lcnet_1_5x_416_coco::preprocess_det( + const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, const std::vector &scale, + const bool is_scale) { + // scale_factor + scale_factor_h = + static_cast(im_shape_h) / static_cast(img.rows); + scale_factor_w = + static_cast(im_shape_w) / static_cast(img.cols); + + // Resize + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0, 2); + + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + cv::Mat img_fp; + (resize_img).convertTo(img_fp, CV_32FC3, e); + for (int h = 0; h < im_shape_h; h++) { + for (int w = 0; w < im_shape_w; w++) { + img_fp.at(h, w)[0] = + (img_fp.at(h, w)[0] - mean[0]) / scale[0]; + img_fp.at(h, w)[1] = + (img_fp.at(h, w)[1] - mean[1]) / scale[1]; + img_fp.at(h, w)[2] = + (img_fp.at(h, w)[2] - mean[2]) / scale[2]; + } + } + + // Permute + int rh = img_fp.rows; + int rw = img_fp.cols; + int rc = img_fp.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), + i); + } +} + +cv::Mat picodet_lcnet_1_5x_416_coco::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string picodet_lcnet_1_5x_416_coco::base64Decode(const char *Data, + int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + 
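+      // '\r' and '\n' bytes are consumed one at a time here, so base64 input
+      // that contains line breaks still decodes correctly.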
i++; + } + } + return strDecode; +} + +DEFINE_OP(picodet_lcnet_1_5x_416_coco); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.h b/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.h new file mode 100644 index 0000000000000000000000000000000000000000..4db649a27b2dbd408b1984511cbb184c112bf1fe --- /dev/null +++ b/deploy/serving/cpp/preprocess/picodet_lcnet_1_5x_416_coco.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class picodet_lcnet_1_5x_416_coco + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(picodet_lcnet_1_5x_416_coco); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 416; + int im_shape_w = 416; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, const bool is_scale); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.cpp b/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d2d62cd321bf1d2d5055b827552337e86b4aa15 --- /dev/null +++ b/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.cpp @@ -0,0 +1,282 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
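+
+// Custom Paddle Serving C++ op for ppyolo_mbv3_large_coco.
+// In addition to the "image" and "scale_factor" inputs used by the other
+// detection ops, this model also expects an "im_shape" tensor, so inference()
+// feeds three tensors built from the fixed 320x320 network shape after the
+// usual base64 decode, BGR->RGB conversion, resize and normalization.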
+ +#include "core/general-server/op/ppyolo_mbv3_large_coco.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int ppyolo_mbv3_large_coco::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + std::vector input(1 * 3 * im_shape_h * im_shape_w, 0.0f); + preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h, + im_shape_w, mean_, scale_, is_scale_); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // im_shape + std::vector im_shape{static_cast(im_shape_h), + static_cast(im_shape_w)}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, im_shape.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_0(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_0; + tensor_in_0.name = "im_shape"; + tensor_in_0.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_0.shape = {1, 2}; + tensor_in_0.lod = in->at(0).lod; + tensor_in_0.data = paddleBuf_0; + real_in->push_back(tensor_in_0); + + // image + in_num = 1 * 3 * im_shape_h * im_shape_w; + databuf_size = in_num * sizeof(float); + + databuf_data = 
MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_1(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_1; + tensor_in_1.name = "image"; + tensor_in_1.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_1.shape = {1, 3, im_shape_h, im_shape_w}; + tensor_in_1.lod = in->at(0).lod; + tensor_in_1.data = paddleBuf_1; + real_in->push_back(tensor_in_1); + + // scale_factor + std::vector scale_factor{scale_factor_h, scale_factor_w}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, scale_factor.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_2; + tensor_in_2.name = "scale_factor"; + tensor_in_2.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_2.shape = {1, 2}; + tensor_in_2.lod = in->at(0).lod; + tensor_in_2.data = paddleBuf_2; + real_in->push_back(tensor_in_2); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void ppyolo_mbv3_large_coco::preprocess_det(const cv::Mat &img, float *data, + float &scale_factor_h, + float &scale_factor_w, + int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, + const bool is_scale) { + // scale_factor + scale_factor_h = + static_cast(im_shape_h) / static_cast(img.rows); + scale_factor_w = + static_cast(im_shape_w) / static_cast(img.cols); + + // Resize + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0, 2); + + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + cv::Mat img_fp; + (resize_img).convertTo(img_fp, CV_32FC3, e); + for (int h = 0; h < im_shape_h; h++) { + for (int w = 0; w < im_shape_w; w++) { + img_fp.at(h, w)[0] = + (img_fp.at(h, w)[0] - mean[0]) / scale[0]; + img_fp.at(h, w)[1] = + (img_fp.at(h, w)[1] - mean[1]) / scale[1]; + img_fp.at(h, w)[2] = + (img_fp.at(h, w)[2] - mean[2]) / scale[2]; + } + } + + // Permute + int rh = img_fp.rows; + int rw = img_fp.cols; + int rc = img_fp.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), + i); + } +} + +cv::Mat ppyolo_mbv3_large_coco::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string ppyolo_mbv3_large_coco::base64Decode(const char *Data, + int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + i++; + } + } + return strDecode; +} + +DEFINE_OP(ppyolo_mbv3_large_coco); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.h b/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.h new file mode 100644 index 0000000000000000000000000000000000000000..5f55e18f51eae4c3f5588594b2db05773d529987 --- /dev/null +++ b/deploy/serving/cpp/preprocess/ppyolo_mbv3_large_coco.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class ppyolo_mbv3_large_coco + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(ppyolo_mbv3_large_coco); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 320; + int im_shape_w = 320; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, const bool is_scale); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.cpp b/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f59c4f341539db3a7b777051c49da6d6f6919166 --- /dev/null +++ b/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.cpp @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "core/general-server/op/ppyoloe_crn_s_300e_coco.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int ppyoloe_crn_s_300e_coco::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + std::vector input(1 * 3 * im_shape_h * im_shape_w, 0.0f); + preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h, + im_shape_w, mean_, scale_, is_scale_); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // image + in_num = 1 * 3 * im_shape_h * im_shape_w; + databuf_size = in_num * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } 
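+  // The preprocessed CHW image for the default 640x640 input holds
+  // 1 * 3 * 640 * 640 floats (about 4.9 MB); it is copied into the pooled
+  // buffer below and handed to the engine as the "image" tensor.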
+ + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in; + tensor_in.name = "image"; + tensor_in.dtype = paddle::PaddleDType::FLOAT32; + tensor_in.shape = {1, 3, im_shape_h, im_shape_w}; + tensor_in.lod = in->at(0).lod; + tensor_in.data = paddleBuf; + real_in->push_back(tensor_in); + + // scale_factor + std::vector scale_factor{scale_factor_h, scale_factor_w}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, scale_factor.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_2; + tensor_in_2.name = "scale_factor"; + tensor_in_2.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_2.shape = {1, 2}; + tensor_in_2.lod = in->at(0).lod; + tensor_in_2.data = paddleBuf_2; + real_in->push_back(tensor_in_2); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void ppyoloe_crn_s_300e_coco::preprocess_det(const cv::Mat &img, float *data, + float &scale_factor_h, + float &scale_factor_w, + int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, + const bool is_scale) { + // scale_factor + scale_factor_h = + static_cast(im_shape_h) / static_cast(img.rows); + scale_factor_w = + static_cast(im_shape_w) / static_cast(img.cols); + + // Resize + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0, 2); + + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + cv::Mat img_fp; + (resize_img).convertTo(img_fp, CV_32FC3, e); + for (int h = 0; h < im_shape_h; h++) { + for (int w = 0; w < im_shape_w; w++) { + img_fp.at(h, w)[0] = + (img_fp.at(h, w)[0] - mean[0]) / scale[0]; + img_fp.at(h, w)[1] = + (img_fp.at(h, w)[1] - mean[1]) / scale[1]; + img_fp.at(h, w)[2] = + (img_fp.at(h, w)[2] - mean[2]) / scale[2]; + } + } + + // Permute + int rh = img_fp.rows; + int rw = img_fp.cols; + int rc = img_fp.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), + i); + } +} + +cv::Mat ppyoloe_crn_s_300e_coco::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string ppyoloe_crn_s_300e_coco::base64Decode(const char *Data, + int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 
43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + i++; + } + } + return strDecode; +} + +DEFINE_OP(ppyoloe_crn_s_300e_coco); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.h b/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.h new file mode 100644 index 0000000000000000000000000000000000000000..cb3e68476998d7fadaafba8e2bc9282c4479a5f8 --- /dev/null +++ b/deploy/serving/cpp/preprocess/ppyoloe_crn_s_300e_coco.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class ppyoloe_crn_s_300e_coco + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(ppyoloe_crn_s_300e_coco); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 640; + int im_shape_w = 640; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, const bool is_scale); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/tinypose_128x96.cpp b/deploy/serving/cpp/preprocess/tinypose_128x96.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ccc94d2c4a35ed9f47f65fab6e74301e35c801d6 --- /dev/null +++ b/deploy/serving/cpp/preprocess/tinypose_128x96.cpp @@ -0,0 +1,232 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "core/general-server/op/tinypose_128x96.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int tinypose_128x96::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + std::vector input(1 * 3 * im_shape_h * im_shape_w, 0.0f); + preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h, + im_shape_w, mean_, scale_, is_scale_); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // image + in_num = 1 * 3 * im_shape_h * im_shape_w; + databuf_size = in_num * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf(databuf_char, 
databuf_size); + paddle::PaddleTensor tensor_in; + tensor_in.name = "image"; + tensor_in.dtype = paddle::PaddleDType::FLOAT32; + tensor_in.shape = {1, 3, im_shape_h, im_shape_w}; + tensor_in.lod = in->at(0).lod; + tensor_in.data = paddleBuf; + real_in->push_back(tensor_in); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void tinypose_128x96::preprocess_det(const cv::Mat &img, float *data, + float &scale_factor_h, + float &scale_factor_w, int im_shape_h, + int im_shape_w, + const std::vector &mean, + const std::vector &scale, + const bool is_scale) { + // Resize + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0, 1); + + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + cv::Mat img_fp; + (resize_img).convertTo(img_fp, CV_32FC3, e); + for (int h = 0; h < im_shape_h; h++) { + for (int w = 0; w < im_shape_w; w++) { + img_fp.at(h, w)[0] = + (img_fp.at(h, w)[0] - mean[0]) / scale[0]; + img_fp.at(h, w)[1] = + (img_fp.at(h, w)[1] - mean[1]) / scale[1]; + img_fp.at(h, w)[2] = + (img_fp.at(h, w)[2] - mean[2]) / scale[2]; + } + } + + // Permute + int rh = img_fp.rows; + int rw = img_fp.cols; + int rc = img_fp.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), + i); + } +} + +cv::Mat tinypose_128x96::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string tinypose_128x96::base64Decode(const char *Data, int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + i++; + } + } + return strDecode; +} + +DEFINE_OP(tinypose_128x96); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/tinypose_128x96.h b/deploy/serving/cpp/preprocess/tinypose_128x96.h new file mode 100644 index 0000000000000000000000000000000000000000..83bf9bf7d17de5fd03407f73bf7e96b512a6fe3e --- /dev/null +++ b/deploy/serving/cpp/preprocess/tinypose_128x96.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class tinypose_128x96 + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(tinypose_128x96); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 128; + int im_shape_w = 96; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, const bool is_scale); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.cpp b/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5937be46c0ffffe07651e7c8ed13be369d03bf7c --- /dev/null +++ b/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.cpp @@ -0,0 +1,282 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
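+
+// Custom Paddle Serving C++ op for yolov3_darknet53_270e_coco.
+// inference() decodes the base64 request image, converts it to RGB, resizes it
+// to the fixed 608x608 network input, normalizes and permutes it to CHW, and
+// feeds the "im_shape", "image" and "scale_factor" tensors to the engine.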
+ +#include "core/general-server/op/yolov3_darknet53_270e_coco.h" +#include "core/predictor/framework/infer.h" +#include "core/predictor/framework/memory.h" +#include "core/predictor/framework/resource.h" +#include "core/util/include/timer.h" +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::InferManager; +using baidu::paddle_serving::predictor::MempoolWrapper; +using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +int yolov3_darknet53_270e_coco::inference() { + VLOG(2) << "Going to run inference"; + const std::vector pre_node_names = pre_names(); + if (pre_node_names.size() != 1) { + LOG(ERROR) << "This op(" << op_name() + << ") can only have one predecessor op, but received " + << pre_node_names.size(); + return -1; + } + const std::string pre_name = pre_node_names[0]; + + const GeneralBlob *input_blob = get_depend_argument(pre_name); + if (!input_blob) { + LOG(ERROR) << "input_blob is nullptr,error"; + return -1; + } + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; + + GeneralBlob *output_blob = mutable_data(); + if (!output_blob) { + LOG(ERROR) << "output_blob is nullptr,error"; + return -1; + } + output_blob->SetLogId(log_id); + + if (!input_blob) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; + return -1; + } + + const TensorVector *in = &input_blob->tensor_vector; + TensorVector *out = &output_blob->tensor_vector; + + int batch_size = input_blob->_batch_size; + output_blob->_batch_size = batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; + + Timer timeline; + int64_t start = timeline.TimeStampUS(); + timeline.Start(); + + // only support string type + char *total_input_ptr = static_cast(in->at(0).data.data()); + std::string base64str = total_input_ptr; + + cv::Mat img = Base2Mat(base64str); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + + // preprocess + std::vector input(1 * 3 * im_shape_h * im_shape_w, 0.0f); + preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h, + im_shape_w, mean_, scale_, is_scale_); + + // create real_in + TensorVector *real_in = new TensorVector(); + if (!real_in) { + LOG(ERROR) << "real_in is nullptr,error"; + return -1; + } + + int in_num = 0; + size_t databuf_size = 0; + void *databuf_data = NULL; + char *databuf_char = NULL; + + // im_shape + std::vector im_shape{static_cast(im_shape_h), + static_cast(im_shape_w)}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, im_shape.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_0(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_0; + tensor_in_0.name = "im_shape"; + tensor_in_0.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_0.shape = {1, 2}; + tensor_in_0.lod = in->at(0).lod; + tensor_in_0.data = paddleBuf_0; + real_in->push_back(tensor_in_0); + + // image + in_num = 1 * 3 * im_shape_h * im_shape_w; + databuf_size = in_num * sizeof(float); + + 
databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, input.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_1(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_1; + tensor_in_1.name = "image"; + tensor_in_1.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_1.shape = {1, 3, im_shape_h, im_shape_w}; + tensor_in_1.lod = in->at(0).lod; + tensor_in_1.data = paddleBuf_1; + real_in->push_back(tensor_in_1); + + // scale_factor + std::vector scale_factor{scale_factor_h, scale_factor_w}; + databuf_size = 2 * sizeof(float); + + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + + memcpy(databuf_data, scale_factor.data(), databuf_size); + databuf_char = reinterpret_cast(databuf_data); + paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size); + paddle::PaddleTensor tensor_in_2; + tensor_in_2.name = "scale_factor"; + tensor_in_2.dtype = paddle::PaddleDType::FLOAT32; + tensor_in_2.shape = {1, 2}; + tensor_in_2.lod = in->at(0).lod; + tensor_in_2.data = paddleBuf_2; + real_in->push_back(tensor_in_2); + + if (InferManager::instance().infer(engine_name().c_str(), real_in, out, + batch_size)) { + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); + return -1; + } + + int64_t end = timeline.TimeStampUS(); + CopyBlobInfo(input_blob, output_blob); + AddBlobInfo(output_blob, start); + AddBlobInfo(output_blob, end); + return 0; +} + +void yolov3_darknet53_270e_coco::preprocess_det(const cv::Mat &img, float *data, + float &scale_factor_h, + float &scale_factor_w, + int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, + const bool is_scale) { + // scale_factor + scale_factor_h = + static_cast(im_shape_h) / static_cast(img.rows); + scale_factor_w = + static_cast(im_shape_w) / static_cast(img.cols); + + // Resize + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0, 2); + + // Normalize + double e = 1.0; + if (is_scale) { + e /= 255.0; + } + cv::Mat img_fp; + (resize_img).convertTo(img_fp, CV_32FC3, e); + for (int h = 0; h < im_shape_h; h++) { + for (int w = 0; w < im_shape_w; w++) { + img_fp.at(h, w)[0] = + (img_fp.at(h, w)[0] - mean[0]) / scale[0]; + img_fp.at(h, w)[1] = + (img_fp.at(h, w)[1] - mean[1]) / scale[1]; + img_fp.at(h, w)[2] = + (img_fp.at(h, w)[2] - mean[2]) / scale[2]; + } + } + + // Permute + int rh = img_fp.rows; + int rw = img_fp.cols; + int rc = img_fp.channels(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), + i); + } +} + +cv::Mat yolov3_darknet53_270e_coco::Base2Mat(std::string &base64_data) { + cv::Mat img; + std::string s_mat; + s_mat = base64Decode(base64_data.data(), base64_data.size()); + std::vector base64_img(s_mat.begin(), s_mat.end()); + img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // CV_LOAD_IMAGE_COLOR + return img; +} + +std::string yolov3_darknet53_270e_coco::base64Decode(const char *Data, + int DataByte) { + const char DecodeTable[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 62, // '+' + 0, 0, 0, + 63, // '/' + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9' + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z' + 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z' + }; + + std::string strDecode; + int nValue; + int i = 0; + while (i < DataByte) { + if (*Data != '\r' && *Data != '\n') { + nValue = DecodeTable[*Data++] << 18; + nValue += DecodeTable[*Data++] << 12; + strDecode += (nValue & 0x00FF0000) >> 16; + if (*Data != '=') { + nValue += DecodeTable[*Data++] << 6; + strDecode += (nValue & 0x0000FF00) >> 8; + if (*Data != '=') { + nValue += DecodeTable[*Data++]; + strDecode += nValue & 0x000000FF; + } + } + i += 4; + } else // 回车换行,跳过 + { + Data++; + i++; + } + } + return strDecode; +} + +DEFINE_OP(yolov3_darknet53_270e_coco); + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.h b/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.h new file mode 100644 index 0000000000000000000000000000000000000000..67593040eadd664d49981c66f37d4e689807ec8f --- /dev/null +++ b/deploy/serving/cpp/preprocess/yolov3_darknet53_270e_coco.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "core/general-server/general_model_service.pb.h" +#include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace serving { + +class yolov3_darknet53_270e_coco + : public baidu::paddle_serving::predictor::OpWithChannel { +public: + typedef std::vector TensorVector; + + DECLARE_OP(yolov3_darknet53_270e_coco); + + int inference(); + +private: + // preprocess + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {0.229f, 0.224f, 0.225f}; + bool is_scale_ = true; + int im_shape_h = 608; + int im_shape_w = 608; + float scale_factor_h = 1.0f; + float scale_factor_w = 1.0f; + void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h, + float &scale_factor_w, int im_shape_h, int im_shape_w, + const std::vector &mean, + const std::vector &scale, const bool is_scale); + + // read pics + cv::Mat Base2Mat(std::string &base64_data); + std::string base64Decode(const char *Data, int DataByte); +}; + +} // namespace serving +} // namespace paddle_serving +} // namespace baidu diff --git a/deploy/serving/cpp/serving_client.py b/deploy/serving/cpp/serving_client.py new file mode 100644 index 0000000000000000000000000000000000000000..49134c30569d60533b131b8a25d6584ab782329c --- /dev/null +++ b/deploy/serving/cpp/serving_client.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import glob +import base64 +import argparse +from paddle_serving_client import Client +from paddle_serving_client.proto import general_model_config_pb2 as m_config +import google.protobuf.text_format + +parser = argparse.ArgumentParser(description="args for paddleserving") +parser.add_argument( + "--serving_client", type=str, help="the directory of serving_client") +parser.add_argument("--image_dir", type=str) +parser.add_argument("--image_file", type=str) +parser.add_argument("--http_port", type=int, default=9997) +parser.add_argument( + "--threshold", type=float, default=0.5, help="Threshold of score.") +args = parser.parse_args() + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +def postprocess(fetch_dict, fetch_vars, draw_threshold=0.5): + result = [] + if "conv2d_441.tmp_1" in fetch_dict: + heatmap = fetch_dict["conv2d_441.tmp_1"] + print(heatmap) + result.append(heatmap) + else: + bboxes = fetch_dict[fetch_vars[0]] + for bbox in bboxes: + if bbox[0] > -1 and bbox[1] > draw_threshold: + print(f"{int(bbox[0])} {bbox[1]} " + f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}") + result.append(f"{int(bbox[0])} {bbox[1]} " + f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}") + return result + + +def get_model_vars(client_config_dir): + # read original serving_client_conf.prototxt + client_config_file = os.path.join(client_config_dir, + "serving_client_conf.prototxt") + with open(client_config_file, 'r') as f: + model_var = google.protobuf.text_format.Merge( + str(f.read()), m_config.GeneralModelConfig()) + # modify feed_var to run core/general-server/op/ + [model_var.feed_var.pop() for _ in range(len(model_var.feed_var))] + feed_var = m_config.FeedVar() + feed_var.name = "input" + feed_var.alias_name = "input" + feed_var.is_lod_tensor = False + feed_var.feed_type = 20 + feed_var.shape.extend([1]) + model_var.feed_var.extend([feed_var]) + with open( + os.path.join(client_config_dir, "serving_client_conf_cpp.prototxt"), + "w") as f: + f.write(str(model_var)) + # get feed_vars/fetch_vars + feed_vars = [var.name for var 
in model_var.feed_var] + fetch_vars = [var.name for var in model_var.fetch_var] + return feed_vars, fetch_vars + + +if __name__ == '__main__': + url = f"127.0.0.1:{args.http_port}" + logid = 10000 + img_list = get_test_images(args.image_dir, args.image_file) + feed_vars, fetch_vars = get_model_vars(args.serving_client) + + client = Client() + client.load_client_config( + os.path.join(args.serving_client, "serving_client_conf_cpp.prototxt")) + client.connect([url]) + + for img_file in img_list: + with open(img_file, 'rb') as file: + image_data = file.read() + image = base64.b64encode(image_data).decode('utf8') + fetch_dict = client.predict( + feed={feed_vars[0]: image}, fetch=fetch_vars) + result = postprocess(fetch_dict, fetch_vars, args.threshold) diff --git a/deploy/serving/cpp/serving_client_conf.prototxt b/deploy/serving/cpp/serving_client_conf.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..fb069003ab8a6b8163d7e06d7760b1c6c42b196a --- /dev/null +++ b/deploy/serving/cpp/serving_client_conf.prototxt @@ -0,0 +1,20 @@ +feed_var { + name: "input" + alias_name: "input" + is_lod_tensor: false + feed_type: 20 + shape: 1 +} +fetch_var { + name: "multiclass_nms3_0.tmp_0" + alias_name: "multiclass_nms3_0.tmp_0" + is_lod_tensor: true + fetch_type: 1 + shape: -1 +} +fetch_var { + name: "multiclass_nms3_0.tmp_2" + alias_name: "multiclass_nms3_0.tmp_2" + is_lod_tensor: false + fetch_type: 2 +} \ No newline at end of file diff --git a/deploy/serving/label_list.txt b/deploy/serving/label_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f42c8eb44628f95b2f4067de928a7f5c1e9c8dc --- /dev/null +++ b/deploy/serving/label_list.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush \ No newline at end of file diff --git a/deploy/serving/python/README.md b/deploy/serving/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..40130d043b0db0bc7e7077088d9eb4c1fc1e6cb6 --- /dev/null +++ b/deploy/serving/python/README.md @@ -0,0 +1,72 @@ +# Python Serving预测部署 + +## 1. 简介 +Paddle Serving是飞桨开源的服务化部署框架,提供了C++ Serving和Python Pipeline两套框架, +C++ Serving框架更倾向于追求极致性能,Python Pipeline框架倾向于二次开发的便捷性。 +旨在帮助深度学习开发者和企业提供高性能、灵活易用的工业级在线推理服务,助力人工智能落地应用。 + +更多关于Paddle Serving的介绍,可以参考[Paddle Serving官网repo](https://github.com/PaddlePaddle/Serving)。 + +本文档主要介绍利用Python Pipeline框架实现模型(以yolov3_darknet53_270e_coco为例)的服务化部署。 + +## 2. 
Python Serving预测部署 + +#### 2.1 Python 服务化部署样例程序介绍 +服务化部署的样例程序的目录地址为:`deploy/serving/python` +```shell +deploy/ +├── serving/ +│ ├── python/ # Python 服务化部署样例程序目录 +│ │ ├──config.yml # 服务端模型预测相关配置文件 +│ │ ├──pipeline_http_client.py # 客户端代码 +│ │ ├──postprocess_ops.py # 用户自定义后处理代码 +│ │ ├──preprocess_ops.py # 用户自定义预处理代码 +│ │ ├──README.md # 说明文档 +│ │ ├──web_service.py # 服务端代码 +│ ├── cpp/ # C++ 服务化部署样例程序目录 +│ │ ├──preprocess/ # C++ 自定义OP +│ │ ├──build_server.sh # C++ Serving 编译脚本 +│ │ ├──serving_client.py # 客户端代码 +│ │ └── ... +│ └── ... +└── ... +``` + +### 2.2 环境准备 +安装Paddle Serving四个安装包的最新版本, +分别是:paddle-serving-server(CPU/GPU版本二选一), +paddle-serving-client, paddle-serving-app和paddlepaddle(CPU/GPU版本二选一)。 +```commandline +pip install paddle-serving-client +# pip install paddle-serving-server # CPU +pip install paddle-serving-server-gpu # GPU 默认 CUDA10.2 + TensorRT6,其他环境需手动指定版本号 +pip install paddle-serving-app +# pip install paddlepaddle # CPU +pip install paddlepaddle-gpu +``` +您可能需要使用国内镜像源(例如百度源, 在pip命令中添加`-i https://mirror.baidu.com/pypi/simple`)来加速下载。 +Paddle Serving Server更多不同运行环境的whl包下载地址,请参考:[下载页面](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Latest_Packages_CN.md) +PaddlePaddle更多版本请参考[官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) + +### 2.3 服务化部署模型导出 +导出步骤参考文档[PaddleDetection部署模型导出教程](../../EXPORT_MODEL.md), +导出服务化部署模型需要添加`--export_serving_model True`参数,导出示例如下: +```commandline +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml \ + --export_serving_model True \ + -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams +``` + +### 2.4 启动服务端模型预测服务 +当完成以上环境准备和模型导出后,可以按如下命令启动模型预测服务: +```commandline +python deploy/serving/python/web_service.py --model_dir output_inference/yolov3_darknet53_270e_coco & +``` +服务端模型预测相关配置可在[config.yml](./config.yml)中修改, +开发者只需要关注如下配置:http_port(服务的http端口),device_type(计算硬件类型),devices(计算硬件ID)。 + +### 2.5 启动客户端访问 +当成功启动了模型预测服务,可以按如下命令启动客户端访问服务: +```commandline +python deploy/serving/python/pipeline_http_client.py --image_file demo/000000014439.jpg +``` diff --git a/deploy/serving/python/config.yml b/deploy/serving/python/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..5ec4285257d618f6c5a7ed02aab5c34dae9a96e1 --- /dev/null +++ b/deploy/serving/python/config.yml @@ -0,0 +1,31 @@ +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +##当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num +worker_num: 20 + +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port +http_port: 18093 +rpc_port: 9993 + +dag: + #op资源类型, True, 为线程模型;False,为进程模型 + is_thread_op: False +op: + #op名称,与web_service中的TIPCExampleService初始化name参数一致 + ppdet: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 + concurrency: 1 + + #当op配置没有server_endpoints时,从local_service_conf读取本地服务配置 + local_service_conf: + + #uci模型路径 + model_config: "./serving_server" + + #计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu + device_type: + + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 + devices: "0" # "0,1" + + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 + client_type: local_predictor diff --git a/deploy/serving/python/pipeline_http_client.py b/deploy/serving/python/pipeline_http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9b30c0d79bf5a7e0d5da7a2538580e7452f8bb --- /dev/null +++ 
b/deploy/serving/python/pipeline_http_client.py @@ -0,0 +1,76 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import requests +import json +import base64 +import os +import argparse + +parser = argparse.ArgumentParser(description="args for paddleserving") +parser.add_argument("--image_dir", type=str) +parser.add_argument("--image_file", type=str) +parser.add_argument("--http_port", type=int, default=18093) +parser.add_argument("--service_name", type=str, default="ppdet") +args = parser.parse_args() + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +if __name__ == "__main__": + url = f"http://127.0.0.1:{args.http_port}/{args.service_name}/prediction" + logid = 10000 + img_list = get_test_images(args.image_dir, args.image_file) + + for img_file in img_list: + with open(img_file, 'rb') as file: + image_data = file.read() + + # base64 encode + image = base64.b64encode(image_data).decode('utf8') + + data = {"key": ["image_0"], "value": [image], "logid": logid} + # send requests + r = requests.post(url=url, data=json.dumps(data)) + print(r.json()) diff --git a/deploy/serving/python/postprocess_ops.py b/deploy/serving/python/postprocess_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..1836f7de776921c4dae97d42e927834a3d2d8613 --- /dev/null +++ b/deploy/serving/python/postprocess_ops.py @@ -0,0 +1,171 @@ +import cv2 +import math +import numpy as np +from preprocess_ops import get_affine_transform + + +class HRNetPostProcess(object): + def __init__(self, use_dark=True): + self.use_dark = use_dark + + def flip_back(self, output_flipped, matched_parts): + assert output_flipped.ndim == 4,\ + 'output_flipped should be [batch_size, num_joints, height, width]' + + output_flipped = output_flipped[:, :, :, ::-1] + + for pair in matched_parts: + tmp = output_flipped[:, pair[0], :, :].copy() + output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] + output_flipped[:, pair[1], :, :] = tmp + + return output_flipped + + def get_max_preds(self, heatmaps): + """get 
predictions from score maps + + Args: + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + + Returns: + preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords + maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints + """ + assert isinstance(heatmaps, + np.ndarray), 'heatmaps should be numpy.ndarray' + assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = heatmaps.shape[0] + num_joints = heatmaps.shape[1] + width = heatmaps.shape[3] + heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + + return preds, maxvals + + def gaussian_blur(self, heatmap, kernel): + border = (kernel - 1) // 2 + batch_size = heatmap.shape[0] + num_joints = heatmap.shape[1] + height = heatmap.shape[2] + width = heatmap.shape[3] + for i in range(batch_size): + for j in range(num_joints): + origin_max = np.max(heatmap[i, j]) + dr = np.zeros((height + 2 * border, width + 2 * border)) + dr[border:-border, border:-border] = heatmap[i, j].copy() + dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) + heatmap[i, j] = dr[border:-border, border:-border].copy() + heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) + return heatmap + + def dark_parse(self, hm, coord): + heatmap_height = hm.shape[0] + heatmap_width = hm.shape[1] + px = int(coord[0]) + py = int(coord[1]) + if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: + dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) + dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) + dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) + dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \ + + hm[py-1][px-1]) + dyy = 0.25 * ( + hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) + derivative = np.matrix([[dx], [dy]]) + hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) + if dxx * dyy - dxy**2 != 0: + hessianinv = hessian.I + offset = -hessianinv * derivative + offset = np.squeeze(np.array(offset.T), axis=0) + coord += offset + return coord + + def dark_postprocess(self, hm, coords, kernelsize): + """ + refer to https://github.com/ilovepose/DarkPose/lib/core/inference.py + + """ + hm = self.gaussian_blur(hm, kernelsize) + hm = np.maximum(hm, 1e-10) + hm = np.log(hm) + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + coords[n, p] = self.dark_parse(hm[n][p], coords[n][p]) + return coords + + def get_final_preds(self, heatmaps, center, scale, kernelsize=3): + """the highest heatvalue location with a quarter offset in the + direction from the highest response to the second highest response. 
+ + Args: + heatmaps (numpy.ndarray): The predicted heatmaps + center (numpy.ndarray): The boxes center + scale (numpy.ndarray): The scale factor + + Returns: + preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords + maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints + """ + + coords, maxvals = self.get_max_preds(heatmaps) + + heatmap_height = heatmaps.shape[2] + heatmap_width = heatmaps.shape[3] + + if self.use_dark: + coords = self.dark_postprocess(heatmaps, coords, kernelsize) + else: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: + diff = np.array([ + hm[py][px + 1] - hm[py][px - 1], + hm[py + 1][px] - hm[py - 1][px] + ]) + coords[n][p] += np.sign(diff) * .25 + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds(coords[i], center[i], scale[i], + [heatmap_width, heatmap_height]) + + return preds, maxvals + + def __call__(self, output, center, scale): + preds, maxvals = self.get_final_preds(output, center, scale) + return np.concatenate( + (preds, maxvals), axis=-1), np.mean( + maxvals, axis=1) + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] diff --git a/deploy/serving/python/preprocess_ops.py b/deploy/serving/python/preprocess_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..15f76159818a159f3967c7778eda7dc53a8a40a4 --- /dev/null +++ b/deploy/serving/python/preprocess_ops.py @@ -0,0 +1,490 @@ +import numpy as np +import cv2 +import copy + + +def decode_image(im): + im = np.array(im) + img_info = { + "im_shape": np.array( + im.shape[:2], dtype=np.float32), + "scale_factor": np.array( + [1., 1.], dtype=np.float32) + } + return im, img_info + + +class Resize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize 
ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + +class NormalizeImage(object): + """normalize image + Args: + mean (list): im - mean + std (list): im / std + is_scale (bool): whether need im / 255 + norm_type (str): type in ['mean_std', 'none'] + """ + + def __init__(self, mean, std, is_scale=True, norm_type='mean_std'): + self.mean = mean + self.std = std + self.is_scale = is_scale + self.norm_type = norm_type + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std + return im, im_info + + +class Permute(object): + """permute image + Args: + to_bgr (bool): whether convert RGB to BGR + channel_first (bool): whether convert HWC to CHW + """ + + def __init__(self, ): + super(Permute, self).__init__() + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.transpose((2, 0, 1)).copy() + return im, im_info + + +class PadStride(object): + """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im, im_info + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + + +class LetterBoxResize(object): + def __init__(self, target_size): + """ + Resize image to target size, convert normalized xywh to pixel xyxy + format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). + Args: + target_size (int|list): image target size. 
+ """ + super(LetterBoxResize, self).__init__() + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + + def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): + # letterbox: resize a rectangular image to a padded rectangular + shape = img.shape[:2] # [height, width] + ratio_h = float(height) / shape[0] + ratio_w = float(width) / shape[1] + ratio = min(ratio_h, ratio_w) + new_shape = (round(shape[1] * ratio), + round(shape[0] * ratio)) # [width, height] + padw = (width - new_shape[0]) / 2 + padh = (height - new_shape[1]) / 2 + top, bottom = round(padh - 0.1), round(padh + 0.1) + left, right = round(padw - 0.1), round(padw + 0.1) + + img = cv2.resize( + img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=color) # padded rectangular + return img, ratio, padw, padh + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + height, width = self.target_size + h, w = im.shape[:2] + im, ratio, padw, padh = self.letterbox(im, height=height, width=width) + + new_shape = [round(h * ratio), round(w * ratio)] + im_info['im_shape'] = np.array(new_shape, dtype=np.float32) + im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32) + return im, im_info + + +class Pad(object): + def __init__(self, size, fill_value=[114.0, 114.0, 114.0]): + """ + Pad image to a specified size. + Args: + size (list[int]): image target size + fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0) + """ + super(Pad, self).__init__() + if isinstance(size, int): + size = [size, size] + self.size = size + self.fill_value = fill_value + + def __call__(self, im, im_info): + im_h, im_w = im.shape[:2] + h, w = self.size + if h == im_h and w == im_w: + im = im.astype(np.float32) + return im, im_info + + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = im.astype(np.float32) + im = canvas + return im, im_info + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def get_affine_transform(center, + input_size, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. 
+ + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + if not isinstance(input_size, (np.ndarray, list)): + input_size = np.array([input_size, input_size], dtype=np.float32) + scale_tmp = input_size + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +class WarpAffine(object): + """Warp affine the image + """ + + def __init__(self, + keep_res=False, + pad=31, + input_h=512, + input_w=512, + scale=0.4, + shift=0.1): + self.keep_res = keep_res + self.pad = pad + self.input_h = input_h + self.input_w = input_w + self.scale = scale + self.shift = shift + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + + h, w = img.shape[:2] + + if self.keep_res: + input_h = (h | self.pad) + 1 + input_w = (w | self.pad) + 1 + s = np.array([input_w, input_h], dtype=np.float32) + c = np.array([w // 2, h // 2], dtype=np.float32) + + else: + s = max(h, w) * 1.0 + input_h, input_w = self.input_h, self.input_w + c = np.array([w / 2., h / 2.], dtype=np.float32) + + trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) + img = cv2.resize(img, (w, h)) + inp = cv2.warpAffine( + img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) + return inp, im_info + + +# keypoint preprocess +def get_warp_matrix(theta, size_input, size_dst, size_target): + """This code is based on + https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py + + Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + matrix (np.ndarray): A matrix for transformation. 
+ """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = np.cos(theta) * scale_x + matrix[0, 1] = -np.sin(theta) * scale_x + matrix[0, 2] = scale_x * ( + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * + np.sin(theta) + 0.5 * size_target[0]) + matrix[1, 0] = np.sin(theta) * scale_y + matrix[1, 1] = np.cos(theta) * scale_y + matrix[1, 2] = scale_y * ( + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * + np.cos(theta) + 0.5 * size_target[1]) + return matrix + + +class TopDownEvalAffine(object): + """apply affine transform to image and coords + + Args: + trainsize (list): [w, h], the standard size used to train + use_udp (bool): whether to use Unbiased Data Processing. + records(dict): the dict contained the image and coords + + Returns: + records (dict): contain the image and coords after tranformed + + """ + + def __init__(self, trainsize, use_udp=False): + self.trainsize = trainsize + self.use_udp = use_udp + + def __call__(self, image, im_info): + rot = 0 + imshape = im_info['im_shape'][::-1] + center = im_info['center'] if 'center' in im_info else imshape / 2. + scale = im_info['scale'] if 'scale' in im_info else imshape + if self.use_udp: + trans = get_warp_matrix( + rot, center * 2.0, + [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + else: + trans = get_affine_transform(center, scale, rot, self.trainsize) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + + return image, im_info + + +class Compose: + def __init__(self, transforms): + self.transforms = [] + for op_info in transforms: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + self.transforms.append(eval(op_type)(**new_op_info)) + + def __call__(self, img): + img, im_info = decode_image(img) + for t in self.transforms: + img, im_info = t(img, im_info) + inputs = copy.deepcopy(im_info) + inputs['image'] = img + return inputs diff --git a/deploy/serving/python/web_service.py b/deploy/serving/python/web_service.py new file mode 100644 index 0000000000000000000000000000000000000000..b273d7ea46a3fc4f6b7d8d0cc4b43f9ffdf14aa6 --- /dev/null +++ b/deploy/serving/python/web_service.py @@ -0,0 +1,259 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
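The `Compose` helper above builds the preprocessing pipeline from a list of config dicts whose `type` field names one of the classes in `preprocess_ops.py`; `web_service.py` (next) feeds it the `Preprocess` section of the exported `infer_cfg.yml`. A minimal sketch with hand-written op parameters (the values are illustrative; the real ones come from the exported config):

```python
import numpy as np
from preprocess_ops import Compose

# Hypothetical op list; at serving time this comes from infer_cfg.yml ('Preprocess').
transforms = [
    {"type": "Resize", "target_size": [608, 608], "keep_ratio": False, "interp": 2},
    {"type": "NormalizeImage", "mean": [0.485, 0.456, 0.406],
     "std": [0.229, 0.224, 0.225], "is_scale": True, "norm_type": "mean_std"},
    {"type": "Permute"},
]

pipeline = Compose(transforms)
dummy = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a decoded RGB image
inputs = pipeline(dummy)  # dict with 'image', 'im_shape', 'scale_factor'
print(inputs["image"].shape, inputs["im_shape"], inputs["scale_factor"])
```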
+import copy + +from paddle_serving_server.web_service import WebService, Op +from paddle_serving_server.proto import general_model_config_pb2 as m_config +import google.protobuf.text_format + +import os +import numpy as np +import base64 +from PIL import Image +import io +from preprocess_ops import Compose +from postprocess_ops import HRNetPostProcess + +from argparse import ArgumentParser, RawDescriptionHelpFormatter +import yaml + +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'RTMDet' +} + +GLOBAL_VAR = {} + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument( + "-c", + "--config", + default="deploy/serving/python/config.yml", + help="configuration file to use") + self.add_argument( + "--model_dir", + type=str, + default=None, + help=("Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py."), + required=True) + self.add_argument( + "-o", "--opt", nargs='+', help="set configuration options") + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config is not None, \ + "Please specify --config=configure_file_path." + args.service_config = self._parse_opt(args.opt, args.config) + args.model_config = PredictConfig(args.model_dir) + return args + + def _parse_helper(self, v): + if v.isnumeric(): + if "." in v: + v = float(v) + else: + v = int(v) + elif v == "True" or v == "False": + v = (v == "True") + return v + + def _parse_opt(self, opts, conf_path): + f = open(conf_path) + config = yaml.load(f, Loader=yaml.Loader) + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=') + v = self._parse_helper(v) + if "devices" in k: + v = str(v) + print(k, v, type(v)) + cur = config + parent = cur + for kk in k.split("."): + if kk not in cur: + cur[kk] = {} + parent = cur + cur = cur[kk] + else: + parent = cur + cur = cur[kk] + parent[k.split(".")[-1]] = v + return config + + +class PredictConfig(object): + """set config of preprocess, postprocess and visualize + Args: + model_dir (str): root path of infer_cfg.yml + """ + + def __init__(self, model_dir): + # parsing Yaml config for Preprocess + deploy_file = os.path.join(model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.label_list = yml_conf['label_list'] + self.use_dynamic_shape = yml_conf['use_dynamic_shape'] + self.draw_threshold = yml_conf.get("draw_threshold", 0.5) + self.mask = yml_conf.get("mask", False) + self.tracker = yml_conf.get("tracker", None) + self.nms = yml_conf.get("NMS", None) + self.fpn_stride = yml_conf.get("fpn_stride", None) + if self.arch == 'RCNN' and yml_conf.get('export_onnx', False): + print( + 'The RCNN export model is used for ONNX and it only supports batch_size = 1' + ) + self.print_config() + + def check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', 
self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +class DetectorOp(Op): + def init_op(self): + self.preprocess_pipeline = Compose(GLOBAL_VAR['preprocess_ops']) + + def preprocess(self, input_dicts, data_id, log_id): + (_, input_dict), = input_dicts.items() + inputs = [] + for key, data in input_dict.items(): + data = base64.b64decode(data.encode('utf8')) + byte_stream = io.BytesIO(data) + img = Image.open(byte_stream).convert("RGB") + inputs.append(self.preprocess_pipeline(img)) + inputs = self.collate_inputs(inputs) + return inputs, False, None, "" + + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): + (_, input_dict), = input_dicts.items() + if GLOBAL_VAR['model_config'].arch in ["HRNet"]: + result = self.parse_keypoint_result(input_dict, fetch_dict) + else: + result = self.parse_detection_result(input_dict, fetch_dict) + return result, None, "" + + def collate_inputs(self, inputs): + collate_inputs = {k: [] for k in inputs[0].keys()} + for info in inputs: + for k in collate_inputs.keys(): + collate_inputs[k].append(info[k]) + return { + k: np.stack(v) + for k, v in collate_inputs.items() if k in GLOBAL_VAR['feed_vars'] + } + + def parse_detection_result(self, input_dict, fetch_dict): + bboxes = fetch_dict[GLOBAL_VAR['fetch_vars'][0]] + bboxes_num = fetch_dict[GLOBAL_VAR['fetch_vars'][1]] + if GLOBAL_VAR['model_config'].mask: + masks = fetch_dict[GLOBAL_VAR['fetch_vars'][2]] + idx = 0 + results = {} + for img_name, num in zip(input_dict.keys(), bboxes_num): + if num == 0: + results[img_name] = 'No object detected!' + else: + result = [] + bbox = bboxes[idx:idx + num] + for line in bbox: + if line[0] > -1 and line[1] > GLOBAL_VAR[ + 'model_config'].draw_threshold: + result.append( + f"{int(line[0])} {line[1]} " + f"{line[2]} {line[3]} {line[4]} {line[5]}") + if len(result) == 0: + result = 'No object detected!' + results[img_name] = result + idx += num + return results + + def parse_keypoint_result(self, input_dict, fetch_dict): + heatmap = fetch_dict["conv2d_441.tmp_1"] + keypoint_postprocess = HRNetPostProcess() + im_shape = [] + for key, data in input_dict.items(): + data = base64.b64decode(data.encode('utf8')) + byte_stream = io.BytesIO(data) + img = Image.open(byte_stream).convert("RGB") + im_shape.append([img.width, img.height]) + im_shape = np.array(im_shape) + center = np.round(im_shape / 2.) + scale = im_shape / 200. 
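+        # Top-down keypoint convention: the whole image is treated as the box, so
+        # the center is the image center and the scale is divided by 200 because
+        # transform_preds in postprocess_ops.py multiplies the scale by 200 when
+        # mapping heatmap coordinates back to the original image.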
+ kpts, scores = keypoint_postprocess(heatmap, center, scale) + results = {"keypoint": kpts, "scores": scores} + return results + + +class DetectorService(WebService): + def get_pipeline_response(self, read_op): + return DetectorOp(name="ppdet", input_ops=[read_op]) + + +def get_model_vars(model_dir, service_config): + serving_server_dir = os.path.join(model_dir, "serving_server") + # rewrite model_config + service_config['op']['ppdet']['local_service_conf'][ + 'model_config'] = serving_server_dir + serving_server_conf = os.path.join(serving_server_dir, + "serving_server_conf.prototxt") + with open(serving_server_conf, 'r') as f: + model_var = google.protobuf.text_format.Merge( + str(f.read()), m_config.GeneralModelConfig()) + feed_vars = [var.name for var in model_var.feed_var] + fetch_vars = [var.name for var in model_var.fetch_var] + return feed_vars, fetch_vars + + +if __name__ == '__main__': + # load config and prepare the service + FLAGS = ArgsParser().parse_args() + feed_vars, fetch_vars = get_model_vars(FLAGS.model_dir, + FLAGS.service_config) + GLOBAL_VAR['feed_vars'] = feed_vars + GLOBAL_VAR['fetch_vars'] = fetch_vars + GLOBAL_VAR['preprocess_ops'] = FLAGS.model_config.preprocess_infos + GLOBAL_VAR['model_config'] = FLAGS.model_config + print(FLAGS) + # define the service + uci_service = DetectorService(name="ppdet") + uci_service.prepare_pipeline_config(yml_dict=FLAGS.service_config) + # start the service + uci_service.run_service() diff --git a/deploy/serving/test_client.py b/deploy/serving/test_client.py new file mode 100644 index 0000000000000000000000000000000000000000..d66d52b1c5708a8f7f36fe969841970cfb1d9cf8 --- /dev/null +++ b/deploy/serving/test_client.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
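Above, `parse_keypoint_result` hands the raw heatmaps to `HRNetPostProcess` together with a per-image center and a scale normalized by 200. A standalone sketch with a dummy heatmap, run from `deploy/serving/python` so both helper modules are importable (the joint count and heatmap resolution are illustrative):

```python
import numpy as np
from postprocess_ops import HRNetPostProcess

# Dummy heatmap: batch of 1, 17 joints, 64x48 heatmap (illustrative shapes only).
heatmap = np.random.rand(1, 17, 64, 48).astype(np.float32)

im_shape = np.array([[640, 480]], dtype=np.float32)  # [[width, height]] per image
center = np.round(im_shape / 2.)  # image center, as in parse_keypoint_result
scale = im_shape / 200.           # top-down convention: scale * 200 = box size

kpts, scores = HRNetPostProcess(use_dark=True)(heatmap, center, scale)
print(kpts.shape)    # (1, 17, 3): x, y, confidence per joint
print(scores.shape)  # (1, 1): mean confidence per image
```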
+ +import sys +import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 +preprocess = Sequential([ + File2Image(), BGR2RGB(), Resize( + (608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose( + (2, 0, 1)) +]) + +postprocess = RCNNPostprocess(sys.argv[1], "output", [608, 608]) +client = Client() + +client.load_client_config("serving_client/serving_client_conf.prototxt") +client.connect(['127.0.0.1:9393']) + +im = preprocess(sys.argv[2]) +fetch_map = client.predict( + feed={ + "image": im, + "im_shape": np.array(list(im.shape[1:])).reshape(-1), + "scale_factor": np.array([1.0, 1.0]).reshape(-1), + }, + fetch=["multiclass_nms3_0.tmp_0"], + batch=False) +print(fetch_map) +fetch_map["image"] = sys.argv[2] +postprocess(fetch_map) diff --git a/deploy/third_engine/demo_avh/.gitignore b/deploy/third_engine/demo_avh/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..faeba235a6894f0bd28aab23dea5f4f559071846 --- /dev/null +++ b/deploy/third_engine/demo_avh/.gitignore @@ -0,0 +1,5 @@ +include/inputs.h +include/outputs.h + +__pycache__/ +build/ \ No newline at end of file diff --git a/deploy/third_engine/demo_avh/Makefile b/deploy/third_engine/demo_avh/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..cf7d375b7e54c7781768db39274d9b3f7128812b --- /dev/null +++ b/deploy/third_engine/demo_avh/Makefile @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Makefile to build demo + +# Setup build environment +BUILD_DIR := build + +ARM_CPU = ARMCM55 +ETHOSU_PATH = /opt/arm/ethosu +CMSIS_PATH ?= ${ETHOSU_PATH}/cmsis +ETHOSU_PLATFORM_PATH ?= ${ETHOSU_PATH}/core_platform +STANDALONE_CRT_PATH := $(abspath $(BUILD_DIR))/runtime +CORSTONE_300_PATH = ${ETHOSU_PLATFORM_PATH}/targets/corstone-300 +PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -std=gnu99 +CMAKE ?= cmake +CC = arm-none-eabi-gcc +AR = arm-none-eabi-ar +RANLIB = arm-none-eabi-ranlib +PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ + -I${STANDALONE_CRT_PATH}/include \ + -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ + -I${PWD}/include \ + -I${CORSTONE_300_PATH} \ + -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ + -I${CMSIS_PATH}/CMSIS/Core/Include \ + -I${CMSIS_PATH}/CMSIS/NN/Include \ + -I${CMSIS_PATH}/CMSIS/DSP/Include \ + -I$(abspath $(BUILD_DIR))/codegen/host/include +CMSIS_NN_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ + -DTARGET_CPU=cortex-m55 \ + -DBUILD_CMSIS_NN_FUNCTIONS=YES +PKG_LDFLAGS = -lm -specs=nosys.specs -static -T corstone300.ld + +$(ifeq VERBOSE,1) +QUIET ?= +$(else) +QUIET ?= @ +$(endif) + +DEMO_MAIN = src/demo_bare_metal.c +CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) +CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) +CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) +UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) + +demo: $(BUILD_DIR)/demo + +$(BUILD_DIR)/stack_allocator.o: $(STANDALONE_CRT_PATH)/src/runtime/crt/memory/stack_allocator.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ + +$(BUILD_DIR)/crt_backend_api.o: $(STANDALONE_CRT_PATH)/src/runtime/crt/common/crt_backend_api.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ + +# Build generated code +$(BUILD_DIR)/libcodegen.a: $(CODEGEN_SRCS) + $(QUIET)cd $(abspath $(BUILD_DIR)/codegen/host/src) && $(CC) -c $(PKG_CFLAGS) $(CODEGEN_SRCS) + $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcodegen.a) $(CODEGEN_OBJS) + $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcodegen.a) + +# Build CMSIS startup code +${BUILD_DIR}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS) + $(QUIET)mkdir -p $(abspath $(BUILD_DIR)/libcmsis_startup) + $(QUIET)cd $(abspath $(BUILD_DIR)/libcmsis_startup) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^ + $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_startup.a) $(abspath $(BUILD_DIR))/libcmsis_startup/*.o + $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_startup.a) + +CMSIS_SHA_FILE=${CMSIS_PATH}/977abe9849781a2e788b02282986480ff4e25ea6.sha +ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") +${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) + $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) all +else +# Build CMSIS-NN +${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) + $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) all +endif + +# Build demo application +ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") +$(BUILD_DIR)/demo: $(DEMO_MAIN) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ + ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a + $(QUIET)mkdir -p 
$(@D) + $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) +else +$(BUILD_DIR)/demo: $(DEMO_MAIN) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ + ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a \ + ${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a \ + ${BUILD_DIR}/cmsis_nn/Source/FullyConnectedFunctions/libCMSISNNFullyConnected.a \ + ${BUILD_DIR}/cmsis_nn/Source/SVDFunctions/libCMSISNNSVDF.a \ + ${BUILD_DIR}/cmsis_nn/Source/ReshapeFunctions/libCMSISNNReshape.a \ + ${BUILD_DIR}/cmsis_nn/Source/ActivationFunctions/libCMSISNNActivation.a \ + ${BUILD_DIR}/cmsis_nn/Source/NNSupportFunctions/libCMSISNNSupport.a \ + ${BUILD_DIR}/cmsis_nn/Source/ConcatenationFunctions/libCMSISNNConcatenation.a \ + ${BUILD_DIR}/cmsis_nn/Source/BasicMathFunctions/libCMSISNNBasicMaths.a \ + ${BUILD_DIR}/cmsis_nn/Source/ConvolutionFunctions/libCMSISNNConvolutions.a \ + ${BUILD_DIR}/cmsis_nn/Source/PoolingFunctions/libCMSISNNPooling.a + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) +endif + +clean: + $(QUIET)rm -rf $(BUILD_DIR)/codegen + +cleanall: + $(QUIET)rm -rf $(BUILD_DIR) + +.SUFFIXES: + +.DEFAULT: demo diff --git a/deploy/third_engine/demo_avh/README.md b/deploy/third_engine/demo_avh/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17454619fd894070f0cc221df509812619967108 --- /dev/null +++ b/deploy/third_engine/demo_avh/README.md @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + +Running PP-PicoDet object detection model on bare metal Arm(R) Cortex(R)-M55 CPU using Arm Virtual Hardware +====================================================================== + +This folder contains an example of how to run a PP-PicoDet model on bare metal [Cortex(R)-M55 CPU](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m55) using [Arm Virtual Hardware](https://www.arm.com/products/development-tools/simulation/virtual-hardware). + + +Running environment and prerequisites +------------- +Case 1: If the demo is run in Arm Virtual Hardware Amazon Machine Image(AMI) instance hosted by [AWS](https://aws.amazon.com/marketplace/pp/prodview-urbpq7yo5va7g?sr=0-1&ref_=beagle&applicationId=AWSMPContessa)/[AWS China](https://awsmarketplace.amazonaws.cn/marketplace/pp/prodview-2y7nefntbmybu), the following software will be installed through [configure_avh.sh](./configure_avh.sh) script. It will install automatically when you run the application through [run_demo.sh](./run_demo.sh) script. +You can refer to this [guide](https://arm-software.github.io/AVH/main/examples/html/MicroSpeech.html#amilaunch) to launch an Arm Virtual Hardware AMI instance. + +Case 2: If the demo is run in the [ci_cpu Docker container](https://github.com/apache/tvm/blob/main/docker/Dockerfile.ci_cpu) provided with [TVM](https://github.com/apache/tvm), then the following software will already be installed. + +Case 3: If the demo is not run in the ci_cpu Docker container, then you will need the following: +- Software required to build and run the demo (These can all be installed by running + tvm/docker/install/ubuntu_install_ethosu_driver_stack.sh.) 
+ - [Fixed Virtual Platform (FVP) based on Arm(R) Corstone(TM)-300 software](https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps) + - [cmake 3.19.5](https://github.com/Kitware/CMake/releases/) + - [GCC toolchain from Arm(R)](https://developer.arm.com/-/media/Files/downloads/gnu-rm/10-2020q4/gcc-arm-none-eabi-10-2020-q4-major-x86_64-linux.tar.bz2) + - [Arm(R) Ethos(TM)-U NPU driver stack](https://review.mlplatform.org) + - [CMSIS](https://github.com/ARM-software/CMSIS_5) +- The python libraries listed in the requirements.txt of this directory + - These can be installed by running the following from the current directory: + ```bash + pip install -r ./requirements.txt + ``` + +In case2 and case3: + +You will need to update your PATH environment variable to include the path to cmake 3.19.5 and the FVP. +For example if you've installed these in ```/opt/arm``` , then you would do the following: +```bash +export PATH=/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:/opt/arm/cmake/bin:$PATH +``` + +You will also need TVM which can either be: + - Installed from TLCPack(see [TLCPack](https://tlcpack.ai/)) + - Built from source (see [Install from Source](https://tvm.apache.org/docs/install/from_source.html)) + - When building from source, the following need to be set in config.cmake: + - set(USE_CMSISNN ON) + - set(USE_MICRO ON) + - set(USE_LLVM ON) + + +Running the demo application +---------------------------- +Type the following command to run the bare metal text recognition application ([src/demo_bare_metal.c](./src/demo_bare_metal.c)): + +```bash +./run_demo.sh +``` + +If you are not able to use Arm Virtual Hardware Amazon Machine Image(AMI) instance hosted by AWS/AWS China, specify argument --enable_FVP to 1 to make the application run on local Fixed Virtual Platforms (FVPs) executables. + +```bash +./run_demo.sh --enable_FVP 1 +``` + +If the Ethos(TM)-U platform and/or CMSIS have not been installed in /opt/arm/ethosu then +the locations for these can be specified as arguments to run_demo.sh, for example: + +```bash +./run_demo.sh --cmsis_path /home/tvm-user/cmsis \ +--ethosu_platform_path /home/tvm-user/ethosu/core_platform +``` + +With [run_demo.sh](./run_demo.sh) to run the demo application, it will: +- Set up running environment by installing the required prerequisites automatically if running in Arm Virtual Hardware Amazon AMI instance(not specify --enable_FVP to 1) +- Download a PP-PicoDet model +- Use tvmc to compile the text recognition model for Cortex(R)-M55 CPU and CMSIS-NN +- Create a C header file inputs.c containing the image data as a C array +- Create a C header file outputs.c containing a C array where the output of inference will be stored +- Build the demo application +- Run the demo application on a Arm Virtual Hardware based on Arm(R) Corstone(TM)-300 software +- The application will report the text on the image and the corresponding score. + +Using your own image +-------------------- +The create_image.py script takes a single argument on the command line which is the path of the +image to be converted into an array of bytes for consumption by the model. + +The demo can be modified to use an image of your choice by changing the following line in run_demo.sh + +```bash +python3 ./convert_image.py path/to/image +``` + +Model description +----------------- +In this demo, the model we used is based on [PP-PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.6/configs/picodet). 
Because of the excellent performance, PP-PicoDet are very suitable for deployment on mobile or CPU. And it is released by [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). diff --git a/deploy/third_engine/demo_avh/README.md.bak b/deploy/third_engine/demo_avh/README.md.bak new file mode 100644 index 0000000000000000000000000000000000000000..18c6a442de90b5a8bae36a598dc70474dd0c068b --- /dev/null +++ b/deploy/third_engine/demo_avh/README.md.bak @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + +Running PP-PicoDet via TVM on bare metal Arm(R) Cortex(R)-M55 CPU and CMSIS-NN +=============================================================== + +This folder contains an example of how to use TVM to run a PP-PicoDet model +on bare metal Cortex(R)-M55 CPU and CMSIS-NN. + +Prerequisites +------------- +If the demo is run in the ci_cpu Docker container provided with TVM, then the following +software will already be installed. + +If the demo is not run in the ci_cpu Docker container, then you will need the following: +- Software required to build and run the demo (These can all be installed by running + tvm/docker/install/ubuntu_install_ethosu_driver_stack.sh.) + - [Fixed Virtual Platform (FVP) based on Arm(R) Corstone(TM)-300 software](https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps) + - [cmake 3.19.5](https://github.com/Kitware/CMake/releases/) + - [GCC toolchain from Arm(R)](https://developer.arm.com/-/media/Files/downloads/gnu-rm/10-2020q4/gcc-arm-none-eabi-10-2020-q4-major-x86_64-linux.tar.bz2) + - [Arm(R) Ethos(TM)-U NPU driver stack](https://review.mlplatform.org) + - [CMSIS](https://github.com/ARM-software/CMSIS_5) +- The python libraries listed in the requirements.txt of this directory + - These can be installed by running the following from the current directory: + ```bash + pip install -r ./requirements.txt + ``` + +You will also need TVM which can either be: + - Built from source (see [Install from Source](https://tvm.apache.org/docs/install/from_source.html)) + - When building from source, the following need to be set in config.cmake: + - set(USE_CMSISNN ON) + - set(USE_MICRO ON) + - set(USE_LLVM ON) + - Installed from TLCPack(see [TLCPack](https://tlcpack.ai/)) + +You will need to update your PATH environment variable to include the path to cmake 3.19.5 and the FVP. 
+For example if you've installed these in ```/opt/arm``` , then you would do the following: +```bash +export PATH=/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:/opt/arm/cmake/bin:$PATH +``` + +Running the demo application +---------------------------- +Type the following command to run the bare metal text recognition application ([src/demo_bare_metal.c](./src/demo_bare_metal.c)): +```bash +./run_demo.sh +``` +If the Ethos(TM)-U platform and/or CMSIS have not been installed in /opt/arm/ethosu then +the locations for these can be specified as arguments to run_demo.sh, for example: + +```bash +./run_demo.sh --cmsis_path /home/tvm-user/cmsis \ +--ethosu_platform_path /home/tvm-user/ethosu/core_platform +``` + +This will: +- Download a PP-PicoDet text recognition model +- Use tvmc to compile the text recognition model for Cortex(R)-M55 CPU and CMSIS-NN +- Create a C header file inputs.c containing the image data as a C array +- Create a C header file outputs.c containing a C array where the output of inference will be stored +- Build the demo application +- Run the demo application on a Fixed Virtual Platform (FVP) based on Arm(R) Corstone(TM)-300 software +- The application will report the text on the image and the corresponding score. + +Using your own image +-------------------- +The create_image.py script takes a single argument on the command line which is the path of the +image to be converted into an array of bytes for consumption by the model. + +The demo can be modified to use an image of your choice by changing the following line in run_demo.sh + +```bash +python3 ./convert_image.py ../../demo/000000014439_640x640.jpg +``` + +Model description +----------------- diff --git a/deploy/third_engine/demo_avh/arm-none-eabi-gcc.cmake b/deploy/third_engine/demo_avh/arm-none-eabi-gcc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..415b3139be1b7f891c017dff0dc299b67f7ef2fe --- /dev/null +++ b/deploy/third_engine/demo_avh/arm-none-eabi-gcc.cmake @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if (__TOOLCHAIN_LOADED) + return() +endif() +set(__TOOLCHAIN_LOADED TRUE) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_C_COMPILER "arm-none-eabi-gcc") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") +set(CMAKE_SYSTEM_PROCESSOR "cortex-m55" CACHE STRING "Select Arm(R) Cortex(R)-M architecture. (cortex-m0, cortex-m3, cortex-m33, cortex-m4, cortex-m55, cortex-m7, etc)") + +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_CXX_STANDARD 14) + +# The system processor could for example be set to cortex-m33+nodsp+nofp. 
+set(__CPU_COMPILE_TARGET ${CMAKE_SYSTEM_PROCESSOR}) +string(REPLACE "+" ";" __CPU_FEATURES ${__CPU_COMPILE_TARGET}) +list(POP_FRONT __CPU_FEATURES CMAKE_SYSTEM_PROCESSOR) + +string(FIND ${__CPU_COMPILE_TARGET} "+" __OFFSET) +if(__OFFSET GREATER_EQUAL 0) + string(SUBSTRING ${__CPU_COMPILE_TARGET} ${__OFFSET} -1 CPU_FEATURES) +endif() + +# Add -mcpu to the compile options to override the -mcpu the CMake toolchain adds +add_compile_options(-mcpu=${__CPU_COMPILE_TARGET}) + +# Set floating point unit +if("${__CPU_COMPILE_TARGET}" MATCHES "\\+fp") + set(FLOAT hard) +elseif("${__CPU_COMPILE_TARGET}" MATCHES "\\+nofp") + set(FLOAT soft) +elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "cortex-m33" OR + "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "cortex-m55") + set(FLOAT hard) +else() + set(FLOAT soft) +endif() + +add_compile_options(-mfloat-abi=${FLOAT}) +add_link_options(-mfloat-abi=${FLOAT}) + +# Link target +add_link_options(-mcpu=${__CPU_COMPILE_TARGET}) +add_link_options(-Xlinker -Map=output.map) + +# +# Compile options +# +set(cxx_flags "-fno-unwind-tables;-fno-rtti;-fno-exceptions") + +add_compile_options("-Wall;-Wextra;-Wsign-compare;-Wunused;-Wswitch-default;\ +-Wdouble-promotion;-Wredundant-decls;-Wshadow;-Wnull-dereference;\ +-Wno-format-extra-args;-Wno-unused-function;-Wno-unused-label;\ +-Wno-missing-field-initializers;-Wno-return-type;-Wno-format;-Wno-int-conversion" + "$<$:${cxx_flags}>" +) diff --git a/deploy/third_engine/demo_avh/configure_avh.sh b/deploy/third_engine/demo_avh/configure_avh.sh new file mode 100644 index 0000000000000000000000000000000000000000..8042fd81d2379c6f7489d90372dffd2dc10e145e --- /dev/null +++ b/deploy/third_engine/demo_avh/configure_avh.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Copyright (c) 2022 Arm Limited and Contributors. All rights reserved. +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e +set -u +set -o pipefail + +# Show usage +function show_usage() { + cat <&2 + show_usage >&2 + exit 1 + fi + ;; + + --ethosu_platform_path) + if [ $# -gt 1 ] + then + export ETHOSU_PLATFORM_PATH="$2" + shift 2 + else + echo 'ERROR: --ethosu_platform_path requires a non-empty argument' >&2 + show_usage >&2 + exit 1 + fi + ;; + + --fvp_path) + if [ $# -gt 1 ] + then + export PATH="$2/models/Linux64_GCC-6.4:$PATH" + shift 2 + else + echo 'ERROR: --fvp_path requires a non-empty argument' >&2 + show_usage >&2 + exit 1 + fi + ;; + + --cmake_path) + if [ $# -gt 1 ] + then + export CMAKE="$2" + shift 2 + else + echo 'ERROR: --cmake_path requires a non-empty argument' >&2 + show_usage >&2 + exit 1 + fi + ;; + + --enable_FVP) + if [ $# -gt 1 ] && [ "$2" == "1" -o "$2" == "0" ]; + then + FVP_enable="$2" + shift 2 + else + echo 'ERROR: --enable_FVP requires a right argument 1 or 0' >&2 + show_usage >&2 + exit 1 + fi + ;; + + -*|--*) + echo "Error: Unknown flag: $1" >&2 + show_usage >&2 + exit 1 + ;; + esac +done + +# Choose running environment: cloud(default) or local environment +Platform="VHT_Corstone_SSE-300_Ethos-U55" +if [ $FVP_enable == "1" ]; then + Platform="FVP_Corstone_SSE-300_Ethos-U55" + echo -e "\e[36mRun application on local Fixed Virtual Platforms (FVPs)\e[0m" +else + if [ ! -d "/opt/arm/" ]; then + sudo ./configure_avh.sh + fi +fi + +# Directories +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +# Make build directory +make cleanall +mkdir -p build +cd build + +# Get PaddlePaddle inference model +echo -e "\e[36mDownload PaddlePaddle inference model\e[0m" +wget https://bj.bcebos.com/v1/paddledet/deploy/Inference/picodet_s_320_coco_lcnet_no_nms.tar +tar -xf picodet_s_320_coco_lcnet_no_nms.tar + +# Compile model for Arm(R) Cortex(R)-M55 CPU and CMSIS-NN +# An alternative to using "python3 -m tvm.driver.tvmc" is to call +# "tvmc" directly once TVM has been pip installed. +python3 -m tvm.driver.tvmc compile --target=cmsis-nn,c \ + --target-cmsis-nn-mcpu=cortex-m55 \ + --target-c-mcpu=cortex-m55 \ + --runtime=crt \ + --executor=aot \ + --executor-aot-interface-api=c \ + --executor-aot-unpacked-api=1 \ + --pass-config tir.usmp.enable=1 \ + --pass-config tir.usmp.algorithm=hill_climb \ + --pass-config tir.disable_storage_rewrite=1 \ + --pass-config tir.disable_vectorize=1 picodet_s_320_coco_lcnet_no_nms/model.pdmodel \ + --output-format=mlf \ + --model-format=paddle \ + --module-name=picodet \ + --input-shapes image:[1,3,320,320] \ + --output=picodet.tar +tar -xf picodet.tar + +# Create C header files +cd .. 
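+# convert_image.py turns the test image into the C arrays for the model input
+# and output buffers used by src/demo_bare_metal.c (see "Using your own image"
+# in the README); change the image path below to run on a different image.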
+python3 ./convert_image.py ./image/000000014439_640x640.jpg + +# Build demo executable +cd ${script_dir} +echo ${script_dir} +make + +# Run demo executable on the AVH +$Platform -C cpu0.CFGDTCMSZ=15 \ +-C cpu0.CFGITCMSZ=15 -C mps3_board.uart0.out_file=\"-\" -C mps3_board.uart0.shutdown_tag=\"EXITTHESIM\" \ +-C mps3_board.visualisation.disable-visualisation=1 -C mps3_board.telnetterminal0.start_telnet=0 \ +-C mps3_board.telnetterminal1.start_telnet=0 -C mps3_board.telnetterminal2.start_telnet=0 -C mps3_board.telnetterminal5.start_telnet=0 \ +./build/demo --stat diff --git a/deploy/third_engine/demo_avh/src/demo_bare_metal.c b/deploy/third_engine/demo_avh/src/demo_bare_metal.c new file mode 100644 index 0000000000000000000000000000000000000000..07ed5bebe2c266bde5b59b101c1df1a54ba2ef28 --- /dev/null +++ b/deploy/third_engine/demo_avh/src/demo_bare_metal.c @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "uart.h" + +// Header files generated by convert_image.py +#include "inputs.h" +#include "outputs.h" + +int main(int argc, char **argv) { + uart_init(); + printf("Starting PicoDet inference:\n"); + struct tvmgen_picodet_outputs rec_outputs = { + .output0 = output0, .output1 = output1, + }; + struct tvmgen_picodet_inputs rec_inputs = { + .image = input, + }; + + tvmgen_picodet_run(&rec_inputs, &rec_outputs); + + // post process + for (int i = 0; i < output0_len / 4; i++) { + float score = 0; + int32_t class = 0; + for (int j = 0; j < 80; j++) { + if (output1[i + j * 2125] > score) { + score = output1[i + j * 2125]; + class = j; + } + } + if (score > 0.1 && output0[i * 4] > 0 && output0[i * 4 + 1] > 0) { + printf("box: %f, %f, %f, %f, class: %d, score: %f\n", output0[i * 4] * 2, + output0[i * 4 + 1] * 2, output0[i * 4 + 2] * 2, + output0[i * 4 + 3] * 2, class, score); + } + } + return 0; +} diff --git a/deploy/third_engine/demo_mnn/CMakeLists.txt b/deploy/third_engine/demo_mnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9afa8cfc011587977b4ef3ed13bb0b050e990fa0 --- /dev/null +++ b/deploy/third_engine/demo_mnn/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.9) +project(picodet-mnn) + +set(CMAKE_CXX_STANDARD 17) +set(MNN_DIR PATHS "./mnn") + +# find_package(OpenCV REQUIRED PATHS "/work/dependence/opencv/opencv-3.4.3/build") +find_package(OpenCV REQUIRED) +include_directories( + ${MNN_DIR}/include + ${MNN_DIR}/include/MNN + ${CMAKE_SOURCE_DIR} +) +link_directories(mnn/lib) + +add_library(libMNN SHARED IMPORTED) +set_target_properties( + libMNN + PROPERTIES IMPORTED_LOCATION + ${CMAKE_SOURCE_DIR}/mnn/lib/libMNN.so +) +add_executable(picodet-mnn main.cpp picodet_mnn.cpp) +target_link_libraries(picodet-mnn MNN ${OpenCV_LIBS} 
libMNN.so)
diff --git a/deploy/third_engine/demo_mnn/README.md b/deploy/third_engine/demo_mnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac11a8e18fdc53aa7eebb57fa1ba2d4680a9dcf3
--- /dev/null
+++ b/deploy/third_engine/demo_mnn/README.md
@@ -0,0 +1,89 @@
+# PicoDet MNN Demo
+
+This demo provides inference code for PicoDet based on [Alibaba's MNN framework](https://github.com/alibaba/MNN).
+
+## C++ Demo
+
+- Step 1: Build the MNN inference library following the [official MNN build documentation](https://www.yuque.com/mnn/en/build_linux).
+- Step 2: Build or download the OpenCV library (see the OpenCV website). For convenience, if your environment is gcc 8.2 on x86, you can download the prebuilt library directly:
+```shell
+wget https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz
+tar -xf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz
+```
+
+- Step 3: Prepare the model
+  ```shell
+  modelName=picodet_s_320_coco_lcnet
+  # Export the inference model
+  python tools/export_model.py \
+          -c configs/picodet/${modelName}.yml \
+          -o weights=${modelName}.pdparams \
+          --output_dir=inference_model
+  # Convert to ONNX
+  paddle2onnx --model_dir inference_model/${modelName} \
+  --model_filename model.pdmodel \
+  --params_filename model.pdiparams \
+  --opset_version 11 \
+  --save_file ${modelName}.onnx
+  # Simplify the model
+  python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx
+  # Convert the model to MNN format
+  python -m MNN.tools.mnnconvert -f ONNX --modelFile picodet_s_320_lcnet_processed.onnx --MNNModel picodet_s_320_lcnet.mnn
+  ```
+For a quick test, you can download the converted model directly: [picodet_s_320_lcnet.mnn](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_lcnet.mnn) (without post-processing).
+
+**Note:** Because MNN's MatMul operator has computation issues when its input shapes do not match, the demo with post-processing is being upgraded and will be released soon.
+
+## Build the executable
+
+- Step 1: Import the library
+```
+mkdir mnn && cd mnn && mkdir lib
+cp /path/to/MNN/build/libMNN.so .
+cd ..
+cp -r /path/to/MNN/include .
+```
+- Step 2: Modify the OpenCV and MNN paths in CMakeLists.txt
+- Step 3: Build
+``` shell
+mkdir build && cd build
+cmake ..
+make
+```
+If the `picodet-mnn` executable is generated in the build directory, the build succeeded.
+
+## Run
+
+First create a directory for the prediction results:
+```shell
+cp -r ../demo_onnxruntime/imgs .
+cd build
+mkdir ../results
+```
+
+- Predict a single image
+``` shell
+./picodet-mnn 0 ../picodet_s_320_lcnet_3.mnn 320 320 ../imgs/dog.jpg
+```
+
+- Speed benchmark
+
+``` shell
+./picodet-mnn 1 ../picodet_s_320_lcnet.mnn 320 320
+```
+
+## FAQ
+
+- The prediction results are inaccurate:
+First check that the model input shape matches and that the model output names match. The output names of the enhanced PicoDet model without post-processing are as follows:
+```shell
+# classification branch | detection branch
+{"transpose_0.tmp_0", "transpose_1.tmp_0"},
+{"transpose_2.tmp_0", "transpose_3.tmp_0"},
+{"transpose_4.tmp_0", "transpose_5.tmp_0"},
+{"transpose_6.tmp_0", "transpose_7.tmp_0"},
+```
+You can inspect the exact names with [netron](https://netron.app) and update the corresponding `non_postprocess_heads_info` array in `picodet_mnn.hpp`.
+
+## Reference
+[MNN](https://github.com/alibaba/MNN)
diff --git a/deploy/third_engine/demo_mnn/main.cpp b/deploy/third_engine/demo_mnn/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5737368d5473a75ced391ad2e28883427a942795
--- /dev/null
+++ b/deploy/third_engine/demo_mnn/main.cpp
@@ -0,0 +1,203 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
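+//
+// Entry point for the PicoDet MNN demo: mode 0 runs detection on the image(s)
+// given on the command line and draws/saves the boxes, mode 1 runs a speed
+// benchmark on a dummy input (see the usage examples in README.md).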
+ +#include "picodet_mnn.hpp" +#include +#include +#include +#include + +#define __SAVE_RESULT__ // if defined save drawed results to ../results, else + // show it in windows + +struct object_rect { + int x; + int y; + int width; + int height; +}; + +std::vector GenerateColorMap(int num_class) { + auto colormap = std::vector(3 * num_class, 0); + for (int i = 0; i < num_class; ++i) { + int j = 0; + int lab = i; + while (lab) { + colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j)); + colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)); + colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)); + ++j; + lab >>= 3; + } + } + return colormap; +} + +void draw_bboxes(const cv::Mat &im, const std::vector &bboxes, + std::string save_path = "None") { + static const char *class_names[] = { + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = im.clone(); + int src_w = image.cols; + int src_h = image.rows; + int thickness = 2; + auto colormap = GenerateColorMap(sizeof(class_names)); + + for (size_t i = 0; i < bboxes.size(); i++) { + const BoxInfo &bbox = bboxes[i]; + std::cout << bbox.x1 << ". " << bbox.y1 << ". " << bbox.x2 << ". " + << bbox.y2 << ". 
" << std::endl; + int c1 = colormap[3 * bbox.label + 0]; + int c2 = colormap[3 * bbox.label + 1]; + int c3 = colormap[3 * bbox.label + 2]; + cv::Scalar color = cv::Scalar(c1, c2, c3); + // cv::Scalar color = cv::Scalar(0, 0, 255); + cv::rectangle(image, cv::Rect(cv::Point(bbox.x1, bbox.y1), + cv::Point(bbox.x2, bbox.y2)), + color, 1, cv::LINE_AA); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100); + + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); + + int x = bbox.x1; + int y = bbox.y1 - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, + label_size.height + baseLine)), + color, -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255), 1, + cv::LINE_AA); + } + + if (save_path == "None") { + cv::imshow("image", image); + } else { + cv::imwrite(save_path, image); + std::cout << save_path << std::endl; + } +} + +int image_demo(PicoDet &detector, const char *imagepath) { + std::vector filenames; + cv::glob(imagepath, filenames, false); + + for (auto img_name : filenames) { + cv::Mat image = cv::imread(img_name, cv::IMREAD_COLOR); + if (image.empty()) { + fprintf(stderr, "cv::imread %s failed\n", img_name.c_str()); + return -1; + } + std::vector results; + detector.detect(image, results, false); + std::cout << "detect done." << std::endl; + +#ifdef __SAVE_RESULT__ + std::string save_path = img_name; + draw_bboxes(image, results, save_path.replace(3, 4, "results")); +#else + draw_bboxes(image, results); + cv::waitKey(0); +#endif + } + return 0; +} + +int benchmark(PicoDet &detector, int width, int height) { + int loop_num = 100; + int warm_up = 8; + + double time_min = DBL_MAX; + double time_max = -DBL_MAX; + double time_avg = 0; + cv::Mat image(width, height, CV_8UC3, cv::Scalar(1, 1, 1)); + for (int i = 0; i < warm_up + loop_num; i++) { + auto start = std::chrono::steady_clock::now(); + std::vector results; + detector.detect(image, results, false); + auto end = std::chrono::steady_clock::now(); + + std::chrono::duration elapsed = end - start; + double time = elapsed.count(); + if (i >= warm_up) { + time_min = (std::min)(time_min, time); + time_max = (std::max)(time_max, time); + time_avg += time; + } + } + time_avg /= loop_num; + fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", + time_min, time_max, time_avg); + return 0; +} + +int main(int argc, char **argv) { + int mode = atoi(argv[1]); + std::string model_path = argv[2]; + int height = 320; + int width = 320; + if (argc == 4) { + height = atoi(argv[3]); + width = atoi(argv[4]); + } + PicoDet detector = PicoDet(model_path, width, height, 4, 0.45, 0.3); + if (mode == 1) { + benchmark(detector, width, height); + } else { + if (argc != 5) { + std::cout << "Must set image file, such as ./picodet-mnn 0 " + "../picodet_s_320_lcnet.mnn 320 320 img.jpg" + << std::endl; + } + const char *images = argv[5]; + image_demo(detector, images); + } +} diff --git a/deploy/third_engine/demo_mnn/picodet_mnn.cpp b/deploy/third_engine/demo_mnn/picodet_mnn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a315f14a9e29f0958a2707a4e09fcdb78bd12b6c --- /dev/null +++ b/deploy/third_engine/demo_mnn/picodet_mnn.cpp @@ -0,0 +1,253 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn + +#include "picodet_mnn.hpp" + +using namespace std; + +PicoDet::PicoDet(const std::string &mnn_path, int input_width, int input_length, + int num_thread_, float score_threshold_, + float nms_threshold_) { + num_thread = num_thread_; + in_w = input_width; + in_h = input_length; + score_threshold = score_threshold_; + nms_threshold = nms_threshold_; + + PicoDet_interpreter = std::shared_ptr( + MNN::Interpreter::createFromFile(mnn_path.c_str())); + MNN::ScheduleConfig config; + config.numThread = num_thread; + MNN::BackendConfig backendConfig; + backendConfig.precision = (MNN::BackendConfig::PrecisionMode)2; + config.backendConfig = &backendConfig; + + PicoDet_session = PicoDet_interpreter->createSession(config); + + input_tensor = PicoDet_interpreter->getSessionInput(PicoDet_session, nullptr); +} + +PicoDet::~PicoDet() { + PicoDet_interpreter->releaseModel(); + PicoDet_interpreter->releaseSession(PicoDet_session); +} + +int PicoDet::detect(cv::Mat &raw_image, std::vector &result_list, + bool has_postprocess) { + if (raw_image.empty()) { + std::cout << "image is empty ,please check!" << std::endl; + return -1; + } + + image_h = raw_image.rows; + image_w = raw_image.cols; + cv::Mat image; + cv::resize(raw_image, image, cv::Size(in_w, in_h)); + + PicoDet_interpreter->resizeTensor(input_tensor, {1, 3, in_h, in_w}); + PicoDet_interpreter->resizeSession(PicoDet_session); + std::shared_ptr pretreat(MNN::CV::ImageProcess::create( + MNN::CV::BGR, MNN::CV::BGR, mean_vals, 3, norm_vals, 3)); + pretreat->convert(image.data, in_w, in_h, image.step[0], input_tensor); + + auto start = chrono::steady_clock::now(); + + // run network + PicoDet_interpreter->runSession(PicoDet_session); + + // get output data + std::vector> results; + results.resize(num_class); + + if (has_postprocess) { + auto bbox_out_tensor = PicoDet_interpreter->getSessionOutput( + PicoDet_session, nms_heads_info[0].c_str()); + auto class_out_tensor = PicoDet_interpreter->getSessionOutput( + PicoDet_session, nms_heads_info[1].c_str()); + // bbox branch + auto tensor_bbox_host = + new MNN::Tensor(bbox_out_tensor, MNN::Tensor::CAFFE); + bbox_out_tensor->copyToHostTensor(tensor_bbox_host); + auto bbox_output_shape = tensor_bbox_host->shape(); + int output_size = 1; + for (int j = 0; j < bbox_output_shape.size(); ++j) { + output_size *= bbox_output_shape[j]; + } + std::cout << "output_size:" << output_size << std::endl; + bbox_output_data_.resize(output_size); + std::copy_n(tensor_bbox_host->host(), output_size, + bbox_output_data_.data()); + delete tensor_bbox_host; + // class branch + auto tensor_class_host = + new MNN::Tensor(class_out_tensor, MNN::Tensor::CAFFE); + class_out_tensor->copyToHostTensor(tensor_class_host); + auto class_output_shape = tensor_class_host->shape(); + output_size = 1; + for (int j = 0; j < class_output_shape.size(); ++j) { + output_size *= class_output_shape[j]; + 
} + std::cout << "output_size:" << output_size << std::endl; + class_output_data_.resize(output_size); + std::copy_n(tensor_class_host->host(), output_size, + class_output_data_.data()); + delete tensor_class_host; + } else { + for (const auto &head_info : non_postprocess_heads_info) { + MNN::Tensor *tensor_scores = PicoDet_interpreter->getSessionOutput( + PicoDet_session, head_info.cls_layer.c_str()); + MNN::Tensor *tensor_boxes = PicoDet_interpreter->getSessionOutput( + PicoDet_session, head_info.dis_layer.c_str()); + + MNN::Tensor tensor_scores_host(tensor_scores, + tensor_scores->getDimensionType()); + tensor_scores->copyToHostTensor(&tensor_scores_host); + + MNN::Tensor tensor_boxes_host(tensor_boxes, + tensor_boxes->getDimensionType()); + tensor_boxes->copyToHostTensor(&tensor_boxes_host); + + decode_infer(&tensor_scores_host, &tensor_boxes_host, head_info.stride, + score_threshold, results); + } + } + + auto end = chrono::steady_clock::now(); + chrono::duration elapsed = end - start; + cout << "inference time:" << elapsed.count() << " s, "; + + for (int i = 0; i < (int)results.size(); i++) { + nms(results[i], nms_threshold); + + for (auto box : results[i]) { + box.x1 = box.x1 / in_w * image_w; + box.x2 = box.x2 / in_w * image_w; + box.y1 = box.y1 / in_h * image_h; + box.y2 = box.y2 / in_h * image_h; + result_list.push_back(box); + } + } + cout << "detect " << result_list.size() << " objects" << endl; + + return 0; +} + +void PicoDet::decode_infer(MNN::Tensor *cls_pred, MNN::Tensor *dis_pred, + int stride, float threshold, + std::vector> &results) { + int feature_h = ceil((float)in_h / stride); + int feature_w = ceil((float)in_w / stride); + + for (int idx = 0; idx < feature_h * feature_w; idx++) { + const float *scores = cls_pred->host() + (idx * num_class); + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + for (int label = 0; label < num_class; label++) { + if (scores[label] > score) { + score = scores[label]; + cur_label = label; + } + } + if (score > threshold) { + const float *bbox_pred = + dis_pred->host() + (idx * 4 * (reg_max + 1)); + results[cur_label].push_back( + disPred2Bbox(bbox_pred, cur_label, score, col, row, stride)); + } + } +} + +BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score, + int x, int y, int stride) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[reg_max + 1]; + activation_function_softmax(dfl_det + i * (reg_max + 1), dis_after_sm, + reg_max + 1); + for (int j = 0; j < reg_max + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)in_w); + float ymax = (std::min)(ct_y + dis_pred[3], (float)in_h); + return BoxInfo{xmin, ymin, xmax, ymax, score, label}; +} + +void PicoDet::nms(std::vector &input_boxes, float NMS_THRESH) { + std::sort(input_boxes.begin(), input_boxes.end(), + [](BoxInfo a, BoxInfo b) { return a.score > b.score; }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * + (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < 
int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= NMS_THRESH) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} + +inline float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); } + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} diff --git a/deploy/third_engine/demo_mnn/picodet_mnn.hpp b/deploy/third_engine/demo_mnn/picodet_mnn.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4744040e258498afd70ee587ffc0ae0b39d24faa --- /dev/null +++ b/deploy/third_engine/demo_mnn/picodet_mnn.hpp @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
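+//
+// Declares the PicoDet MNN wrapper: construct it with the .mnn model path,
+// input width/height, thread count and score/NMS thresholds, then call
+// detect() with a cv::Mat to obtain BoxInfo results. Set has_postprocess to
+// true only for models exported with the NMS post-processing included.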
+ +#ifndef __PicoDet_H__ +#define __PicoDet_H__ + +#pragma once + +#include "Interpreter.hpp" + +#include "ImageProcess.hpp" +#include "MNNDefine.h" +#include "Tensor.hpp" +#include +#include +#include +#include +#include +#include +#include + +typedef struct NonPostProcessHeadInfo_ { + std::string cls_layer; + std::string dis_layer; + int stride; +} NonPostProcessHeadInfo; + +typedef struct BoxInfo_ { + float x1; + float y1; + float x2; + float y2; + float score; + int label; +} BoxInfo; + +class PicoDet { +public: + PicoDet(const std::string &mnn_path, int input_width, int input_length, + int num_thread_ = 4, float score_threshold_ = 0.5, + float nms_threshold_ = 0.3); + + ~PicoDet(); + + int detect(cv::Mat &img, std::vector &result_list, + bool has_postprocess); + +private: + void decode_infer(MNN::Tensor *cls_pred, MNN::Tensor *dis_pred, int stride, + float threshold, + std::vector> &results); + BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x, + int y, int stride); + void nms(std::vector &input_boxes, float NMS_THRESH); + +private: + std::shared_ptr PicoDet_interpreter; + MNN::Session *PicoDet_session = nullptr; + MNN::Tensor *input_tensor = nullptr; + + int num_thread; + int image_w; + int image_h; + + int in_w = 320; + int in_h = 320; + + float score_threshold; + float nms_threshold; + + const float mean_vals[3] = {103.53f, 116.28f, 123.675f}; + const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f}; + + const int num_class = 80; + const int reg_max = 7; + + std::vector bbox_output_data_; + std::vector class_output_data_; + + std::vector nms_heads_info{"tmp_16", "concat_4.tmp_0"}; + // If not export post-process, will use non_postprocess_heads_info + std::vector non_postprocess_heads_info{ + // cls_pred|dis_pred|stride + {"transpose_0.tmp_0", "transpose_1.tmp_0", 8}, + {"transpose_2.tmp_0", "transpose_3.tmp_0", 16}, + {"transpose_4.tmp_0", "transpose_5.tmp_0", 32}, + {"transpose_6.tmp_0", "transpose_7.tmp_0", 64}, + }; +}; + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length); + +inline float fast_exp(float x); +inline float sigmoid(float x); + +#endif diff --git a/deploy/third_engine/demo_mnn_kpts/CMakeLists.txt b/deploy/third_engine/demo_mnn_kpts/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..84bf51a93e17295669e3509a12a00a3cf2fea19c --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.9) + +project(tinypose-mnn) + +set(CMAKE_CXX_STANDARD 17) +set(MNN_DIR {YOUR_MNN_DIR}) + +find_package(OpenCV REQUIRED) + +include_directories( + ${MNN_DIR}/include + ${MNN_DIR}/include/MNN + ${CMAKE_SOURCE_DIR} +) +link_directories(mnn/lib) + +add_library(libMNN SHARED IMPORTED) +set_target_properties( + libMNN + PROPERTIES IMPORTED_LOCATION + ${CMAKE_SOURCE_DIR}/mnn/lib/libMNN.so +) +add_executable(tinypose-mnn main.cpp picodet_mnn.cpp keypoint_detector.cpp keypoint_postprocess.cpp) + +target_link_libraries(tinypose-mnn MNN ${OpenCV_LIBS} libMNN.so) + diff --git a/deploy/third_engine/demo_mnn_kpts/CMakeLists_armv8.txt b/deploy/third_engine/demo_mnn_kpts/CMakeLists_armv8.txt new file mode 100644 index 0000000000000000000000000000000000000000..027f0dd970f0c9d07f20ab026b3d90d9f1af3ddc --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/CMakeLists_armv8.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.9) + +project(tinypose-mnn) + +set(CMAKE_CXX_STANDARD 17) +set(MNN_DIR {YOUR_MNN_DIR}) +set(NDK_ROOT {YOUR_ANDROID_NDK_PATH}) 
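+# {YOUR_MNN_DIR} and {YOUR_ANDROID_NDK_PATH} above are placeholders; point them
+# at your local MNN build directory and Android NDK root before running cmake.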
+set(LDFLAGS -latomic -pthread -ldl -llog -lz -static-libstdc++) + +set(OpenCV_DIR ${CMAKE_SOURCE_DIR}/third/opencv4.1.0/arm64-v8a) + +set(OpenCV_DEPS ${OpenCV_DIR}/libs/libopencv_imgcodecs.a + ${OpenCV_DIR}/libs/libopencv_imgproc.a + ${OpenCV_DIR}/libs/libopencv_core.a + ${OpenCV_DIR}/3rdparty/libs/libtegra_hal.a + ${OpenCV_DIR}/3rdparty/libs/liblibjpeg-turbo.a + ${OpenCV_DIR}/3rdparty/libs/liblibwebp.a + ${OpenCV_DIR}/3rdparty/libs/liblibpng.a + ${OpenCV_DIR}/3rdparty/libs/liblibjasper.a + ${OpenCV_DIR}/3rdparty/libs/liblibtiff.a + ${OpenCV_DIR}/3rdparty/libs/libIlmImf.a + ${OpenCV_DIR}/3rdparty/libs/libtbb.a + ${OpenCV_DIR}/3rdparty/libs/libcpufeatures.a) + +set(FLAGS "-pie -Wl,--gc-sections -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=21 -fexceptions -frtti -std=c++11 -O3 -DNDEBUG -fPIE -fopenmp") +set(CMAKE_CXX_FLAGS "--sysroot=${NDK_ROOT}/sysroot ${FLAGS}") + +set(STDCXX ${NDK_ROOT}/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a + ${NDK_ROOT}/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a + ${NDK_ROOT}/platforms/android-21/arch-arm64/usr/lib/libstdc++.a) +set(SYS_INCS ${NDK_ROOT}/sysroot/usr/include/aarch64-linux-android/ ${NDK_ROOT}/sources/cxx-stl/llvm-libc++/include/ ${NDK_ROOT}/sources/cxx-stl/llvm-libc++abi/include/ ${NDK_ROOT}/sources/android/support/include/ ${NDK_ROOT}/sysroot/usr/include/) + +include_directories( + ${SYS_INCS} + ${OpenCV_DIR}/include + ${MNN_DIR}/include + ${MNN_DIR}/include/MNN + ${CMAKE_SOURCE_DIR} +) + +link_directories(${NDK_ROOT}/platforms/android-21/arch-arm64) +link_directories(${MNN_DIR}/project/android/build_64) + +add_executable(tinypose-mnn picodet_mnn.cpp keypoint_postprocess.cpp keypoint_detector.cpp main.cpp) + +target_link_libraries(tinypose-mnn -lMNN ${OpenCV_DEPS} ${STDCXX} ${LDFLAGS}) + diff --git a/deploy/third_engine/demo_mnn_kpts/README.md b/deploy/third_engine/demo_mnn_kpts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6445e603ad87f87f1a435a4d9822716f90a8ebb9 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/README.md @@ -0,0 +1,116 @@ +# TinyPose MNN Demo + +This fold provides PicoDet+TinyPose inference code using +[Alibaba's MNN framework](https://github.com/alibaba/MNN). Most of the implements in +this fold are same as *demo_ncnn*. + +## Install MNN + +### Python library + +Just run: + +``` shell +pip install MNN +``` + +### C++ library + +Please follow the [official document](https://www.yuque.com/mnn/en/build_linux) to build MNN engine. + +- Create picodet_m_416_coco.onnx and tinypose256.onnx + example: + ```shell + modelName=picodet_m_416_coco + # export model + python tools/export_model.py \ + -c configs/picodet/${modelName}.yml \ + -o weights=${modelName}.pdparams \ + --output_dir=inference_model + # convert to onnx + paddle2onnx --model_dir inference_model/${modelName} \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 11 \ + --save_file ${modelName}.onnx + # onnxsim + python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx + ``` + +- Convert model + example: + ``` shell + python -m MNN.tools.mnnconvert -f ONNX --modelFile picodet-416.onnx --MNNModel picodet-416.mnn + ``` +Here are converted model +[picodet_m_416](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416.mnn). 
+[tinypose256](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose256.mnn)
+
+## Build
+
+For C++ code, replace `libMNN.so` under *./mnn/lib* with the one you just compiled, modify the OpenCV path and MNN path in the CMake file,
+and run:
+
+``` shell
+mkdir build && cd build
+cmake ..
+make
+```
+
+Note that a flag in `main.cpp` is used to control whether to show the detection result or save it into a folder.
+
+``` c++
+#define __SAVE_RESULT__ // if defined save drawed results to ../results, else show it in windows
+```
+
+#### ARM Build
+
+Prepare the OpenCV library [OpenCV_4_1](https://paddle-inference-dist.bj.bcebos.com/opencv4.1.0.tar.gz).
+
+``` shell
+mkdir third && cd third
+wget https://paddle-inference-dist.bj.bcebos.com/opencv4.1.0.tar.gz
+tar -zxvf opencv4.1.0.tar.gz
+cd ..
+
+mkdir build && cd build
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DANDROID_TOOLCHAIN=gcc ..
+make
+```
+
+## Run
+
+To detect images in a folder, run:
+``` shell
+./tinypose-mnn [mode] [image_file]
+```
+| param | detail |
+| ---- | ---- |
+| --mode | input mode: 0: camera; 1: image; 2: video; 3: benchmark |
+| --image_file | input image path |
+
+For example:
+
+``` shell
+./tinypose-mnn "1" "../imgs/test.jpg"
+```
+
+For the speed benchmark:
+
+``` shell
+./tinypose-mnn "3" "0"
+```
+
+## Benchmark
+Platform: Kirin 980
+Model: [tinypose256](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose256.mnn)
+
+| param | Min(s) | Max(s) | Avg(s) |
+| -------- | ------ | ------ | ------ |
+| Thread=4 | 0.018 | 0.021 | 0.019 |
+| Thread=1 | 0.031 | 0.041 | 0.032 |
+
+## Reference
+[MNN](https://github.com/alibaba/MNN)
diff --git a/deploy/third_engine/demo_mnn_kpts/keypoint_detector.cpp b/deploy/third_engine/demo_mnn_kpts/keypoint_detector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05fad66d853d721cbdfc5a9c839cdba9a75c7813
--- /dev/null
+++ b/deploy/third_engine/demo_mnn_kpts/keypoint_detector.cpp
@@ -0,0 +1,179 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
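+//
+// KeyPointDetector (TinyPose) inference on the MNN runtime: Predict() resizes
+// the inputs, runs the MNN session and reads the heatmap and argmax outputs
+// (the output tensor names are hard-coded for the exported TinyPose model),
+// and Postprocess() maps them back to keypoints in the original image.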
+#include +// for setprecision +#include +#include +#include "keypoint_detector.h" + +namespace PaddleDetection { + +// Visualiztion MaskDetector results +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold) { + const int edge[][2] = {{0, 1}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 5}, + {4, 6}, + {5, 7}, + {6, 8}, + {7, 9}, + {8, 10}, + {5, 11}, + {6, 12}, + {11, 13}, + {12, 14}, + {13, 15}, + {14, 16}, + {11, 12}}; + cv::Mat vis_img = img.clone(); + for (int batchid = 0; batchid < results.size(); batchid++) { + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[i * 3] > threshold) { + int x_coord = int(results[batchid].keypoints[i * 3 + 1]); + int y_coord = int(results[batchid].keypoints[i * 3 + 2]); + cv::circle(vis_img, + cv::Point2d(x_coord, y_coord), + 1, + cv::Scalar(0, 0, 255), + 2); + } + } + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[edge[i][0] * 3] > threshold && + results[batchid].keypoints[edge[i][1] * 3] > threshold) { + int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]); + int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]); + int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]); + int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]); + cv::line(vis_img, + cv::Point2d(x_start, y_start), + cv::Point2d(x_end, y_end), + colormap[i], + 1); + } + } + } + return vis_img; +} + +void KeyPointDetector::Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center_bs, + std::vector>& scale_bs) { + std::vector preds(output_shape[1] * 3, 0); + for (int batchid = 0; batchid < output_shape[0]; batchid++) { + get_final_preds(output, + output_shape, + idxout, + idx_shape, + center_bs[batchid], + scale_bs[batchid], + preds, + batchid, + this->use_dark()); + KeyPointResult result_item; + result_item.num_joints = output_shape[1]; + result_item.keypoints.clear(); + for (int i = 0; i < output_shape[1]; i++) { + result_item.keypoints.emplace_back(preds[i * 3]); + result_item.keypoints.emplace_back(preds[i * 3 + 1]); + result_item.keypoints.emplace_back(preds[i * 3 + 2]); + } + result->push_back(result_item); + } +} + +void KeyPointDetector::Predict(const std::vector imgs, + std::vector>& center_bs, + std::vector>& scale_bs, + std::vector* result) { + int batch_size = imgs.size(); + KeyPointDet_interpreter->resizeTensor(input_tensor, + {batch_size, 3, in_h, in_w}); + KeyPointDet_interpreter->resizeSession(KeyPointDet_session); + auto insize = 3 * in_h * in_w; + + // Preprocess image + cv::Mat resized_im; + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + + cv::resize(im, resized_im, cv::Size(in_w, in_h)); + std::shared_ptr pretreat( + MNN::CV::ImageProcess::create( + MNN::CV::BGR, MNN::CV::RGB, mean_vals, 3, norm_vals, 3)); + pretreat->convert( + resized_im.data, in_w, in_h, resized_im.step[0], input_tensor); + } + + // Run predictor + auto inference_start = std::chrono::steady_clock::now(); + + KeyPointDet_interpreter->runSession(KeyPointDet_session); + // Get output tensor + auto out_tensor = KeyPointDet_interpreter->getSessionOutput( + KeyPointDet_session, "conv2d_441.tmp_1"); + auto nchwoutTensor = new Tensor(out_tensor, Tensor::CAFFE); + out_tensor->copyToHostTensor(nchwoutTensor); + + auto output_shape = nchwoutTensor->shape(); + // Calculate output length + 
int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + output_data_.resize(output_size); + std::copy_n(nchwoutTensor->host(), output_size, output_data_.data()); + delete nchwoutTensor; + + auto idx_tensor = KeyPointDet_interpreter->getSessionOutput( + KeyPointDet_session, "argmax_0.tmp_0"); + + auto idxhostTensor = new Tensor(idx_tensor, Tensor::CAFFE); + idx_tensor->copyToHostTensor(idxhostTensor); + + auto idx_shape = idxhostTensor->shape(); + // Calculate output length + output_size = 1; + for (int j = 0; j < idx_shape.size(); ++j) { + output_size *= idx_shape[j]; + } + + idx_data_.resize(output_size); + std::copy_n(idxhostTensor->host(), output_size, idx_data_.data()); + delete idxhostTensor; + + auto inference_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = inference_end - inference_start; + printf("keypoint inference time: %f s\n", elapsed.count()); + + // Postprocessing result + Postprocess(output_data_, + output_shape, + idx_data_, + idx_shape, + result, + center_bs, + scale_bs); +} + +} // namespace PaddleDetection diff --git a/deploy/third_engine/demo_mnn_kpts/keypoint_detector.h b/deploy/third_engine/demo_mnn_kpts/keypoint_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..1c7af8921a0e3a0f649fd36a0f1d08a763a21256 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/keypoint_detector.h @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "Interpreter.hpp" + +#include "ImageProcess.hpp" +#include "MNNDefine.h" +#include "Tensor.hpp" + +#include "keypoint_postprocess.h" + +using namespace MNN; + +namespace PaddleDetection { +// Object KeyPoint Result +struct KeyPointResult { + // Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf + std::vector keypoints; + int num_joints = -1; +}; + +// Visualiztion KeyPoint Result +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold = 0.2); + +class KeyPointDetector { + public: + explicit KeyPointDetector(const std::string& model_path, + int num_thread = 4, + int input_height = 256, + int input_width = 192, + float score_threshold = 0.3, + const int batch_size = 1, + bool use_dark = true) { + printf("config path: %s", + model_path.substr(0, model_path.find_last_of('/') + 1).c_str()); + use_dark_ = use_dark; + + in_w = input_width; + in_h = input_height; + threshold_ = score_threshold; + + KeyPointDet_interpreter = std::shared_ptr( + MNN::Interpreter::createFromFile(model_path.c_str())); + MNN::ScheduleConfig config; + config.type = MNN_FORWARD_CPU; + /*modeNum means gpuMode for GPU usage, Or means numThread for CPU usage.*/ + config.numThread = num_thread; + // If type not fount, let it failed + config.backupType = MNN_FORWARD_CPU; + BackendConfig backendConfig; + backendConfig.precision = static_cast(1); + config.backendConfig = &backendConfig; + + KeyPointDet_session = KeyPointDet_interpreter->createSession(config); + + input_tensor = + KeyPointDet_interpreter->getSessionInput(KeyPointDet_session, nullptr); + } + + ~KeyPointDetector() { + KeyPointDet_interpreter->releaseModel(); + KeyPointDet_interpreter->releaseSession(KeyPointDet_session); + } + + // Load Paddle inference model + void LoadModel(std::string model_file, int num_theads); + + // Run predictor + void Predict(const std::vector imgs, + std::vector>& center, + std::vector>& scale, + std::vector* result = nullptr); + + bool use_dark() { return this->use_dark_; } + + inline float get_threshold() { return threshold_; }; + + // const float mean_vals[3] = { 103.53f, 116.28f, 123.675f }; + // const float norm_vals[3] = { 0.017429f, 0.017507f, 0.017125f }; + const float mean_vals[3] = {0.f, 0.f, 0.f}; + const float norm_vals[3] = {1.f, 1.f, 1.f}; + int in_w = 128; + int in_h = 256; + + private: + // Postprocess result + void Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center, + std::vector>& scale); + + std::vector output_data_; + std::vector idx_data_; + float threshold_; + bool use_dark_; + + std::shared_ptr KeyPointDet_interpreter; + MNN::Session* KeyPointDet_session = nullptr; + MNN::Tensor* input_tensor = nullptr; +}; + +} // namespace PaddleDetection diff --git a/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.cpp b/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe6e8298d01be7e76ec4792a6b33f5c8d96ba518 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.cpp @@ -0,0 +1,258 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "keypoint_postprocess.h" +#define PI 3.1415926535 +#define HALF_CIRCLE_DEGREE 180 + +cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) { + cv::Point2f direct{a.x - b.x, a.y - b.y}; + return cv::Point2f(a.x - direct.y, a.y + direct.x); +} + +std::vector get_dir(float src_point_x, + float src_point_y, + float rot_rad) { + float sn = sin(rot_rad); + float cs = cos(rot_rad); + std::vector src_result{0.0, 0.0}; + src_result[0] = src_point_x * cs - src_point_y * sn; + src_result[1] = src_point_x * sn + src_point_y * cs; + return src_result; +} + +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& preds, int p) { + double new1[3] = {pt_x, pt_y, 1.0}; + cv::Mat new_pt(3, 1, trans.type(), new1); + cv::Mat w = trans * new_pt; + preds[p * 3 + 1] = static_cast(w.at(0, 0)); + preds[p * 3 + 2] = static_cast(w.at(1, 0)); +} + +void get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + cv::Mat& trans, + int inv) { + float src_w = scale[0]; + float dst_w = static_cast(output_size[0]); + float dst_h = static_cast(output_size[1]); + float rot_rad = rot * PI / HALF_CIRCLE_DEGREE; + std::vector src_dir = get_dir(-0.5 * src_w, 0, rot_rad); + std::vector dst_dir{static_cast(-0.5) * dst_w, 0.0}; + cv::Point2f srcPoint2f[3], dstPoint2f[3]; + srcPoint2f[0] = cv::Point2f(center[0], center[1]); + srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]); + srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]); + + dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5); + dstPoint2f[1] = + cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]); + dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]); + if (inv == 0) { + trans = cv::getAffineTransform(srcPoint2f, dstPoint2f); + } else { + trans = cv::getAffineTransform(dstPoint2f, srcPoint2f); + } +} + +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords) { + cv::Mat trans(2, 3, CV_64FC1); + get_affine_transform(center, scale, 0, output_size, trans, 1); + for (int p = 0; p < dim[1]; ++p) { + affine_tranform(coords[p * 2], coords[p * 2 + 1], trans, target_coords, p); + } +} + +// only for batchsize == 1 +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx) { + int num_joints = dim[1]; + int width = dim[3]; + std::vector idx; + idx.resize(num_joints * 2); + + for (int j = 0; j < dim[1]; j++) { + float* index = &( + heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]); + float* end = index + dim[2] * dim[3]; + float* max_dis = std::max_element(index, end); + auto max_id = std::distance(index, max_dis); + maxvals[j] = *max_dis; + if (*max_dis > 0) { + preds[j * 2] = static_cast(max_id % width); + preds[j * 2 + 1] = static_cast(max_id / width); + } + } +} + +void dark_parse(std::vector& heatmap, + std::vector& dim, + std::vector& coords, + int px, + int py, + int index, + int ch) { + /*DARK postpocessing, Zhang 
et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + 1) offset = - hassian.inv() * derivative + 2) dx = (heatmap[x+1] - heatmap[x-1])/2. + 3) dxx = (dx[x+1] - dx[x-1])/2. + 4) derivative = Mat([dx, dy]) + 5) hassian = Mat([[dxx, dxy], [dxy, dyy]]) + */ + std::vector::const_iterator first1 = heatmap.begin() + index; + std::vector::const_iterator last1 = + heatmap.begin() + index + dim[2] * dim[3]; + std::vector heatmap_ch(first1, last1); + cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]); + heatmap_mat.convertTo(heatmap_mat, CV_32FC1); + cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0); + heatmap_mat = heatmap_mat.reshape(1, 1); + heatmap_ch = std::vector(heatmap_mat.reshape(1, 1)); + + float epsilon = 1e-10; + // sample heatmap to get values in around target location + float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon)); + float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon)); + float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon)); + + float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon)); + float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon)); + float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon)); + float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon)); + float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon)); + float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon)); + float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon)); + float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon)); + float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon)); + float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon)); + + // compute dx/dy and dxx/dyy with sampled values + float dx = 0.5 * (xr - xl); + float dy = 0.5 * (yu - yd); + float dxx = 0.25 * (xr2 - 2 * xy + xl2); + float dxy = 0.25 * (xryu - xryd - xlyu + xlyd); + float dyy = 0.25 * (yu2 - 2 * xy + yd2); + + // finally get offset by derivative and hassian, which combined by dx/dy and + // dxx/dyy + if (dxx * dyy - dxy * dxy != 0) { + float M[2][2] = {dxx, dxy, dxy, dyy}; + float D[2] = {dx, dy}; + cv::Mat hassian(2, 2, CV_32F, M); + cv::Mat derivative(2, 1, CV_32F, D); + cv::Mat offset = -hassian.inv() * derivative; + coords[ch * 2] += offset.at(0, 0); + coords[ch * 2 + 1] += offset.at(1, 0); + } +} + +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK) { + std::vector coords; + coords.resize(dim[1] * 2); + int heatmap_height = dim[2]; + int heatmap_width = dim[3]; + + for (int j = 0; j < dim[1]; ++j) { + int index = (batchid * dim[1] + j) * dim[2] * dim[3]; + + int idx = idxout[batchid * dim[1] + j]; + preds[j * 3] = heatmap[index + idx]; + coords[j * 2] = idx % heatmap_width; + coords[j * 2 + 1] = idx / heatmap_width; + + int px = int(coords[j * 2] + 0.5); + int py = int(coords[j * 2 + 1] + 0.5); + + if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 && + py < heatmap_height - 2) { + dark_parse(heatmap, dim, coords, px, py, index, j); + } else { + if (px > 0 && px < heatmap_width - 1) { + float diff_x = heatmap[index + py * dim[3] + px + 1] - + heatmap[index + py * dim[3] + px - 1]; + coords[j * 2] += diff_x > 0 ? 
1 : -1 * 0.25; + } + if (py > 0 && py < heatmap_height - 1) { + float diff_y = heatmap[index + (py + 1) * dim[3] + px] - + heatmap[index + (py - 1) * dim[3] + px]; + coords[j * 2 + 1] += diff_y > 0 ? 1 : -1 * 0.25; + } + } + } + + std::vector img_size{heatmap_width, heatmap_height}; + transform_preds(coords, center, scale, img_size, dim, preds); +} + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio) { + int crop_x1 = std::max(0, area[0]); + int crop_y1 = std::max(0, area[1]); + int crop_x2 = std::min(img.cols - 1, area[2]); + int crop_y2 = std::min(img.rows - 1, area[3]); + + int center_x = (crop_x1 + crop_x2) / 2.; + int center_y = (crop_y1 + crop_y2) / 2.; + int half_h = (crop_y2 - crop_y1) / 2.; + int half_w = (crop_x2 - crop_x1) / 2.; + + if (half_h * 3 > half_w * 4) { + half_w = static_cast(half_h * 0.75); + } else { + half_h = static_cast(half_w * 4 / 3); + } + + crop_x1 = + std::max(0, center_x - static_cast(half_w * (1 + expandratio))); + crop_y1 = + std::max(0, center_y - static_cast(half_h * (1 + expandratio))); + crop_x2 = std::min(img.cols - 1, + static_cast(center_x + half_w * (1 + expandratio))); + crop_y2 = std::min(img.rows - 1, + static_cast(center_y + half_h * (1 + expandratio))); + crop_img = + img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); + + center.clear(); + center.emplace_back((crop_x1 + crop_x2) / 2); + center.emplace_back((crop_y1 + crop_y2) / 2); + scale.clear(); + scale.emplace_back((crop_x2 - crop_x1)); + scale.emplace_back((crop_y2 - crop_y1)); +} diff --git a/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.h b/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..928a57bb0be897e4e368bb676f0de3268ffd4c7e --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/keypoint_postprocess.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
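+//
+// Helpers for decoding TinyPose heatmaps: get_final_preds() takes the argmax
+// location per joint, optionally refines it with the DARK distribution-aware
+// decoding, and transform_preds() maps the result back to the original image
+// through the center/scale affine transform.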
+ +#pragma once + +#include +#include +#include +#include + +std::vector get_3rd_point(std::vector& a, std::vector& b); +std::vector get_dir(float src_point_x, float src_point_y, float rot_rad); +void affine_tranform(float pt_x, + float pt_y, + cv::Mat& trans, + std::vector& x, + int p, + int num); +cv::Mat get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + int inv); +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords); +void box_to_center_scale(std::vector& box, + int width, + int height, + std::vector& center, + std::vector& scale); +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx); +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK = true); + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio = 0.25); diff --git a/deploy/third_engine/demo_mnn_kpts/main.cpp b/deploy/third_engine/demo_mnn_kpts/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f03e983c6271b6804cde2829604d4f3be369fdd4 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/main.cpp @@ -0,0 +1,424 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
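+// Usage (see README.md): ./tinypose-mnn [mode] [image_file], where mode is
+// 0: camera, 1: image, 2: video, 3: benchmark.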
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn + +#include +#include +#include +#include +#include "keypoint_detector.h" +#include "picodet_mnn.h" + +#define __SAVE_RESULT__ // if defined save drawed results to ../results, else + // show it in windows + +using namespace PaddleDetection; + +struct object_rect { + int x; + int y; + int width; + int height; +}; + +int resize_uniform(cv::Mat& src, + cv::Mat& dst, + cv::Size dst_size, + object_rect& effect_area) { + int w = src.cols; + int h = src.rows; + int dst_w = dst_size.width; + int dst_h = dst_size.height; + dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0)); + + float ratio_src = w * 1.0 / h; + float ratio_dst = dst_w * 1.0 / dst_h; + + int tmp_w = 0; + int tmp_h = 0; + if (ratio_src > ratio_dst) { + tmp_w = dst_w; + tmp_h = floor((dst_w * 1.0 / w) * h); + } else if (ratio_src < ratio_dst) { + tmp_h = dst_h; + tmp_w = floor((dst_h * 1.0 / h) * w); + } else { + cv::resize(src, dst, dst_size); + effect_area.x = 0; + effect_area.y = 0; + effect_area.width = dst_w; + effect_area.height = dst_h; + return 0; + } + cv::Mat tmp; + cv::resize(src, tmp, cv::Size(tmp_w, tmp_h)); + + if (tmp_w != dst_w) { + int index_w = floor((dst_w - tmp_w) / 2.0); + for (int i = 0; i < dst_h; i++) { + memcpy(dst.data + i * dst_w * 3 + index_w * 3, + tmp.data + i * tmp_w * 3, + tmp_w * 3); + } + effect_area.x = index_w; + effect_area.y = 0; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else if (tmp_h != dst_h) { + int index_h = floor((dst_h - tmp_h) / 2.0); + memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3); + effect_area.x = 0; + effect_area.y = index_h; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else { + printf("error\n"); + } + return 0; +} + +const int color_list[80][3] = { + {216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47}, + {76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153}, + {255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0}, + {0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0}, + {84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0}, + {255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127}, + {0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127}, + {84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127}, + {170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127}, + {255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255}, + {0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255}, + {84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255}, + {170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255}, + {42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0}, + {212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0}, + {0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, + {0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170}, + {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36}, + {72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182}, + {218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0}, +}; + +void draw_bboxes(const cv::Mat& bgr, + const std::vector& bboxes, + object_rect effect_roi, + std::string save_path = "None") { + static const char* class_names[] = { + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", 
"snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = bgr.clone(); + int src_w = image.cols; + int src_h = image.rows; + int dst_w = effect_roi.width; + int dst_h = effect_roi.height; + float width_ratio = (float)src_w / (float)dst_w; + float height_ratio = (float)src_h / (float)dst_h; + + for (size_t i = 0; i < bboxes.size(); i++) { + const BoxInfo& bbox = bboxes[i]; + cv::Scalar color = cv::Scalar(color_list[bbox.label][0], + color_list[bbox.label][1], + color_list[bbox.label][2]); + cv::rectangle(image, + cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, + (bbox.y1 - effect_roi.y) * height_ratio), + cv::Point((bbox.x2 - effect_roi.x) * width_ratio, + (bbox.y2 - effect_roi.y) * height_ratio)), + color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100); + + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); + + int x = (bbox.x1 - effect_roi.x) * width_ratio; + int y = + (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine; + if (y < 0) y = 0; + if (x + label_size.width > image.cols) x = image.cols - label_size.width; + + cv::rectangle( + image, + cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, label_size.height + baseLine)), + color, + -1); + + cv::putText(image, + text, + cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, + 0.4, + cv::Scalar(255, 255, 255)); + } + + if (save_path == "None") { + cv::imshow("image", image); + } else { + cv::imwrite(save_path, image); + std::cout << save_path << std::endl; + } +} + +std::vector coordsback(const cv::Mat image, + const object_rect effect_roi, + const std::vector& bboxes) { + int src_w = image.cols; + int src_h = image.rows; + int dst_w = effect_roi.width; + int dst_h = effect_roi.height; + float width_ratio = (float)src_w / (float)dst_w; + float height_ratio = (float)src_h / (float)dst_h; + + std::vector bboxes_oimg; + + for (int i = 0; i < bboxes.size(); i++) { + auto bbox = bboxes[i]; + bbox.x1 = (bbox.x1 - effect_roi.x) * width_ratio; + bbox.y1 = (bbox.y1 - effect_roi.y) * height_ratio; + bbox.x2 = (bbox.x2 - effect_roi.x) * width_ratio; + bbox.y2 = (bbox.y2 - effect_roi.y) * height_ratio; + bboxes_oimg.emplace_back(bbox); + } + return bboxes_oimg; +} + +void image_infer_kpts(KeyPointDetector* kpts_detector, + cv::Mat image, + const object_rect effect_roi, + const std::vector& results, + std::string img_name = "kpts_vis", + bool save_img = true) { + std::vector cropimgs; + std::vector> center_bs; + std::vector> scale_bs; + std::vector kpts_results; + auto results_oimg = coordsback(image, effect_roi, results); + + for (int i = 0; i < results_oimg.size(); i++) { + auto rect = results_oimg[i]; + if (rect.label == 0) { + cv::Mat cropimg; + std::vector center, scale; + std::vector area = {static_cast(rect.x1), + static_cast(rect.y1), + static_cast(rect.x2), + static_cast(rect.y2)}; + CropImg(image, cropimg, area, center, scale); + // 
cv::imwrite("./test_crop_"+std::to_string(i)+".jpg", cropimg); + cropimgs.emplace_back(cropimg); + center_bs.emplace_back(center); + scale_bs.emplace_back(scale); + } + if (cropimgs.size() == 1 || + (cropimgs.size() > 0 && i == results_oimg.size() - 1)) { + kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results); + cropimgs.clear(); + center_bs.clear(); + scale_bs.clear(); + } + } + std::vector compression_params; + compression_params.push_back(cv::IMWRITE_JPEG_QUALITY); + compression_params.push_back(95); + std::string kpts_savepath = + "keypoint_" + img_name.substr(img_name.find_last_of('/') + 1); + cv::Mat kpts_vis_img = + VisualizeKptsResult(image, kpts_results, {0, 255, 0}, 0.3); + if (save_img) { + cv::imwrite(kpts_savepath, kpts_vis_img, compression_params); + printf("Visualized output saved as %s\n", kpts_savepath.c_str()); + } else { + cv::imshow("image", kpts_vis_img); + } +} + +int image_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + const char* imagepath) { + std::vector filenames; + cv::glob(imagepath, filenames, false); + + for (auto img_name : filenames) { + cv::Mat image = cv::imread(img_name); + if (image.empty()) { + fprintf(stderr, "cv::imread %s failed\n", img_name.c_str()); + return -1; + } + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi); + std::vector results; + detector.detect(resized_img, results); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, img_name); + } + } + return 0; +} + +int webcam_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + int cam_id) { + cv::Mat image; + cv::VideoCapture cap(cam_id); + + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi); + std::vector results; + detector.detect(resized_img, results); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, "", false); + } + } + return 0; +} + +int video_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + const char* path) { + cv::Mat image; + cv::VideoCapture cap(path); + + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi); + std::vector results; + detector.detect(resized_img, results); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, "", false); + } + } + return 0; +} + +int benchmark(KeyPointDetector* kpts_detector) { + int loop_num = 100; + int warm_up = 8; + + double time_min = DBL_MAX; + double time_max = -DBL_MAX; + double time_avg = 0; + cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1)); + std::vector center = {128, 96}; + std::vector scale = {256, 192}; + std::vector cropimgs = {image}; + std::vector> center_bs = {center}; + std::vector> scale_bs = {scale}; + std::vector kpts_results; + + for (int i = 0; i < warm_up + loop_num; i++) { + auto start = std::chrono::steady_clock::now(); + std::vector results; + kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results); + auto end = std::chrono::steady_clock::now(); + + std::chrono::duration elapsed = end - start; + double time = elapsed.count(); + if (i >= warm_up) { + time_min = (std::min)(time_min, time); + time_max = (std::max)(time_max, time); + time_avg += time; + } + } + time_avg /= loop_num; + fprintf(stderr, + "%20s min = %7.2f max = %7.2f avg = %7.2f\n", + "tinypose", + time_min, + time_max, + time_avg); + 
return 0; +} + +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stderr, + "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n " + "For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; " + "\n For benchmark, mode=3 path=0.\n", + argv[0]); + return -1; + } + PicoDet detector = + PicoDet("../weight/picodet_m_416.mnn", 416, 416, 4, 0.45, 0.3); + KeyPointDetector* kpts_detector = + new KeyPointDetector("../weight/tinypose256.mnn", 4, 256, 192); + int mode = atoi(argv[1]); + switch (mode) { + case 0: { + int cam_id = atoi(argv[2]); + webcam_demo(detector, kpts_detector, cam_id); + break; + } + case 1: { + const char* images = argv[2]; + image_demo(detector, kpts_detector, images); + break; + } + case 2: { + const char* path = argv[2]; + video_demo(detector, kpts_detector, path); + break; + } + case 3: { + benchmark(kpts_detector); + break; + } + default: { + fprintf(stderr, + "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; " + "\n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, " + "mode=2; \n For benchmark, mode=3 path=0.\n", + argv[0]); + break; + } + } + delete kpts_detector; + kpts_detector = nullptr; +} diff --git a/deploy/third_engine/demo_mnn_kpts/picodet_mnn.cpp b/deploy/third_engine/demo_mnn_kpts/picodet_mnn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f38e68c16317e7bc2019a6e533e550bd6607f93 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/picodet_mnn.cpp @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn + +#include "picodet_mnn.h" + +using namespace std; + +PicoDet::PicoDet(const std::string &mnn_path, + int input_width, + int input_length, + int num_thread_, + float score_threshold_, + float nms_threshold_) { + num_thread = num_thread_; + in_w = input_width; + in_h = input_length; + score_threshold = score_threshold_; + nms_threshold = nms_threshold_; + + PicoDet_interpreter = std::shared_ptr( + MNN::Interpreter::createFromFile(mnn_path.c_str())); + MNN::ScheduleConfig config; + config.numThread = num_thread; + MNN::BackendConfig backendConfig; + backendConfig.precision = (MNN::BackendConfig::PrecisionMode)2; + config.backendConfig = &backendConfig; + + PicoDet_session = PicoDet_interpreter->createSession(config); + + input_tensor = PicoDet_interpreter->getSessionInput(PicoDet_session, nullptr); +} + +PicoDet::~PicoDet() { + PicoDet_interpreter->releaseModel(); + PicoDet_interpreter->releaseSession(PicoDet_session); +} + +int PicoDet::detect(cv::Mat &raw_image, std::vector &result_list) { + if (raw_image.empty()) { + std::cout << "image is empty ,please check!" 
<< std::endl; + return -1; + } + + image_h = raw_image.rows; + image_w = raw_image.cols; + cv::Mat image; + cv::resize(raw_image, image, cv::Size(in_w, in_h)); + + PicoDet_interpreter->resizeTensor(input_tensor, {1, 3, in_h, in_w}); + PicoDet_interpreter->resizeSession(PicoDet_session); + std::shared_ptr pretreat(MNN::CV::ImageProcess::create( + MNN::CV::BGR, MNN::CV::BGR, mean_vals, 3, norm_vals, 3)); + pretreat->convert(image.data, in_w, in_h, image.step[0], input_tensor); + + auto start = chrono::steady_clock::now(); + + // run network + PicoDet_interpreter->runSession(PicoDet_session); + + // get output data + std::vector> results; + results.resize(num_class); + + for (const auto &head_info : heads_info) { + MNN::Tensor *tensor_scores = PicoDet_interpreter->getSessionOutput( + PicoDet_session, head_info.cls_layer.c_str()); + MNN::Tensor *tensor_boxes = PicoDet_interpreter->getSessionOutput( + PicoDet_session, head_info.dis_layer.c_str()); + + MNN::Tensor tensor_scores_host(tensor_scores, + tensor_scores->getDimensionType()); + tensor_scores->copyToHostTensor(&tensor_scores_host); + + MNN::Tensor tensor_boxes_host(tensor_boxes, + tensor_boxes->getDimensionType()); + tensor_boxes->copyToHostTensor(&tensor_boxes_host); + + decode_infer(&tensor_scores_host, + &tensor_boxes_host, + head_info.stride, + score_threshold, + results); + } + + auto end = chrono::steady_clock::now(); + chrono::duration elapsed = end - start; + cout << "inference time:" << elapsed.count() << " s, "; + + for (int i = 0; i < (int)results.size(); i++) { + nms(results[i], nms_threshold); + + for (auto box : results[i]) { + box.x1 = box.x1 / in_w * image_w; + box.x2 = box.x2 / in_w * image_w; + box.y1 = box.y1 / in_h * image_h; + box.y2 = box.y2 / in_h * image_h; + result_list.push_back(box); + } + } + cout << "detect " << result_list.size() << " objects." 
<< std::endl; + ; + + return 0; +} + +void PicoDet::decode_infer(MNN::Tensor *cls_pred, + MNN::Tensor *dis_pred, + int stride, + float threshold, + std::vector> &results) { + int feature_h = in_h / stride; + int feature_w = in_w / stride; + + for (int idx = 0; idx < feature_h * feature_w; idx++) { + const float *scores = cls_pred->host() + (idx * num_class); + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + for (int label = 0; label < num_class; label++) { + if (scores[label] > score) { + score = scores[label]; + cur_label = label; + } + } + if (score > threshold) { + const float *bbox_pred = + dis_pred->host() + (idx * 4 * (reg_max + 1)); + results[cur_label].push_back( + disPred2Bbox(bbox_pred, cur_label, score, col, row, stride)); + } + } +} + +BoxInfo PicoDet::disPred2Bbox( + const float *&dfl_det, int label, float score, int x, int y, int stride) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[reg_max + 1]; + activation_function_softmax( + dfl_det + i * (reg_max + 1), dis_after_sm, reg_max + 1); + for (int j = 0; j < reg_max + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)in_w); + float ymax = (std::min)(ct_y + dis_pred[3], (float)in_h); + return BoxInfo{xmin, ymin, xmax, ymax, score, label}; +} + +void PicoDet::nms(std::vector &input_boxes, float NMS_THRESH) { + std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { + return a.score > b.score; + }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * + (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= NMS_THRESH) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} + +string PicoDet::get_label_str(int label) { return labels[label]; } + +inline float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); } + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} diff --git a/deploy/third_engine/demo_mnn_kpts/picodet_mnn.h b/deploy/third_engine/demo_mnn_kpts/picodet_mnn.h new file mode 100644 index 
0000000000000000000000000000000000000000..8686f5cf69cdb49a3aa09eecaf02f3514b1d05c1 --- /dev/null +++ b/deploy/third_engine/demo_mnn_kpts/picodet_mnn.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn + +#ifndef __PicoDet_H__ +#define __PicoDet_H__ + +#pragma once + +#include "Interpreter.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include "ImageProcess.hpp" +#include "MNNDefine.h" +#include "Tensor.hpp" + +typedef struct HeadInfo_ { + std::string cls_layer; + std::string dis_layer; + int stride; +} HeadInfo; + +typedef struct BoxInfo_ { + float x1; + float y1; + float x2; + float y2; + float score; + int label; +} BoxInfo; + +class PicoDet { + public: + PicoDet(const std::string &mnn_path, + int input_width, + int input_length, + int num_thread_ = 4, + float score_threshold_ = 0.5, + float nms_threshold_ = 0.3); + + ~PicoDet(); + + int detect(cv::Mat &img, std::vector &result_list); + std::string get_label_str(int label); + + private: + void decode_infer(MNN::Tensor *cls_pred, + MNN::Tensor *dis_pred, + int stride, + float threshold, + std::vector> &results); + BoxInfo disPred2Bbox( + const float *&dfl_det, int label, float score, int x, int y, int stride); + void nms(std::vector &input_boxes, float NMS_THRESH); + + private: + std::shared_ptr PicoDet_interpreter; + MNN::Session *PicoDet_session = nullptr; + MNN::Tensor *input_tensor = nullptr; + + int num_thread; + int image_w; + int image_h; + + int in_w = 320; + int in_h = 320; + + float score_threshold; + float nms_threshold; + + const float mean_vals[3] = {103.53f, 116.28f, 123.675f}; + const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f}; + + const int num_class = 80; + const int reg_max = 7; + + std::vector heads_info{ + // cls_pred|dis_pred|stride + {"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8}, + {"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16}, + {"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32}, + {"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64}, + }; + + std::vector labels{ + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell 
phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; +}; + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length); + +inline float fast_exp(float x); +inline float sigmoid(float x); + +#endif diff --git a/deploy/third_engine/demo_ncnn/CMakeLists.txt b/deploy/third_engine/demo_ncnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d4344c699d58082eb37ebe6089e16ad120bc87e --- /dev/null +++ b/deploy/third_engine/demo_ncnn/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.9) +set(CMAKE_CXX_STANDARD 17) + +project(picodet_demo) + +find_package(OpenMP REQUIRED) +if(OPENMP_FOUND) + message("OPENMP FOUND") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + +# find_package(OpenCV REQUIRED) +find_package(OpenCV REQUIRED PATHS "/path/to/opencv-3.4.16_gcc8.2_ffmpeg") + +# find_package(ncnn REQUIRED) +find_package(ncnn REQUIRED PATHS "/path/to/ncnn/build/install/lib/cmake/ncnn") +if(NOT TARGET ncnn) + message(WARNING "ncnn NOT FOUND! Please set ncnn_DIR environment variable") +else() + message("ncnn FOUND ") +endif() + +include_directories( + ${OpenCV_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +) + + +add_executable(picodet_demo main.cpp picodet.cpp) + +target_link_libraries( + picodet_demo + ncnn + ${OpenCV_LIBS} +) diff --git a/deploy/third_engine/demo_ncnn/README.md b/deploy/third_engine/demo_ncnn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f9867b8acc9652e22bfd891671da4f5429436c3c --- /dev/null +++ b/deploy/third_engine/demo_ncnn/README.md @@ -0,0 +1,129 @@ +# PicoDet NCNN Demo + +该Demo提供的预测代码是根据[Tencent's NCNN framework](https://github.com/Tencent/ncnn)推理库预测的。 + +# 第一步:编译 +## Windows +### Step1. +Download and Install Visual Studio from https://visualstudio.microsoft.com/vs/community/ + +### Step2. +Download and install OpenCV from https://github.com/opencv/opencv/releases + +为了方便,如果环境是gcc8.2 x86环境,可直接下载以下库: +```shell +wget https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz +tar -xf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz +``` + +### Step3(可选). +Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home + +### Step4:编译NCNN + +``` shell script +git clone --recursive https://github.com/Tencent/ncnn.git +``` +Build NCNN following this tutorial: [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017) + +### Step5. + +增加 `ncnn_DIR` = `YOUR_NCNN_PATH/build/install/lib/cmake/ncnn` 到系统变量中 + +Build project: Open x64 Native Tools Command Prompt for VS 2019 or 2017 + +``` cmd +cd +mkdir -p build +cd build +cmake .. +msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64 +``` + +## Linux + +### Step1. +Build and install OpenCV from https://github.com/opencv/opencv + +### Step2(可选). 
+Download Vulkan SDK from https://vulkan.lunarg.com/sdk/home + +### Step3:编译NCNN +Clone NCNN repository + +``` shell script +git clone --recursive https://github.com/Tencent/ncnn.git +``` + +Build NCNN following this tutorial: [Build for Linux / NVIDIA Jetson / Raspberry Pi](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux) + +### Step4:编译可执行文件 + +``` shell script +cd +mkdir build +cd build +cmake .. +make +``` +# Run demo + +- 准备模型 + ```shell + modelName=picodet_s_320_coco_lcnet + # 导出Inference model + python tools/export_model.py \ + -c configs/picodet/${modelName}.yml \ + -o weights=${modelName}.pdparams \ + --output_dir=inference_model + # 转换到ONNX + paddle2onnx --model_dir inference_model/${modelName} \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 11 \ + --save_file ${modelName}.onnx + # 简化模型 + python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx + # 将模型转换至NCNN格式 + Run onnx2ncnn in ncnn tools to generate ncnn .param and .bin file. + ``` +转NCNN模型可以利用在线转换工具 [https://convertmodel.com](https://convertmodel.com/) + +为了快速测试,可直接下载:[picodet_s_320_coco_lcnet-opt.bin](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet-opt.bin)/ [picodet_s_320_coco_lcnet-opt.param](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet-opt.param)(不带后处理)。 + +**注意:**由于带后处理后,NCNN预测会出NAN,暂时使用不带后处理Demo即可,带后处理的Demo正在升级中,很快发布。 + + +## 开始运行 + +首先新建预测结果存放目录: +```shell +cp -r ../demo_onnxruntime/imgs . +cd build +mkdir ../results +``` + +- 预测一张图片 +``` shell +./picodet_demo 0 ../picodet_s_320_coco_lcnet.bin ../picodet_s_320_coco_lcnet.param 320 320 ../imgs/dog.jpg 0 +``` +具体参数解析可参考`main.cpp`。 + +-测试速度Benchmark + +``` shell +./picodet_demo 1 ../picodet_s_320_lcnet.bin ../picodet_s_320_lcnet.param 320 320 0 +``` + +## FAQ + +- 预测结果精度不对: +请先确认模型输入shape是否对齐,并且模型输出name是否对齐,不带后处理的PicoDet增强版模型输出name如下: +```shell +# 分类分支 | 检测分支 +{"transpose_0.tmp_0", "transpose_1.tmp_0"}, +{"transpose_2.tmp_0", "transpose_3.tmp_0"}, +{"transpose_4.tmp_0", "transpose_5.tmp_0"}, +{"transpose_6.tmp_0", "transpose_7.tmp_0"}, +``` +可使用[netron](https://netron.app)查看具体name,并修改`picodet_mnn.hpp`中相应`non_postprocess_heads_info`数组。 diff --git a/deploy/third_engine/demo_ncnn/main.cpp b/deploy/third_engine/demo_ncnn/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f69af93b2de7f9404fb86d5112ce62056d936b4 --- /dev/null +++ b/deploy/third_engine/demo_ncnn/main.cpp @@ -0,0 +1,210 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
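As the FAQ above notes, wrong results usually come from a mismatch between the exported model's output names and the head names hard-coded in the demo. Besides netron, the output names can also be listed with the `onnx` Python package; a small editor's sketch follows (the model filename is a placeholder for your own export):

```python
# Editor's sketch: list the graph outputs of an exported ONNX model so they can
# be compared with the cls/dis layer names configured in the demo header.
import onnx

model = onnx.load("picodet_s_320_coco_lcnet_processed.onnx")  # placeholder path
for out in model.graph.output:
    dims = [d.dim_value if d.dim_value > 0 else -1
            for d in out.type.tensor_type.shape.dim]
    print(out.name, dims)
```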
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn + +#include "picodet.h" +#include +#include +#include +#include +#include +#include + +#define __SAVE_RESULT__ // if defined save drawed results to ../results, else + // show it in windows +struct object_rect { + int x; + int y; + int width; + int height; +}; + +std::vector GenerateColorMap(int num_class) { + auto colormap = std::vector(3 * num_class, 0); + for (int i = 0; i < num_class; ++i) { + int j = 0; + int lab = i; + while (lab) { + colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j)); + colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)); + colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)); + ++j; + lab >>= 3; + } + } + return colormap; +} + +void draw_bboxes(const cv::Mat &im, const std::vector &bboxes, + std::string save_path = "None") { + static const char *class_names[] = { + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = im.clone(); + int src_w = image.cols; + int src_h = image.rows; + int thickness = 2; + auto colormap = GenerateColorMap(sizeof(class_names)); + + for (size_t i = 0; i < bboxes.size(); i++) { + const BoxInfo &bbox = bboxes[i]; + std::cout << bbox.x1 << ". " << bbox.y1 << ". " << bbox.x2 << ". " + << bbox.y2 << ". " << std::endl; + int c1 = colormap[3 * bbox.label + 0]; + int c2 = colormap[3 * bbox.label + 1]; + int c3 = colormap[3 * bbox.label + 2]; + cv::Scalar color = cv::Scalar(c1, c2, c3); + // cv::Scalar color = cv::Scalar(0, 0, 255); + cv::rectangle(image, cv::Rect(cv::Point(bbox.x1, bbox.y1), + cv::Point(bbox.x2, bbox.y2)), + color, 1); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100); + + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); + + int x = bbox.x1; + int y = bbox.y1 - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, + label_size.height + baseLine)), + color, -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255), 1); + } + + if (save_path == "None") { + cv::imshow("image", image); + } else { + cv::imwrite(save_path, image); + std::cout << "Result save in: " << save_path << std::endl; + } +} + +int image_demo(PicoDet &detector, const char *imagepath, + int has_postprocess = 0) { + std::vector filenames; + cv::glob(imagepath, filenames, false); + bool is_postprocess = has_postprocess > 0 ? 
true : false; + for (auto img_name : filenames) { + cv::Mat image = cv::imread(img_name, cv::IMREAD_COLOR); + if (image.empty()) { + fprintf(stderr, "cv::imread %s failed\n", img_name.c_str()); + return -1; + } + std::vector results; + detector.detect(image, results, is_postprocess); + std::cout << "detect done." << std::endl; + +#ifdef __SAVE_RESULT__ + std::string save_path = img_name; + draw_bboxes(image, results, save_path.replace(3, 4, "results")); +#else + draw_bboxes(image, results); + cv::waitKey(0); +#endif + } + return 0; +} + +int benchmark(PicoDet &detector, int width, int height, + int has_postprocess = 0) { + int loop_num = 100; + int warm_up = 8; + + double time_min = DBL_MAX; + double time_max = -DBL_MAX; + double time_avg = 0; + cv::Mat image(width, height, CV_8UC3, cv::Scalar(1, 1, 1)); + bool is_postprocess = has_postprocess > 0 ? true : false; + for (int i = 0; i < warm_up + loop_num; i++) { + double start = ncnn::get_current_time(); + std::vector results; + detector.detect(image, results, is_postprocess); + double end = ncnn::get_current_time(); + + double time = end - start; + if (i >= warm_up) { + time_min = (std::min)(time_min, time); + time_max = (std::max)(time_max, time); + time_avg += time; + } + } + time_avg /= loop_num; + fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", + time_min, time_max, time_avg); + return 0; +} + +int main(int argc, char **argv) { + int mode = atoi(argv[1]); + char *bin_model_path = argv[2]; + char *param_model_path = argv[3]; + int height = 320; + int width = 320; + if (argc == 5) { + height = atoi(argv[4]); + width = atoi(argv[5]); + } + PicoDet detector = + PicoDet(param_model_path, bin_model_path, width, height, true, 0.45, 0.3); + if (mode == 1) { + + benchmark(detector, width, height, atoi(argv[6])); + } else { + if (argc != 6) { + std::cout << "Must set image file, such as ./picodet_demo 0 " + "../picodet_s_320_lcnet.bin ../picodet_s_320_lcnet.param " + "320 320 img.jpg" + << std::endl; + } + const char *images = argv[6]; + image_demo(detector, images, atoi(argv[7])); + } +} diff --git a/deploy/third_engine/demo_ncnn/picodet.cpp b/deploy/third_engine/demo_ncnn/picodet.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d5f0ba3c788b0813f85dc61e35ac543661212d1c --- /dev/null +++ b/deploy/third_engine/demo_ncnn/picodet.cpp @@ -0,0 +1,236 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
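The `GenerateColorMap` helper in the main.cpp above builds a deterministic palette by spreading the bits of the class index across the R, G and B channels, a scheme similar to the PASCAL VOC segmentation palette. A direct Python transcription (editor's illustration, not part of the demo sources):

```python
# Editor's sketch of the bit-spreading palette used by GenerateColorMap above.
def generate_color_map(num_classes):
    colormap = [0] * (3 * num_classes)
    for i in range(num_classes):
        lab, j = i, 0
        while lab:
            colormap[i * 3 + 0] |= ((lab >> 0) & 1) << (7 - j)
            colormap[i * 3 + 1] |= ((lab >> 1) & 1) << (7 - j)
            colormap[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j)
            j += 1
            lab >>= 3
    return colormap

palette = generate_color_map(80)
print(palette[:9])  # first three RGB triples
```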
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn + +#include "picodet.h" +#include +#include + +inline float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); } + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} + +bool PicoDet::hasGPU = false; +PicoDet *PicoDet::detector = nullptr; + +PicoDet::PicoDet(const char *param, const char *bin, int input_width, + int input_hight, bool useGPU, float score_threshold_ = 0.5, + float nms_threshold_ = 0.3) { + this->Net = new ncnn::Net(); +#if NCNN_VULKAN + this->hasGPU = ncnn::get_gpu_count() > 0; +#endif + this->Net->opt.use_vulkan_compute = this->hasGPU && useGPU; + this->Net->opt.use_fp16_arithmetic = true; + this->Net->load_param(param); + this->Net->load_model(bin); + this->in_w = input_width; + this->in_h = input_hight; + this->score_threshold = score_threshold_; + this->nms_threshold = nms_threshold_; +} + +PicoDet::~PicoDet() { delete this->Net; } + +void PicoDet::preprocess(cv::Mat &image, ncnn::Mat &in) { + // cv::resize(image, image, cv::Size(this->in_w, this->in_h), 0.f, 0.f); + int img_w = image.cols; + int img_h = image.rows; + in = ncnn::Mat::from_pixels_resize(image.data, ncnn::Mat::PIXEL_BGR, img_w, + img_h, this->in_w, this->in_h); + const float mean_vals[3] = {103.53f, 116.28f, 123.675f}; + const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f}; + in.substract_mean_normalize(mean_vals, norm_vals); +} + +int PicoDet::detect(cv::Mat image, std::vector &result_list, + bool has_postprocess) { + + ncnn::Mat input; + preprocess(image, input); + auto ex = this->Net->create_extractor(); + ex.set_light_mode(false); + ex.set_num_threads(4); +#if NCNN_VULKAN + ex.set_vulkan_compute(this->hasGPU); +#endif + ex.input("image", input); // picodet + + this->image_h = image.rows; + this->image_w = image.cols; + + std::vector> results; + results.resize(this->num_class); + + if (has_postprocess) { + ncnn::Mat dis_pred; + ncnn::Mat cls_pred; + ex.extract(this->nms_heads_info[0].c_str(), dis_pred); + ex.extract(this->nms_heads_info[1].c_str(), cls_pred); + std::cout << dis_pred.h << " " << dis_pred.w << std::endl; + std::cout << cls_pred.h << " " << cls_pred.w << std::endl; + this->nms_boxes(cls_pred, dis_pred, this->score_threshold, results); + } else { + for (const auto &head_info : this->non_postprocess_heads_info) { + ncnn::Mat dis_pred; + ncnn::Mat cls_pred; + ex.extract(head_info.dis_layer.c_str(), dis_pred); + ex.extract(head_info.cls_layer.c_str(), cls_pred); + this->decode_infer(cls_pred, dis_pred, head_info.stride, + this->score_threshold, results); + } + } + + for (int i = 0; i < (int)results.size(); i++) { + this->nms(results[i], this->nms_threshold); + + for (auto box : results[i]) { + box.x1 = box.x1 / this->in_w * this->image_w; + box.x2 = box.x2 / this->in_w * this->image_w; + box.y1 = box.y1 / this->in_h * this->image_h; + box.y2 = box.y2 / this->in_h * this->image_h; + result_list.push_back(box); + } + } + return 0; +} + +void PicoDet::nms_boxes(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, + float score_threshold, + std::vector> &result_list) { + 
BoxInfo bbox; + int i, j; + for (i = 0; i < dis_pred.h; i++) { + bbox.x1 = dis_pred.row(i)[0]; + bbox.y1 = dis_pred.row(i)[1]; + bbox.x2 = dis_pred.row(i)[2]; + bbox.y2 = dis_pred.row(i)[3]; + const float *scores = cls_pred.row(i); + float score = 0; + int cur_label = 0; + for (int label = 0; label < this->num_class; label++) { + float score_ = cls_pred.row(label)[i]; + if (score_ > score) { + score = score_; + cur_label = label; + } + } + bbox.score = score; + bbox.label = cur_label; + result_list[cur_label].push_back(bbox); + } +} + +void PicoDet::decode_infer(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, int stride, + float threshold, + std::vector> &results) { + int feature_h = ceil((float)this->in_w / stride); + int feature_w = ceil((float)this->in_h / stride); + + for (int idx = 0; idx < feature_h * feature_w; idx++) { + const float *scores = cls_pred.row(idx); + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + for (int label = 0; label < this->num_class; label++) { + if (scores[label] > score) { + score = scores[label]; + cur_label = label; + } + } + if (score > threshold) { + const float *bbox_pred = dis_pred.row(idx); + results[cur_label].push_back( + this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride)); + } + } +} + +BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score, + int x, int y, int stride) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[this->reg_max + 1]; + activation_function_softmax(dfl_det + i * (this->reg_max + 1), dis_after_sm, + this->reg_max + 1); + for (int j = 0; j < this->reg_max + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)this->in_w); + float ymax = (std::min)(ct_y + dis_pred[3], (float)this->in_w); + return BoxInfo{xmin, ymin, xmax, ymax, score, label}; +} + +void PicoDet::nms(std::vector &input_boxes, float NMS_THRESH) { + std::sort(input_boxes.begin(), input_boxes.end(), + [](BoxInfo a, BoxInfo b) { return a.score > b.score; }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * + (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= NMS_THRESH) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} diff --git a/deploy/third_engine/demo_ncnn/picodet.h b/deploy/third_engine/demo_ncnn/picodet.h new file mode 100644 index 0000000000000000000000000000000000000000..dd8c8f5af96aed9393e207b6e920259d95befbe7 --- /dev/null +++ b/deploy/third_engine/demo_ncnn/picodet.h @@ -0,0 +1,87 @@ +// Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn + +#ifndef PICODET_H +#define PICODET_H + +#include +#include + +typedef struct NonPostProcessHeadInfo { + std::string cls_layer; + std::string dis_layer; + int stride; +} NonPostProcessHeadInfo; + +typedef struct BoxInfo { + float x1; + float y1; + float x2; + float y2; + float score; + int label; +} BoxInfo; + +class PicoDet { +public: + PicoDet(const char *param, const char *bin, int input_width, int input_hight, + bool useGPU, float score_threshold_, float nms_threshold_); + + ~PicoDet(); + + static PicoDet *detector; + ncnn::Net *Net; + static bool hasGPU; + + int detect(cv::Mat image, std::vector &result_list, + bool has_postprocess); + +private: + void preprocess(cv::Mat &image, ncnn::Mat &in); + void decode_infer(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, int stride, + float threshold, + std::vector> &results); + BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x, + int y, int stride); + static void nms(std::vector &result, float nms_threshold); + void nms_boxes(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, + float score_threshold, + std::vector> &result_list); + + int image_w; + int image_h; + int in_w = 320; + int in_h = 320; + int num_class = 80; + int reg_max = 7; + + float score_threshold; + float nms_threshold; + + std::vector bbox_output_data_; + std::vector class_output_data_; + + std::vector nms_heads_info{"tmp_16", "concat_4.tmp_0"}; + // If not export post-process, will use non_postprocess_heads_info + std::vector non_postprocess_heads_info{ + // cls_pred|dis_pred|stride + {"transpose_0.tmp_0", "transpose_1.tmp_0", 8}, + {"transpose_2.tmp_0", "transpose_3.tmp_0", 16}, + {"transpose_4.tmp_0", "transpose_5.tmp_0", 32}, + {"transpose_6.tmp_0", "transpose_7.tmp_0", 64}, + }; +}; + +#endif diff --git a/deploy/third_engine/demo_onnx_trt/README.md b/deploy/third_engine/demo_onnx_trt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cdd5b603f2f4a3f46fa85f4d4739974d9079a775 --- /dev/null +++ b/deploy/third_engine/demo_onnx_trt/README.md @@ -0,0 +1,33 @@ +# PP-YOLOE 转ONNX-TRT教程 + +本教程内容为:使用PP-YOLOE模型导出转换为ONNX格式,并定制化修改网络,使用[EfficientNMS_TRT](https://github.com/NVIDIA/TensorRT/tree/main/plugin/efficientNMSPlugin) OP, +可成功运行在[TensorRT](https://github.com/NVIDIA/TensorRT)上,示例仅供参考 + +## 1. 环境依赖 +CUDA 10.2 + [cudnn 8.2.1](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html) + [TensorRT 8.2](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/install-guide/index.htm) +```commandline +onnx +onnxruntime +paddle2onnx +``` + +## 2. Paddle模型导出 +```commandline +python tools/export_model.py -c configs/ppyoloe/ppyoloe_crn_l_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams trt=True exclude_nms=True +``` + +## 3. 
ONNX模型转换 + 定制化修改EfficientNMS_TRT +```commandline +python deploy/third_engine/demo_onnx_trt/onnx_custom.py --onnx_file=output_inference/ppyoloe_crn_l_300e_coco/ppyoloe_crn_l_300e_coco.onnx --model_dir=output_inference/ppyoloe_crn_l_300e_coco/ --opset_version=11 +``` + +## 4. TensorRT Engine +```commandline +trtexec --onnx=output_inference/ppyoloe_crn_l_300e_coco/ppyoloe_crn_l_300e_coco.onnx --saveEngine=ppyoloe_crn_l_300e_coco.engine +``` +**注意**:若运行报错,可尝试添加`--tacticSources=-cublasLt,+cublas`参数解决 + +## 5. 运行TensorRT推理 +```commandline +python deploy/third_engine/demo_onnx_trt/trt_infer.py --infer_cfg=output_inference/ppyoloe_crn_l_300e_coco/infer_cfg.yml --trt_engine=ppyoloe_crn_l_300e_coco.engine --image_file=demo/000000014439.jpg +``` diff --git a/deploy/third_engine/demo_onnx_trt/onnx_custom.py b/deploy/third_engine/demo_onnx_trt/onnx_custom.py new file mode 100644 index 0000000000000000000000000000000000000000..5d6ae82869413c8d6e10c4ad123ca5e64073afc8 --- /dev/null +++ b/deploy/third_engine/demo_onnx_trt/onnx_custom.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import onnx +import onnx_graphsurgeon +import numpy as np +from collections import OrderedDict +from paddle2onnx.command import program2onnx + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--onnx_file', required=True, type=str, help='onnx model path') +parser.add_argument( + '--model_dir', + type=str, + default=None, + help=("Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py.")) +parser.add_argument( + "--opset_version", + type=int, + default=11, + help="set onnx opset version to export") +parser.add_argument( + '--topk_all', type=int, default=300, help='topk objects for every images') +parser.add_argument( + '--iou_thres', type=float, default=0.7, help='iou threshold for NMS') +parser.add_argument( + '--conf_thres', type=float, default=0.01, help='conf threshold for NMS') + + +def main(FLAGS): + assert os.path.exists(FLAGS.onnx_file) + onnx_model = onnx.load(FLAGS.onnx_file) + graph = onnx_graphsurgeon.import_onnx(onnx_model) + graph.toposort() + graph.fold_constants() + graph.cleanup() + + num_anchors = graph.outputs[1].shape[2] + num_classes = graph.outputs[1].shape[1] + scores = onnx_graphsurgeon.Variable( + name='scores', shape=[-1, num_anchors, num_classes], dtype=np.float32) + graph.layer( + op='Transpose', + name='lastTranspose', + inputs=[graph.outputs[1]], + outputs=[scores], + attrs=OrderedDict(perm=[0, 2, 1])) + + attrs = OrderedDict( + plugin_version="1", + background_class=-1, + max_output_boxes=FLAGS.topk_all, + score_threshold=FLAGS.conf_thres, + iou_threshold=FLAGS.iou_thres, + score_activation=False, + box_coding=0, ) + outputs = [ + onnx_graphsurgeon.Variable("num_dets", np.int32, [-1, 1]), + onnx_graphsurgeon.Variable("det_boxes", np.float32, + [-1, FLAGS.topk_all, 4]), + 
onnx_graphsurgeon.Variable("det_scores", np.float32, + [-1, FLAGS.topk_all]), + onnx_graphsurgeon.Variable("det_classes", np.int32, + [-1, FLAGS.topk_all]) + ] + graph.layer( + op='EfficientNMS_TRT', + name="batched_nms", + inputs=[graph.outputs[0], scores], + outputs=outputs, + attrs=attrs) + graph.outputs = outputs + graph.cleanup().toposort() + onnx.save(onnx_graphsurgeon.export_onnx(graph), FLAGS.onnx_file) + print(f"The modified onnx model is saved in {FLAGS.onnx_file}") + + +if __name__ == '__main__': + FLAGS = parser.parse_args() + if FLAGS.model_dir is not None: + assert os.path.exists(FLAGS.model_dir) + program2onnx( + model_dir=FLAGS.model_dir, + save_file=FLAGS.onnx_file, + model_filename="model.pdmodel", + params_filename="model.pdiparams", + opset_version=FLAGS.opset_version, + enable_onnx_checker=True) + main(FLAGS) diff --git a/deploy/third_engine/demo_onnx_trt/preprocess.py b/deploy/third_engine/demo_onnx_trt/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..504762db89c867b12b9e9196a4b6abc182795cc3 --- /dev/null +++ b/deploy/third_engine/demo_onnx_trt/preprocess.py @@ -0,0 +1,565 @@ +import numpy as np +import cv2 +import copy + + +def decode_image(img_path): + with open(img_path, 'rb') as f: + im_read = f.read() + data = np.frombuffer(im_read, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + img_info = { + "im_shape": np.array( + im.shape[:2], dtype=np.float32), + "scale_factor": np.array( + [1., 1.], dtype=np.float32) + } + return im, img_info + + +class Resize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + 
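A quick usage sketch for the `Resize` op above (editor's illustration, assuming the class is importable from this module): with `keep_ratio=True` the image is scaled by a single factor chosen so that the short side reaches the smaller target size without the long side exceeding the larger one, and the per-axis ratios are recorded in `scale_factor` so predictions can later be mapped back to the original image.

```python
# Editor's sketch: exercise Resize on a dummy image and inspect the metadata.
import numpy as np
# from preprocess import Resize  # assumed import when run outside this file

resize_op = Resize(target_size=[640, 640], keep_ratio=True)
dummy = np.zeros((480, 320, 3), dtype=np.uint8)
info = {"im_shape": np.array([480., 320.], dtype=np.float32),
        "scale_factor": np.array([1., 1.], dtype=np.float32)}
dummy, info = resize_op(dummy, info)
print(dummy.shape)            # roughly (640, 427, 3): long side capped at 640
print(info["scale_factor"])   # [1.3333334 1.3333334]
```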
+class NormalizeImage(object): + """normalize image + Args: + mean (list): im - mean + std (list): im / std + is_scale (bool): whether need im / 255 + norm_type (str): type in ['mean_std', 'none'] + """ + + def __init__(self, mean, std, is_scale=True, norm_type='mean_std'): + self.mean = mean + self.std = std + self.is_scale = is_scale + self.norm_type = norm_type + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std + return im, im_info + + +class Permute(object): + """permute image + Args: + to_bgr (bool): whether convert RGB to BGR + channel_first (bool): whether convert HWC to CHW + """ + + def __init__(self, ): + super(Permute, self).__init__() + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.transpose((2, 0, 1)).copy() + return im, im_info + + +class PadStride(object): + """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im, im_info + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + + +class LetterBoxResize(object): + def __init__(self, target_size): + """ + Resize image to target size, convert normalized xywh to pixel xyxy + format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). + Args: + target_size (int|list): image target size. 
+ """ + super(LetterBoxResize, self).__init__() + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + + def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): + # letterbox: resize a rectangular image to a padded rectangular + shape = img.shape[:2] # [height, width] + ratio_h = float(height) / shape[0] + ratio_w = float(width) / shape[1] + ratio = min(ratio_h, ratio_w) + new_shape = (round(shape[1] * ratio), + round(shape[0] * ratio)) # [width, height] + padw = (width - new_shape[0]) / 2 + padh = (height - new_shape[1]) / 2 + top, bottom = round(padh - 0.1), round(padh + 0.1) + left, right = round(padw - 0.1), round(padw + 0.1) + + img = cv2.resize( + img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=color) # padded rectangular + return img, ratio, padw, padh + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + height, width = self.target_size + h, w = im.shape[:2] + im, ratio, padw, padh = self.letterbox(im, height=height, width=width) + + new_shape = [round(h * ratio), round(w * ratio)] + im_info['im_shape'] = np.array(new_shape, dtype=np.float32) + im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32) + return im, im_info + + +class Pad(object): + def __init__(self, size, fill_value=[114.0, 114.0, 114.0]): + """ + Pad image to a specified size. + Args: + size (list[int]): image target size + fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0) + """ + super(Pad, self).__init__() + if isinstance(size, int): + size = [size, size] + self.size = size + self.fill_value = fill_value + + def __call__(self, im, im_info): + im_h, im_w = im.shape[:2] + h, w = self.size + if h == im_h and w == im_w: + im = im.astype(np.float32) + return im, im_info + + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = im.astype(np.float32) + im = canvas + return im, im_info + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def get_affine_transform(center, + input_size, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. 
+ Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + if not isinstance(input_size, (np.ndarray, list)): + input_size = np.array([input_size, input_size], dtype=np.float32) + scale_tmp = input_size + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +class WarpAffine(object): + """Warp affine the image + """ + + def __init__(self, + keep_res=False, + pad=31, + input_h=512, + input_w=512, + scale=0.4, + shift=0.1): + self.keep_res = keep_res + self.pad = pad + self.input_h = input_h + self.input_w = input_w + self.scale = scale + self.shift = shift + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + + h, w = img.shape[:2] + + if self.keep_res: + input_h = (h | self.pad) + 1 + input_w = (w | self.pad) + 1 + s = np.array([input_w, input_h], dtype=np.float32) + c = np.array([w // 2, h // 2], dtype=np.float32) + + else: + s = max(h, w) * 1.0 + input_h, input_w = self.input_h, self.input_w + c = np.array([w / 2., h / 2.], dtype=np.float32) + + trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) + img = cv2.resize(img, (w, h)) + inp = cv2.warpAffine( + img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) + return inp, im_info + + +# keypoint preprocess +def get_warp_matrix(theta, size_input, size_dst, size_target): + """This code is based on + https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py + Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + Returns: + matrix (np.ndarray): A matrix for transformation. 
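+ Note: the returned 2x3 matrix is applied with cv2.warpAffine; it rotates by theta about the input center, scales by size_dst / size_target, and translates so that the center of the input maps to the center of the destination.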
+ """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = np.cos(theta) * scale_x + matrix[0, 1] = -np.sin(theta) * scale_x + matrix[0, 2] = scale_x * ( + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * + np.sin(theta) + 0.5 * size_target[0]) + matrix[1, 0] = np.sin(theta) * scale_y + matrix[1, 1] = np.cos(theta) * scale_y + matrix[1, 2] = scale_y * ( + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * + np.cos(theta) + 0.5 * size_target[1]) + return matrix + + +class TopDownEvalAffine(object): + """apply affine transform to image and coords + Args: + trainsize (list): [w, h], the standard size used to train + use_udp (bool): whether to use Unbiased Data Processing. + records(dict): the dict contained the image and coords + Returns: + records (dict): contain the image and coords after tranformed + """ + + def __init__(self, trainsize, use_udp=False): + self.trainsize = trainsize + self.use_udp = use_udp + + def __call__(self, image, im_info): + rot = 0 + imshape = im_info['im_shape'][::-1] + center = im_info['center'] if 'center' in im_info else imshape / 2. + scale = im_info['scale'] if 'scale' in im_info else imshape + if self.use_udp: + trans = get_warp_matrix( + rot, center * 2.0, + [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + else: + trans = get_affine_transform(center, scale, rot, self.trainsize) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + + return image, im_info + + +class Compose: + def __init__(self, transforms): + self.transforms = [] + for op_info in transforms: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + self.transforms.append(eval(op_type)(**new_op_info)) + + def __call__(self, img_path): + img, im_info = decode_image(img_path) + for t in self.transforms: + img, im_info = t(img, im_info) + inputs = copy.deepcopy(im_info) + inputs['image'] = np.ascontiguousarray(img.astype('float32')) + return inputs + + +coco_clsid2catid = { + 0: 1, + 1: 2, + 2: 3, + 3: 4, + 4: 5, + 5: 6, + 6: 7, + 7: 8, + 8: 9, + 9: 10, + 10: 11, + 11: 13, + 12: 14, + 13: 15, + 14: 16, + 15: 17, + 16: 18, + 17: 19, + 18: 20, + 19: 21, + 20: 22, + 21: 23, + 22: 24, + 23: 25, + 24: 27, + 25: 28, + 26: 31, + 27: 32, + 28: 33, + 29: 34, + 30: 35, + 31: 36, + 32: 37, + 33: 38, + 34: 39, + 35: 40, + 36: 41, + 37: 42, + 38: 43, + 39: 44, + 40: 46, + 41: 47, + 42: 48, + 43: 49, + 44: 50, + 45: 51, + 46: 52, + 47: 53, + 48: 54, + 49: 55, + 50: 56, + 51: 57, + 52: 58, + 53: 59, + 54: 60, + 55: 61, + 56: 62, + 57: 63, + 58: 64, + 59: 65, + 60: 67, + 61: 70, + 62: 72, + 63: 73, + 64: 74, + 65: 75, + 66: 76, + 67: 77, + 68: 78, + 69: 79, + 70: 80, + 71: 81, + 72: 82, + 73: 84, + 74: 85, + 75: 86, + 76: 87, + 77: 88, + 78: 89, + 79: 90 +} diff --git a/deploy/third_engine/demo_onnx_trt/trt_infer.py b/deploy/third_engine/demo_onnx_trt/trt_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..4266dc74d492d6ae6660a09dfe4c3fe771783a1c --- /dev/null +++ b/deploy/third_engine/demo_onnx_trt/trt_infer.py @@ -0,0 +1,280 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import numpy as np +import pycuda.autoinit +import pycuda.driver as cuda + +import tensorrt as trt +from collections import OrderedDict +import os +import yaml +import json +import glob +import argparse + +from preprocess import Compose +from preprocess import coco_clsid2catid + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml") +parser.add_argument( + "--trt_engine", required=True, type=str, help="trt engine path") +parser.add_argument("--image_dir", type=str) +parser.add_argument("--image_file", type=str) +parser.add_argument( + "--repeats", + type=int, + default=1, + help="Repeat the running test `repeats` times in benchmark") +parser.add_argument( + "--save_coco", + action='store_true', + default=False, + help="Whether to save coco results") +parser.add_argument( + "--coco_file", type=str, default="results.json", help="coco results path") + +TRT_LOGGER = trt.Logger() +trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="") +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'RTMDet' +} + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +class PredictConfig(object): + """set config of preprocess, postprocess and visualize + Args: + infer_config (str): path of infer_cfg.yml + """ + + def __init__(self, infer_config): + # parsing Yaml config for Preprocess + with open(infer_config) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.label_list = yml_conf['label_list'] + self.use_dynamic_shape = yml_conf['use_dynamic_shape'] + self.draw_threshold = yml_conf.get("draw_threshold", 0.5) + self.mask = yml_conf.get("mask", False) + self.tracker = yml_conf.get("tracker", None) + self.nms = yml_conf.get("NMS", None) + self.fpn_stride = yml_conf.get("fpn_stride", None) + if self.arch == 'RCNN' and yml_conf.get('export_onnx', False): + print( + 'The RCNN export model is used for ONNX and it only supports batch_size = 1' + ) + self.print_config() + + def 
check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +def load_trt_engine(engine_path): + assert os.path.exists(engine_path) + print("Reading engine from file {}".format(engine_path)) + with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + +def predict_image(infer_config, engine, img_list, save_coco=False, repeats=1): + # load preprocess transforms + transforms = Compose(infer_config.preprocess_infos) + + stream = cuda.Stream() + coco_results = [] + num_data = len(img_list) + avg_time = [] + with engine.create_execution_context() as context: + # Allocate host and device buffers + bindings = create_trt_bindings(engine, context) + # warmup + run_trt_context(context, bindings, stream, repeats=10) + # predict image + for i, img_path in enumerate(img_list): + inputs = transforms(img_path) + inputs_name = [k for k, v in bindings.items() if v['is_input']] + inputs = { + k: inputs[k][None, ] + for k in inputs.keys() if k in inputs_name + } + # run infer + for k, v in inputs.items(): + bindings[k]['cpu_data'][...] = v + output = run_trt_context(context, bindings, stream, repeats=repeats) + print(f"{i + 1}/{num_data} infer time: {output['infer_time']} ms.") + avg_time.append(output['infer_time']) + # get output + for k, v in output.items(): + if k in bindings.keys(): + output[k] = np.reshape(v, bindings[k]['shape']) + if save_coco: + coco_results.extend( + format_coco_results(os.path.split(img_path)[-1], output)) + avg_time = np.mean(avg_time) + print( + f"Run on {num_data} data, repeats {repeats} times, avg time: {avg_time} ms." + ) + if save_coco: + with open(FLAGS.coco_file, 'w') as f: + json.dump(coco_results, f) + print(f"save coco json to {FLAGS.coco_file}") + + +def create_trt_bindings(engine, context): + bindings = OrderedDict() + for name in engine: + binding_idx = engine.get_binding_index(name) + size = trt.volume(context.get_binding_shape(binding_idx)) + dtype = trt.nptype(engine.get_binding_dtype(name)) + shape = list(engine.get_binding_shape(binding_idx)) + if shape[0] == -1: + shape[0] = 1 + bindings[name] = { + "idx": binding_idx, + "size": size, + "dtype": dtype, + "shape": shape, + "cpu_data": None, + "cuda_ptr": None, + "is_input": True if engine.binding_is_input(name) else False + } + if engine.binding_is_input(name): + bindings[name]['cpu_data'] = np.random.randn(*shape).astype( + np.float32) + bindings[name]['cuda_ptr'] = cuda.mem_alloc(bindings[name][ + 'cpu_data'].nbytes) + else: + bindings[name]['cpu_data'] = cuda.pagelocked_empty(size, dtype) + bindings[name]['cuda_ptr'] = cuda.mem_alloc(bindings[name][ + 'cpu_data'].nbytes) + return bindings + + +def run_trt_context(context, bindings, stream, repeats=1): + # Transfer input data to the GPU. 
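+ # Inputs are copied to the device once, outside the timing loop; each timed
+ # repeat launches execute_async_v2, copies the outputs back into the page-locked
+ # host buffers, and synchronizes the stream before the timer stops.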
+ for k, v in bindings.items(): + if v['is_input']: + cuda.memcpy_htod_async(v['cuda_ptr'], v['cpu_data'], stream) + in_bindings = [int(v['cuda_ptr']) for k, v in bindings.items()] + output_data = {} + avg_time = [] + for _ in range(repeats): + # Run inference + t1 = time.time() + context.execute_async_v2( + bindings=in_bindings, stream_handle=stream.handle) + # Transfer prediction output from the GPU. + for k, v in bindings.items(): + if not v['is_input']: + cuda.memcpy_dtoh_async(v['cpu_data'], v['cuda_ptr'], stream) + output_data[k] = v['cpu_data'] + # Synchronize the stream + stream.synchronize() + t2 = time.time() + avg_time.append(t2 - t1) + output_data['infer_time'] = np.mean(avg_time) * 1000 + return output_data + + +def format_coco_results(file_name, result): + try: + image_id = int(os.path.splitext(file_name)[0]) + except: + image_id = file_name + num_dets = result['num_dets'].tolist() + det_classes = result['det_classes'].tolist() + det_scores = result['det_scores'].tolist() + det_boxes = result['det_boxes'].tolist() + per_result = [ + { + 'image_id': image_id, + 'category_id': coco_clsid2catid[int(det_classes[0][idx])], + 'file_name': file_name, + 'bbox': [ + det_boxes[0][idx][0], det_boxes[0][idx][1], + det_boxes[0][idx][2] - det_boxes[0][idx][0], + det_boxes[0][idx][3] - det_boxes[0][idx][1] + ], # xyxy -> xywh + 'score': det_scores[0][idx] + } for idx in range(num_dets[0][0]) + ] + + return per_result + + +if __name__ == '__main__': + FLAGS = parser.parse_args() + # load image list + img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) + # load trt engine + engine = load_trt_engine(FLAGS.trt_engine) + # load infer config + infer_config = PredictConfig(FLAGS.infer_cfg) + + predict_image(infer_config, engine, img_list, FLAGS.save_coco, + FLAGS.repeats) + print('Done!') diff --git a/deploy/third_engine/demo_onnxruntime/README.md b/deploy/third_engine/demo_onnxruntime/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bdf7a9432f3e35499c616524b031a27cb2e99fc4 --- /dev/null +++ b/deploy/third_engine/demo_onnxruntime/README.md @@ -0,0 +1,43 @@ +# PicoDet ONNX Runtime Demo + +本文件夹提供利用[ONNX Runtime](https://onnxruntime.ai/docs/)进行 PicoDet 部署与Inference images 的 Demo。 + +## 安装 ONNX Runtime + +本demo采用的是 ONNX Runtime 1.10.0,可直接运行如下指令安装: +```shell +pip install onnxruntime +``` + +详细安装步骤,可参考 [Install ONNX Runtime](https://onnxruntime.ai/docs/install/)。 + +## Inference images + +- 准备测试模型:根据[PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet)中【导出及转换模型】步骤,采用包含后处理的方式导出模型(`-o export.benchmark=False` ),并生成待测试模型简化后的onnx模型(可在下文链接中直接下载)。同时在本目录下新建```onnx_file```文件夹,将导出的onnx模型放在该目录下。 + +- 准备测试所用图片:将待测试图片放在```./imgs```文件夹下,本demo已提供了两张测试图片。 + +- 在本目录下直接运行: + ```shell + python infer_demo.py --modelpath ./onnx_file/picodet_s_320_lcnet_postprocessed.onnx + ``` + 将会对```./imgs```文件夹下所有图片进行识别,并将识别结果保存在```./results```文件夹下。 + +- 结果: +
+*(detection result images omitted; the rendered outputs are saved under `./results`)*
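+
+For reference, a minimal sketch of the inference call that `infer_demo.py` wraps (a sketch only: the exact preprocessing, drawing and thresholds live in `infer_demo.py`, and the model path / 320 input size below are examples):
+
+```python
+# Minimal sketch of infer_demo.py's core flow; paths and input size are examples.
+import cv2
+import numpy as np
+import onnxruntime as ort
+
+sess = ort.InferenceSession("onnx_file/picodet_s_320_lcnet_postprocessed.onnx")
+img = cv2.cvtColor(cv2.imread("imgs/bus.jpg"), cv2.COLOR_BGR2RGB)
+
+# resize to the network input and normalize with the PicoDet mean/std
+blob = cv2.resize(img, (320, 320)).astype(np.float32)
+mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
+std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
+blob = ((blob - mean) / std).transpose(2, 0, 1)[None]  # NCHW
+
+# feed only the inputs this exported model actually declares
+feeds = {
+    "image": blob,
+    "im_shape": np.array([[320., 320.]], dtype=np.float32),
+    "scale_factor": np.array([[320. / img.shape[0], 320. / img.shape[1]]], dtype=np.float32),
+}
+input_names = [i.name for i in sess.get_inputs()]
+feeds = {k: v for k, v in feeds.items() if k in input_names}
+
+boxes = sess.run(None, feeds)[0]  # rows: [class_id, score, x1, y1, x2, y2]
+print(boxes[boxes[:, 1] > 0.5])
+```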
+ +## 模型下载 + +| 模型 | 输入尺寸 | ONNX( w/ 后处理) | +| :-------- | :--------: | :---------------------: | +| PicoDet-XS | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_lcnet_postprocessed.onnx) | +| PicoDet-XS | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_lcnet_postprocessed.onnx) | +| PicoDet-S | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_lcnet_postprocessed.onnx) | +| PicoDet-S | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_lcnet_postprocessed.onnx) | +| PicoDet-M | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_lcnet_postprocessed.onnx) | +| PicoDet-M | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_lcnet_postprocessed.onnx) | +| PicoDet-L | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_lcnet_postprocessed.onnx) | +| PicoDet-L | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_lcnet_postprocessed.onnx) | +| PicoDet-L | 640*640 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_lcnet_postprocessed.onnx) | diff --git a/deploy/third_engine/demo_onnxruntime/coco_label.txt b/deploy/third_engine/demo_onnxruntime/coco_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/deploy/third_engine/demo_onnxruntime/coco_label.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/deploy/third_engine/demo_onnxruntime/imgs/bus.jpg b/deploy/third_engine/demo_onnxruntime/imgs/bus.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b43e311165c785f000eb7493ff8fb662d06a3f83 Binary files /dev/null and b/deploy/third_engine/demo_onnxruntime/imgs/bus.jpg differ diff --git a/deploy/third_engine/demo_onnxruntime/imgs/dog.jpg b/deploy/third_engine/demo_onnxruntime/imgs/dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b0381222eaed50867643f4166092c781e56d5b Binary files /dev/null and b/deploy/third_engine/demo_onnxruntime/imgs/dog.jpg differ diff --git a/deploy/third_engine/demo_onnxruntime/infer_demo.py b/deploy/third_engine/demo_onnxruntime/infer_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b6de5286af6c51ae607201355a30167e894cbd --- /dev/null +++ b/deploy/third_engine/demo_onnxruntime/infer_demo.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import argparse +import onnxruntime as ort +from pathlib import Path +from tqdm import tqdm + + +class PicoDet(): + def __init__(self, + model_pb_path, + label_path, + prob_threshold=0.4, + iou_threshold=0.3): + self.classes = list( + map(lambda x: x.strip(), open(label_path, 'r').readlines())) + self.num_classes = len(self.classes) + self.prob_threshold = prob_threshold + self.iou_threshold = iou_threshold + self.mean = np.array( + [103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3) + self.std = np.array( + [57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3) + so = ort.SessionOptions() + so.log_severity_level = 3 + self.net = ort.InferenceSession(model_pb_path, so) + inputs_name = [a.name for a in self.net.get_inputs()] + inputs_shape = { + k: v.shape + for k, v in zip(inputs_name, self.net.get_inputs()) + } + self.input_shape = inputs_shape['image'][2:] + + def _normalize(self, img): + img = img.astype(np.float32) + img = (img / 255.0 - self.mean / 255.0) / (self.std / 255.0) + return img + + def resize_image(self, srcimg, keep_ratio=False): + top, left, newh, neww = 0, 0, self.input_shape[0], self.input_shape[1] + origin_shape = srcimg.shape[:2] + im_scale_y = newh / float(origin_shape[0]) + im_scale_x = neww / float(origin_shape[1]) + img_shape = np.array([ + [float(self.input_shape[0]), float(self.input_shape[1])] + ]).astype('float32') + scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32') + + if keep_ratio and srcimg.shape[0] != srcimg.shape[1]: + hw_scale = srcimg.shape[0] / srcimg.shape[1] + if hw_scale > 1: + newh, neww = self.input_shape[0], int(self.input_shape[1] / + hw_scale) + img = cv2.resize( + srcimg, (neww, newh), interpolation=cv2.INTER_AREA) + left = int((self.input_shape[1] - neww) * 0.5) + img = cv2.copyMakeBorder( + img, + 0, + 0, + left, + self.input_shape[1] - neww - left, + cv2.BORDER_CONSTANT, + value=0) # add border + else: + newh, neww = int(self.input_shape[0] * + hw_scale), self.input_shape[1] + img = cv2.resize( + srcimg, (neww, newh), interpolation=cv2.INTER_AREA) + top = int((self.input_shape[0] - newh) * 0.5) + img = cv2.copyMakeBorder( + img, + top, + self.input_shape[0] - newh - top, + 0, + 0, + cv2.BORDER_CONSTANT, + value=0) + else: + img = cv2.resize( + srcimg, self.input_shape, interpolation=cv2.INTER_LINEAR) + + return img, img_shape, scale_factor + + def get_color_map_list(self, num_classes): + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + def detect(self, srcimg): + img, im_shape, scale_factor = self.resize_image(srcimg) + img = self._normalize(img) + + blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) + + inputs_dict = { + 'im_shape': im_shape, + 'image': blob, + 'scale_factor': scale_factor + } + inputs_name = [a.name for a in 
self.net.get_inputs()] + net_inputs = {k: inputs_dict[k] for k in inputs_name} + + outs = self.net.run(None, net_inputs) + + outs = np.array(outs[0]) + expect_boxes = (outs[:, 1] > 0.5) & (outs[:, 0] > -1) + np_boxes = outs[expect_boxes, :] + + color_list = self.get_color_map_list(self.num_classes) + clsid2color = {} + + for i in range(np_boxes.shape[0]): + classid, conf = int(np_boxes[i, 0]), np_boxes[i, 1] + xmin, ymin, xmax, ymax = int(np_boxes[i, 2]), int(np_boxes[ + i, 3]), int(np_boxes[i, 4]), int(np_boxes[i, 5]) + + if classid not in clsid2color: + clsid2color[classid] = color_list[classid] + color = tuple(clsid2color[classid]) + + cv2.rectangle( + srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2) + print(self.classes[classid] + ': ' + str(round(conf, 3))) + cv2.putText( + srcimg, + self.classes[classid] + ':' + str(round(conf, 3)), (xmin, + ymin - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, (0, 255, 0), + thickness=2) + + return srcimg + + def detect_folder(self, img_fold, result_path): + img_fold = Path(img_fold) + result_path = Path(result_path) + result_path.mkdir(parents=True, exist_ok=True) + + img_name_list = filter( + lambda x: str(x).endswith(".png") or str(x).endswith(".jpg"), + img_fold.iterdir(), ) + img_name_list = list(img_name_list) + print(f"find {len(img_name_list)} images") + + for img_path in tqdm(img_name_list): + img = cv2.imread(str(img_path), 1) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + srcimg = net.detect(img) + save_path = str(result_path / img_path.name.replace(".png", ".jpg")) + cv2.imwrite(save_path, srcimg) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--modelpath', + type=str, + default='onnx_file/picodet_s_320_lcnet_postprocessed.onnx', + help="onnx filepath") + parser.add_argument( + '--classfile', + type=str, + default='coco_label.txt', + help="classname filepath") + parser.add_argument( + '--confThreshold', default=0.5, type=float, help='class confidence') + parser.add_argument( + '--nmsThreshold', default=0.6, type=float, help='nms iou thresh') + parser.add_argument( + "--img_fold", dest="img_fold", type=str, default="./imgs") + parser.add_argument( + "--result_fold", dest="result_fold", type=str, default="results") + args = parser.parse_args() + + net = PicoDet( + args.modelpath, + args.classfile, + prob_threshold=args.confThreshold, + iou_threshold=args.nmsThreshold) + + net.detect_folder(args.img_fold, args.result_fold) + print( + f'infer results in ./deploy/third_engine/demo_onnxruntime/{args.result_fold}' + ) diff --git a/deploy/third_engine/demo_openvino/CMakeLists.txt b/deploy/third_engine/demo_openvino/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ee82513f414779ba8c7d4ff97ffa90051e8fc35 --- /dev/null +++ b/deploy/third_engine/demo_openvino/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.4.1) +set(CMAKE_CXX_STANDARD 14) + +project(picodet_demo) + +find_package(OpenCV REQUIRED) +find_package(InferenceEngine REQUIRED) +find_package(ngraph REQUIRED) + +include_directories( + ${OpenCV_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +) + +add_executable(picodet_demo main.cpp picodet_openvino.cpp) + +target_link_libraries( + picodet_demo + ${InferenceEngine_LIBRARIES} + ${NGRAPH_LIBRARIES} + ${OpenCV_LIBS} +) diff --git a/deploy/third_engine/demo_openvino/README.md b/deploy/third_engine/demo_openvino/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..99a3e0f27c519308f915627e66118c965b600e6d --- /dev/null +++ b/deploy/third_engine/demo_openvino/README.md @@ -0,0 +1,143 @@ +# PicoDet OpenVINO Demo + +This fold provides PicoDet inference code using +[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html). Most of the implements in this fold are same as *demo_ncnn*. +**Recommand** to use the xxx.tar.gz file to install instead of github method, [link](https://registrationcenter-download.intel.com/akdlm/irc_nas/18096/l_openvino_toolkit_p_2021.4.689.tgz). + + +## Install OpenVINO Toolkit + +Go to [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html) + +Download a suitable version and install. + +Follow the official Get Started Guides: https://docs.openvinotoolkit.org/latest/get_started_guides.html + +## Set the Environment Variables + +### Windows: + +Run this command in cmd. (Every time before using OpenVINO) +```cmd +\openvino_2021\bin\setupvars.bat +``` + +Or set the system environment variables once for all: + +Name |Value +:--------------------:|:--------: +INTEL_OPENVINO_DIR | \openvino_2021 +INTEL_CVSDK_DIR | %INTEL_OPENVINO_DIR% +InferenceEngine_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share +HDDL_INSTALL_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl +ngraph_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake + +And add this to ```Path``` +``` +%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%HDDL_INSTALL_DIR%\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib +``` + +### Linux + +Run this command in shell. (Every time before using OpenVINO) + +```shell +source /opt/intel/openvino_2021/bin/setupvars.sh +``` + +Or edit .bashrc + +```shell +vi ~/.bashrc +``` + +Add this line to the end of the file + +```shell +source /opt/intel/openvino_2021/bin/setupvars.sh +``` + +## Convert model + + Convert to OpenVINO + + ``` shell + cd /openvino_2021/deployment_tools/model_optimizer + ``` + + Install requirements for convert tool + + ```shell + cd ./install_prerequisites + sudo install_prerequisites_onnx.sh + + ``` + + Then convert model. Notice: mean_values and scale_values should be the same with your training settings in YAML config file. + ```shell + python3 mo_onnx.py --input_model --mean_values [103.53,116.28,123.675] --scale_values [57.375,57.12,58.395] + ``` + +## Build + +### Windows + +```cmd +\openvino_2021\bin\setupvars.bat +mkdir -p build +cd build +cmake .. +msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64 +``` + +### Linux +```shell +source /opt/intel/openvino_2021/bin/setupvars.sh +mkdir build +cd build +cmake .. +make +``` + + +## Run demo +Download PicoDet openvino model [PicoDet openvino model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip). + +move picodet openvino model files to the demo's weight folder. + +### Edit file +``` +step1: +main.cpp +#define image_size 416 +... +auto detector = PicoDet("../weight/picodet_m_416.xml"); +... 
+step2: +picodet_openvino.h +#define image_size 416 +``` + +### Webcam + +```shell +picodet_demo 0 0 +``` + +### Inference images + +```shell +picodet_demo 1 IMAGE_FOLDER/*.jpg +``` + +### Inference video + +```shell +picodet_demo 2 VIDEO_PATH +``` + +### Benchmark + +```shell +picodet_demo 3 0 +``` diff --git a/deploy/third_engine/demo_openvino/main.cpp b/deploy/third_engine/demo_openvino/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e24b6070fbcbe9b95a02a7cd07c68bea8afc165d --- /dev/null +++ b/deploy/third_engine/demo_openvino/main.cpp @@ -0,0 +1,302 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet + +#include "picodet_openvino.h" +#include +#include +#include +#include +#define image_size 416 + +struct object_rect { + int x; + int y; + int width; + int height; +}; + +int resize_uniform(cv::Mat &src, cv::Mat &dst, cv::Size dst_size, + object_rect &effect_area) { + int w = src.cols; + int h = src.rows; + int dst_w = dst_size.width; + int dst_h = dst_size.height; + dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0)); + + float ratio_src = w * 1.0 / h; + float ratio_dst = dst_w * 1.0 / dst_h; + + int tmp_w = 0; + int tmp_h = 0; + if (ratio_src > ratio_dst) { + tmp_w = dst_w; + tmp_h = floor((dst_w * 1.0 / w) * h); + } else if (ratio_src < ratio_dst) { + tmp_h = dst_h; + tmp_w = floor((dst_h * 1.0 / h) * w); + } else { + cv::resize(src, dst, dst_size); + effect_area.x = 0; + effect_area.y = 0; + effect_area.width = dst_w; + effect_area.height = dst_h; + return 0; + } + cv::Mat tmp; + cv::resize(src, tmp, cv::Size(tmp_w, tmp_h)); + + if (tmp_w != dst_w) { + int index_w = floor((dst_w - tmp_w) / 2.0); + for (int i = 0; i < dst_h; i++) { + memcpy(dst.data + i * dst_w * 3 + index_w * 3, tmp.data + i * tmp_w * 3, + tmp_w * 3); + } + effect_area.x = index_w; + effect_area.y = 0; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else if (tmp_h != dst_h) { + int index_h = floor((dst_h - tmp_h) / 2.0); + memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3); + effect_area.x = 0; + effect_area.y = index_h; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else { + printf("error\n"); + } + return 0; +} + +const int color_list[80][3] = { + {216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47}, + {76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153}, + {255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0}, + {0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0}, + {84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0}, + {255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127}, + {0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127}, + {84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127}, + {170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127}, + {255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255}, + {0, 255, 255}, {84, 0, 255}, {84, 84, 255}, 
{84, 170, 255}, + {84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255}, + {170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255}, + {42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0}, + {212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0}, + {0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, + {0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170}, + {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36}, + {72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182}, + {218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0}, +}; + +void draw_bboxes(const cv::Mat &bgr, const std::vector &bboxes, + object_rect effect_roi) { + static const char *class_names[] = { + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = bgr.clone(); + int src_w = image.cols; + int src_h = image.rows; + int dst_w = effect_roi.width; + int dst_h = effect_roi.height; + float width_ratio = (float)src_w / (float)dst_w; + float height_ratio = (float)src_h / (float)dst_h; + + for (size_t i = 0; i < bboxes.size(); i++) { + const BoxInfo &bbox = bboxes[i]; + cv::Scalar color = + cv::Scalar(color_list[bbox.label][0], color_list[bbox.label][1], + color_list[bbox.label][2]); + cv::rectangle(image, + cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, + (bbox.y1 - effect_roi.y) * height_ratio), + cv::Point((bbox.x2 - effect_roi.x) * width_ratio, + (bbox.y2 - effect_roi.y) * height_ratio)), + color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100); + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); + int x = (bbox.x1 - effect_roi.x) * width_ratio; + int y = + (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, + label_size.height + baseLine)), + color, -1); + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255)); + } + + cv::imwrite("../predict.jpg", image); +} + +int image_demo(PicoDet &detector, const char *imagepath) { + std::vector filenames; + cv::glob(imagepath, filenames, false); + + for (auto img_name : filenames) { + cv::Mat image = cv::imread(img_name); + if (image.empty()) { + return -1; + } + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(image_size, image_size), + effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + draw_bboxes(image, results, effect_roi); + } + return 0; +} + +int 
webcam_demo(PicoDet &detector, int cam_id) { + cv::Mat image; + cv::VideoCapture cap(cam_id); + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(image_size, image_size), + effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + draw_bboxes(image, results, effect_roi); + cv::waitKey(1); + } + return 0; +} + +int video_demo(PicoDet &detector, const char *path) { + cv::Mat image; + cv::VideoCapture cap(path); + + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform(image, resized_img, cv::Size(image_size, image_size), + effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + draw_bboxes(image, results, effect_roi); + cv::waitKey(1); + } + return 0; +} + +int benchmark(PicoDet &detector) { + int loop_num = 100; + int warm_up = 8; + + double time_min = DBL_MAX; + double time_max = -DBL_MAX; + double time_avg = 0; + cv::Mat image(image_size, image_size, CV_8UC3, cv::Scalar(1, 1, 1)); + + for (int i = 0; i < warm_up + loop_num; i++) { + auto start = std::chrono::steady_clock::now(); + std::vector results; + results = detector.detect(image, 0.4, 0.5); + auto end = std::chrono::steady_clock::now(); + double time = + std::chrono::duration(end - start).count(); + if (i >= warm_up) { + time_min = (std::min)(time_min, time); + time_max = (std::max)(time_max, time); + time_avg += time; + } + } + time_avg /= loop_num; + fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", + time_min, time_max, time_avg); + return 0; +} + +int main(int argc, char **argv) { + if (argc != 3) { + fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is " + "cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n " + "For video, mode=2; \n For benchmark, mode=3 path=0.\n", + argv[0]); + return -1; + } + std::cout << "start init model" << std::endl; + auto detector = PicoDet("../weight/picodet_m_416.xml"); + std::cout << "success" << std::endl; + + int mode = atoi(argv[1]); + switch (mode) { + case 0: { + int cam_id = atoi(argv[2]); + webcam_demo(detector, cam_id); + break; + } + case 1: { + const char *images = argv[2]; + image_demo(detector, images); + break; + } + case 2: { + const char *path = argv[2]; + video_demo(detector, path); + break; + } + case 3: { + benchmark(detector); + break; + } + default: { + fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is " + "cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n " + "For video, mode=2; \n For benchmark, mode=3 path=0.\n", + argv[0]); + break; + } + } +} diff --git a/deploy/third_engine/demo_openvino/picodet_openvino.cpp b/deploy/third_engine/demo_openvino/picodet_openvino.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04b0c482d5738ffb97428efb0faf68f3d6a03e1a --- /dev/null +++ b/deploy/third_engine/demo_openvino/picodet_openvino.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino + +#include "picodet_openvino.h" + +inline float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); } + +template +int activation_function_softmax(const _Tp *src, _Tp *dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} + +PicoDet::PicoDet(const char *model_path) { + InferenceEngine::Core ie; + InferenceEngine::CNNNetwork model = ie.ReadNetwork(model_path); + // prepare input settings + InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo()); + input_name_ = inputs_map.begin()->first; + InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second; + // prepare output settings + InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo()); + for (auto &output_info : outputs_map) { + output_info.second->setPrecision(InferenceEngine::Precision::FP32); + } + + // get network + network_ = ie.LoadNetwork(model, "CPU"); + infer_request_ = network_.CreateInferRequest(); +} + +PicoDet::~PicoDet() {} + +void PicoDet::preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob) { + int img_w = image.cols; + int img_h = image.rows; + int channels = 3; + + InferenceEngine::MemoryBlob::Ptr mblob = + InferenceEngine::as(blob); + if (!mblob) { + THROW_IE_EXCEPTION + << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, " + << "but by fact we were not able to cast inputBlob to MemoryBlob"; + } + auto mblobHolder = mblob->wmap(); + float *blob_data = mblobHolder.as(); + + for (size_t c = 0; c < channels; c++) { + for (size_t h = 0; h < img_h; h++) { + for (size_t w = 0; w < img_w; w++) { + blob_data[c * img_w * img_h + h * img_w + w] = + (float)image.at(h, w)[c]; + } + } + } +} + +std::vector PicoDet::detect(cv::Mat image, float score_threshold, + float nms_threshold) { + InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_); + preprocess(image, input_blob); + + // do inference + infer_request_.Infer(); + + // get output + std::vector> results; + results.resize(this->num_class_); + + for (const auto &head_info : this->heads_info_) { + const InferenceEngine::Blob::Ptr dis_pred_blob = + infer_request_.GetBlob(head_info.dis_layer); + const InferenceEngine::Blob::Ptr cls_pred_blob = + infer_request_.GetBlob(head_info.cls_layer); + + auto mdis_pred = + InferenceEngine::as(dis_pred_blob); + auto mdis_pred_holder = mdis_pred->rmap(); + const float *dis_pred = mdis_pred_holder.as(); + + auto mcls_pred = + InferenceEngine::as(cls_pred_blob); + auto mcls_pred_holder = mcls_pred->rmap(); + const float *cls_pred = mcls_pred_holder.as(); + this->decode_infer(cls_pred, dis_pred, head_info.stride, score_threshold, + results); + } + + std::vector dets; + for (int i = 0; i < (int)results.size(); i++) { + this->nms(results[i], nms_threshold); + + for (auto &box : results[i]) { + dets.push_back(box); + } + } + return dets; +} + +void PicoDet::decode_infer(const float *&cls_pred, const float *&dis_pred, + int stride, float threshold, + std::vector> &results) { + int feature_h = 
ceil((float)input_size_ / stride); + int feature_w = ceil((float)input_size_ / stride); + for (int idx = 0; idx < feature_h * feature_w; idx++) { + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + + for (int label = 0; label < num_class_; label++) { + if (cls_pred[idx * num_class_ + label] > score) { + score = cls_pred[idx * num_class_ + label]; + cur_label = label; + } + } + if (score > threshold) { + const float *bbox_pred = dis_pred + idx * (reg_max_ + 1) * 4; + results[cur_label].push_back( + this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride)); + } + } +} + +BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score, + int x, int y, int stride) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float *dis_after_sm = new float[reg_max_ + 1]; + activation_function_softmax(dfl_det + i * (reg_max_ + 1), dis_after_sm, + reg_max_ + 1); + for (int j = 0; j < reg_max_ + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size_); + float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size_); + return BoxInfo{xmin, ymin, xmax, ymax, score, label}; +} + +void PicoDet::nms(std::vector &input_boxes, float NMS_THRESH) { + std::sort(input_boxes.begin(), input_boxes.end(), + [](BoxInfo a, BoxInfo b) { return a.score > b.score; }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * + (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= NMS_THRESH) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} diff --git a/deploy/third_engine/demo_openvino/picodet_openvino.h b/deploy/third_engine/demo_openvino/picodet_openvino.h new file mode 100644 index 0000000000000000000000000000000000000000..2a5bced16a3c3d57096adbdfa263b634c74377db --- /dev/null +++ b/deploy/third_engine/demo_openvino/picodet_openvino.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino + +#ifndef _PICODET_OPENVINO_H_ +#define _PICODET_OPENVINO_H_ + +#include +#include +#include + +#define image_size 416 + +typedef struct HeadInfo { + std::string cls_layer; + std::string dis_layer; + int stride; +} HeadInfo; + +typedef struct BoxInfo { + float x1; + float y1; + float x2; + float y2; + float score; + int label; +} BoxInfo; + +class PicoDet { +public: + PicoDet(const char *param); + + ~PicoDet(); + + InferenceEngine::ExecutableNetwork network_; + InferenceEngine::InferRequest infer_request_; + // static bool hasGPU; + + std::vector heads_info_{ + // cls_pred|dis_pred|stride + {"transpose_0.tmp_0", "transpose_1.tmp_0", 8}, + {"transpose_2.tmp_0", "transpose_3.tmp_0", 16}, + {"transpose_4.tmp_0", "transpose_5.tmp_0", 32}, + {"transpose_6.tmp_0", "transpose_7.tmp_0", 64}, + }; + + std::vector detect(cv::Mat image, float score_threshold, + float nms_threshold); + +private: + void preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob); + void decode_infer(const float *&cls_pred, const float *&dis_pred, int stride, + float threshold, + std::vector> &results); + BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x, + int y, int stride); + static void nms(std::vector &result, float nms_threshold); + std::string input_name_; + int input_size_ = image_size; + int num_class_ = 80; + int reg_max_ = 7; +}; + +#endif diff --git a/deploy/third_engine/demo_openvino/python/README.md b/deploy/third_engine/demo_openvino/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1862417db48882b02459dd3b2a425758473f09f2 --- /dev/null +++ b/deploy/third_engine/demo_openvino/python/README.md @@ -0,0 +1,75 @@ +# PicoDet OpenVINO Benchmark Demo + +本文件夹提供利用[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html)进行PicoDet测速的Benchmark Demo与带后处理的模型Inference Demo。 + +## 安装 OpenVINO Toolkit + +前往 [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html),下载对应版本并安装。 + +本demo安装的是 OpenVINO 2022.1.0,可直接运行如下指令安装: +```shell +pip install openvino==2022.1.0 +``` + +详细安装步骤,可参考[OpenVINO官网](https://docs.openvinotoolkit.org/latest/get_started_guides.html) + +## Benchmark测试 + +- 准备测试模型:根据[PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet)中【导出及转换模型】步骤,采用不包含后处理的方式导出模型(`-o export.benchmark=True` ),并生成待测试模型简化后的onnx模型(可在下文链接中直接下载)。同时在本目录下新建```out_onnxsim```文件夹,将导出的onnx模型放在该目录下。 + +- 准备测试所用图片:本demo默认利用PaddleDetection/demo/[000000014439.jpg](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/demo/000000014439.jpg) + +- 在本目录下直接运行: + +```shell +# Linux +python openvino_benchmark.py --img_path ../../../../demo/000000014439.jpg --onnx_path out_onnxsim/picodet_s_320_coco_lcnet.onnx --in_shape 320 +# Windows +python openvino_benchmark.py --img_path ..\..\..\..\demo\000000014439.jpg --onnx_path out_onnxsim\picodet_s_320_coco_lcnet.onnx --in_shape 320 +``` +- 注意:```--in_shape```为对应模型输入size,默认为320 + +## 真实图片测试(网络包含后处理,但不包含NMS) + +- 准备测试模型:根据[PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet)中【导出及转换模型】步骤,采用**包含后处理**但**不包含NMS**的方式导出模型(`-o export.benchmark=False export.nms=False` ),并生成待测试模型简化后的onnx模型(可在下文链接中直接下载)。同时在本目录下新建```out_onnxsim_infer```文件夹,将导出的onnx模型放在该目录下。 + +- 准备测试所用图片:默认利用../../demo_onnxruntime/imgs/[bus.jpg](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/third_engine/demo_onnxruntime/imgs/bus.jpg) 
+ +```shell +# Linux +python openvino_infer.py --img_path ../../demo_onnxruntime/imgs/bus.jpg --onnx_path out_onnxsim_infer/picodet_s_320_postproccesed_woNMS.onnx --in_shape 320 +# Windows +python openvino_infer.py --img_path ..\..\demo_onnxruntime\imgs\bus.jpg --onnx_path out_onnxsim_infer\picodet_s_320_postproccesed_woNMS.onnx --in_shape 320 +``` + +### 真实图片测试(网络不包含后处理) + +```shell +# Linux +python openvino_benchmark.py --benchmark 0 --img_path ../../../../demo/000000014439.jpg --onnx_path out_onnxsim/picodet_s_320_coco_lcnet.onnx --in_shape 320 +# Windows +python openvino_benchmark.py --benchmark 0 --img_path ..\..\..\..\demo\000000014439.jpg --onnx_path out_onnxsim\picodet_s_320_coco_lcnet.onnx --in_shape 320 +``` + +- 结果: +
+*(detection result image omitted; detect mode writes the rendered output to `res.jpg`)*
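+
+For reference, a minimal sketch of the raw OpenVINO call that `openvino_benchmark.py` / `openvino_infer.py` build on (a sketch only: the PicoDet preprocessing and post-processing from the scripts are omitted, and the model path / input size are examples):
+
+```python
+import numpy as np
+from openvino.runtime import Core  # OpenVINO 2022.1 Python API
+
+core = Core()
+model = core.read_model("out_onnxsim/picodet_s_320_coco_lcnet.onnx")
+compiled_model = core.compile_model(model, "CPU")
+
+# a normalized NCHW blob; openvino_benchmark.py builds this from a real image
+blob = np.random.rand(1, 3, 320, 320).astype(np.float32)
+
+outputs = compiled_model.infer_new_request({0: blob})
+for tensor in outputs.values():
+    print(tensor.shape)  # raw per-level score / box-distribution heads
+```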
+ +## Benchmark结果 + +- 测速结果如下: + +| 模型 | 输入尺寸 | ONNX | 预测时延[CPU](#latency)| +| :-------- | :--------: | :---------------------: | :----------------: | +| PicoDet-XS | 320*320 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_coco_lcnet.onnx) | 3.9ms | +| PicoDet-XS | 416*416 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_coco_lcnet.onnx) | 6.1ms | +| PicoDet-S | 320*320 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet.onnx) | 4.8ms | +| PicoDet-S | 416*416 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_coco_lcnet.onnx) | 6.6ms | +| PicoDet-M | 320*320 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_coco_lcnet.onnx) | 8.2ms | +| PicoDet-M | 416*416 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_coco_lcnet.onnx) | 12.7ms | +| PicoDet-L | 320*320 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_coco_lcnet.onnx) | 11.5ms | +| PicoDet-L | 416*416 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_coco_lcnet.onnx) | 20.7ms | +| PicoDet-L | 640*640 | [( w/ 后处理;w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_lcnet_postproccesed_woNMS.onnx) | [( w/o 后处理)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_coco_lcnet.onnx) | 62.5ms | + +- 测试环境: 英特尔酷睿i7 10750H CPU。 diff --git a/deploy/third_engine/demo_openvino/python/coco_label.txt b/deploy/third_engine/demo_openvino/python/coco_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/deploy/third_engine/demo_openvino/python/coco_label.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git 
a/deploy/third_engine/demo_openvino/python/openvino_benchmark.py b/deploy/third_engine/demo_openvino/python/openvino_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..f21a8d5d1ed83c159818d2b405d1b5c9e5daa927 --- /dev/null +++ b/deploy/third_engine/demo_openvino/python/openvino_benchmark.py @@ -0,0 +1,365 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import time +import argparse +from scipy.special import softmax +from openvino.runtime import Core + + +def image_preprocess(img_path, re_shape): + img = cv2.imread(img_path) + img = cv2.resize( + img, (re_shape, re_shape), interpolation=cv2.INTER_LANCZOS4) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, [2, 0, 1]) / 255 + img = np.expand_dims(img, 0) + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + return img.astype(np.float32) + + +def draw_box(img, results, class_label, scale_x, scale_y): + + label_list = list( + map(lambda x: x.strip(), open(class_label, 'r').readlines())) + + for i in range(len(results)): + print(label_list[int(results[i][0])], ':', results[i][1]) + bbox = results[i, 2:] + label_id = int(results[i, 0]) + score = results[i, 1] + if (score > 0.20): + xmin, ymin, xmax, ymax = [ + int(bbox[0] * scale_x), int(bbox[1] * scale_y), + int(bbox[2] * scale_x), int(bbox[3] * scale_y) + ] + cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3) + font = cv2.FONT_HERSHEY_SIMPLEX + label_text = label_list[label_id] + cv2.rectangle(img, (xmin, ymin), (xmax, ymin - 60), (0, 255, 0), -1) + cv2.putText(img, "#" + label_text, (xmin, ymin - 10), font, 1, + (255, 255, 255), 2, cv2.LINE_AA) + cv2.putText(img, + str(round(score, 3)), (xmin, ymin - 40), font, 0.8, + (255, 255, 255), 2, cv2.LINE_AA) + return img + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims( + current_box, axis=0), ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. 
+ boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + Returns: + area (N): return the area. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +class PicoDetPostProcess(object): + """ + Args: + input_shape (int): network input image size + ori_shape (int): ori image shape of before padding + scale_factor (float): scale factor of ori image + enable_mkldnn (bool): whether to open MKLDNN + """ + + def __init__(self, + input_shape, + ori_shape, + scale_factor, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100): + self.ori_shape = ori_shape + self.input_shape = input_shape + self.scale_factor = scale_factor + self.strides = strides + self.score_threshold = score_threshold + self.nms_threshold = nms_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + + def warp_boxes(self, boxes, ori_shape): + """Apply transform to boxes + """ + width, height = ori_shape[1], ori_shape[0] + n = len(boxes) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + # xy = xy @ M.T # transform + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate( + (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + return xy.astype(np.float32) + else: + return boxes + + def __call__(self, scores, raw_boxes): + batch_size = raw_boxes[0].shape[0] + reg_max = int(raw_boxes[0].shape[-1] / 4 - 1) + out_boxes_num = [] + out_boxes_list = [] + for batch_id in range(batch_size): + # generate centers + decode_boxes = [] + select_scores = [] + for stride, box_distribute, score in zip(self.strides, raw_boxes, + scores): + box_distribute = box_distribute[batch_id] + score = score[batch_id] + # centers + fm_h = self.input_shape[0] / stride + fm_w = self.input_shape[1] / stride + h_range = np.arange(fm_h) + w_range = np.arange(fm_w) + ww, hh = np.meshgrid(w_range, h_range) + ct_row = (hh.flatten() + 0.5) * stride + ct_col = (ww.flatten() + 0.5) * stride + center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1) + + # box distribution to distance + reg_range = np.arange(reg_max + 1) + box_distance = box_distribute.reshape((-1, reg_max + 1)) + box_distance = softmax(box_distance, axis=1) + box_distance = box_distance * np.expand_dims(reg_range, axis=0) + box_distance = np.sum(box_distance, axis=1).reshape((-1, 4)) + box_distance = box_distance * stride + + # top K candidate + topk_idx = np.argsort(score.max(axis=1))[::-1] + topk_idx = topk_idx[:self.nms_top_k] + center = center[topk_idx] + score = score[topk_idx] + box_distance = box_distance[topk_idx] + + # decode box + decode_box = center + [-1, -1, 1, 1] * 
box_distance + + select_scores.append(score) + decode_boxes.append(decode_box) + + # nms + bboxes = np.concatenate(decode_boxes, axis=0) + confidences = np.concatenate(select_scores, axis=0) + picked_box_probs = [] + picked_labels = [] + for class_index in range(0, confidences.shape[1]): + probs = confidences[:, class_index] + mask = probs > self.score_threshold + probs = probs[mask] + if probs.shape[0] == 0: + continue + subset_boxes = bboxes[mask, :] + box_probs = np.concatenate( + [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = hard_nms( + box_probs, + iou_threshold=self.nms_threshold, + top_k=self.keep_top_k, ) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.shape[0]) + + if len(picked_box_probs) == 0: + out_boxes_list.append(np.empty((0, 4))) + out_boxes_num.append(0) + + else: + picked_box_probs = np.concatenate(picked_box_probs) + + # resize output boxes + picked_box_probs[:, :4] = self.warp_boxes( + picked_box_probs[:, :4], self.ori_shape[batch_id]) + im_scale = np.concatenate([ + self.scale_factor[batch_id][::-1], + self.scale_factor[batch_id][::-1] + ]) + picked_box_probs[:, :4] /= im_scale + # clas score box + out_boxes_list.append( + np.concatenate( + [ + np.expand_dims( + np.array(picked_labels), + axis=-1), np.expand_dims( + picked_box_probs[:, 4], axis=-1), + picked_box_probs[:, :4] + ], + axis=1)) + out_boxes_num.append(len(picked_labels)) + + out_boxes_list = np.concatenate(out_boxes_list, axis=0) + out_boxes_num = np.asarray(out_boxes_num).astype(np.int32) + return out_boxes_list, out_boxes_num + + +def detect(img_file, compiled_model, re_shape, class_label): + output = compiled_model.infer_new_request({0: test_image}) + result_ie = list(output.values()) #[0] + + test_im_shape = np.array([[re_shape, re_shape]]).astype('float32') + test_scale_factor = np.array([[1, 1]]).astype('float32') + + np_score_list = [] + np_boxes_list = [] + + num_outs = int(len(result_ie) / 2) + for out_idx in range(num_outs): + np_score_list.append(result_ie[out_idx]) + np_boxes_list.append(result_ie[out_idx + num_outs]) + + postprocess = PicoDetPostProcess(test_image.shape[2:], test_im_shape, + test_scale_factor) + + np_boxes, np_boxes_num = postprocess(np_score_list, np_boxes_list) + + image = cv2.imread(img_file, 1) + scale_x = image.shape[1] / test_image.shape[3] + scale_y = image.shape[0] / test_image.shape[2] + res_image = draw_box(image, np_boxes, class_label, scale_x, scale_y) + + cv2.imwrite('res.jpg', res_image) + cv2.imshow("res", res_image) + cv2.waitKey() + + +def benchmark(test_image, compiled_model): + + # benchmark + loop_num = 100 + warm_up = 8 + timeall = 0 + time_min = float("inf") + time_max = float('-inf') + + for i in range(loop_num + warm_up): + time0 = time.time() + #perform the inference step + + output = compiled_model.infer_new_request({0: test_image}) + time1 = time.time() + timed = time1 - time0 + + if i >= warm_up: + timeall = timeall + timed + time_min = min(time_min, timed) + time_max = max(time_max, timed) + + time_avg = timeall / loop_num + + print('inference_time(ms): min={}, max={}, avg={}'.format( + round(time_min * 1000, 2), + round(time_max * 1000, 1), round(time_avg * 1000, 1))) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument( + '--benchmark', type=int, default=1, help="0:detect; 1:benchmark") + parser.add_argument( + '--img_path', + type=str, + default='../../../../demo/000000014439.jpg', + help="image path") + parser.add_argument( + '--onnx_path', + type=str, + 
default='out_onnxsim/picodet_s_320_processed.onnx', + help="onnx filepath") + parser.add_argument('--in_shape', type=int, default=320, help="input_size") + parser.add_argument( + '--class_label', + type=str, + default='coco_label.txt', + help="class label file") + args = parser.parse_args() + + ie = Core() + net = ie.read_model(args.onnx_path) + test_image = image_preprocess(args.img_path, args.in_shape) + compiled_model = ie.compile_model(net, 'CPU') + + if args.benchmark == 0: + detect(args.img_path, compiled_model, args.in_shape, args.class_label) + if args.benchmark == 1: + benchmark(test_image, compiled_model) diff --git a/deploy/third_engine/demo_openvino/python/openvino_infer.py b/deploy/third_engine/demo_openvino/python/openvino_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..0ad51022b1793e7b6430025a7c71cc0de7658c8c --- /dev/null +++ b/deploy/third_engine/demo_openvino/python/openvino_infer.py @@ -0,0 +1,267 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import argparse +from scipy.special import softmax +from openvino.runtime import Core + + +def image_preprocess(img_path, re_shape): + img = cv2.imread(img_path) + img = cv2.resize( + img, (re_shape, re_shape), interpolation=cv2.INTER_LANCZOS4) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, [2, 0, 1]) / 255 + img = np.expand_dims(img, 0) + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + return img.astype(np.float32) + + +def get_color_map_list(num_classes): + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + +def draw_box(srcimg, results, class_label): + label_list = list( + map(lambda x: x.strip(), open(class_label, 'r').readlines())) + for i in range(len(results)): + color_list = get_color_map_list(len(label_list)) + clsid2color = {} + classid, conf = int(results[i, 0]), results[i, 1] + xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int( + results[i, 4]), int(results[i, 5]) + + if classid not in clsid2color: + clsid2color[classid] = color_list[classid] + color = tuple(clsid2color[classid]) + + cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2) + print(label_list[classid] + ': ' + str(round(conf, 3))) + cv2.putText( + srcimg, + label_list[classid] + ':' + str(round(conf, 3)), (xmin, ymin - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, (0, 255, 0), + thickness=2) + return srcimg + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Args: + box_scores (N, 5): boxes in corner-form and 
probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims( + current_box, axis=0), ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + Returns: + area (N): return the area. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +class PicoDetNMS(object): + """ + Args: + input_shape (int): network input image size + scale_factor (float): scale factor of ori image + """ + + def __init__(self, + input_shape, + scale_x, + scale_y, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100): + self.input_shape = input_shape + self.scale_x = scale_x + self.scale_y = scale_y + self.strides = strides + self.score_threshold = score_threshold + self.nms_threshold = nms_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + + def __call__(self, decode_boxes, select_scores): + batch_size = 1 + out_boxes_list = [] + for batch_id in range(batch_size): + # nms + bboxes = np.concatenate(decode_boxes, axis=0) + confidences = np.concatenate(select_scores, axis=0) + picked_box_probs = [] + picked_labels = [] + for class_index in range(0, confidences.shape[1]): + probs = confidences[:, class_index] + mask = probs > self.score_threshold + probs = probs[mask] + if probs.shape[0] == 0: + continue + subset_boxes = bboxes[mask, :] + box_probs = np.concatenate( + [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = hard_nms( + box_probs, + iou_threshold=self.nms_threshold, + top_k=self.keep_top_k, ) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.shape[0]) + + if len(picked_box_probs) == 0: + out_boxes_list.append(np.empty((0, 4))) + + else: + picked_box_probs = np.concatenate(picked_box_probs) + + # resize output boxes + picked_box_probs[:, 0] *= self.scale_x + picked_box_probs[:, 2] *= self.scale_x + picked_box_probs[:, 1] *= self.scale_y + picked_box_probs[:, 3] *= self.scale_y + + # clas score box + out_boxes_list.append( + np.concatenate( + [ + np.expand_dims( + np.array(picked_labels), + axis=-1), np.expand_dims( 
+ picked_box_probs[:, 4], axis=-1), + picked_box_probs[:, :4] + ], + axis=1)) + + out_boxes_list = np.concatenate(out_boxes_list, axis=0) + return out_boxes_list + + +def detect(img_file, compiled_model, class_label): + output = compiled_model.infer_new_request({0: test_image}) + result_ie = list(output.values()) + + decode_boxes = [] + select_scores = [] + num_outs = int(len(result_ie) / 2) + for out_idx in range(num_outs): + decode_boxes.append(result_ie[out_idx]) + select_scores.append(result_ie[out_idx + num_outs]) + + image = cv2.imread(img_file, 1) + scale_x = image.shape[1] / test_image.shape[3] + scale_y = image.shape[0] / test_image.shape[2] + + nms = PicoDetNMS(test_image.shape[2:], scale_x, scale_y) + np_boxes = nms(decode_boxes, select_scores) + + res_image = draw_box(image, np_boxes, class_label) + + cv2.imwrite('res.jpg', res_image) + cv2.imshow("res", res_image) + cv2.waitKey() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument( + '--img_path', + type=str, + default='../../demo_onnxruntime/imgs/bus.jpg', + help="image path") + parser.add_argument( + '--onnx_path', + type=str, + default='out_onnxsim_infer/picodet_s_320_postproccesed_woNMS.onnx', + help="onnx filepath") + parser.add_argument('--in_shape', type=int, default=320, help="input_size") + parser.add_argument( + '--class_label', + type=str, + default='coco_label.txt', + help="class label file") + args = parser.parse_args() + + ie = Core() + net = ie.read_model(args.onnx_path) + test_image = image_preprocess(args.img_path, args.in_shape) + compiled_model = ie.compile_model(net, 'CPU') + + detect(args.img_path, compiled_model, args.class_label) diff --git a/deploy/third_engine/demo_openvino_kpts/CMakeLists.txt b/deploy/third_engine/demo_openvino_kpts/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4692f1cca12dcf544dcaa375b740e356135bac4a --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.4.1) +set(CMAKE_CXX_STANDARD 14) + +project(tinypose_demo) + +find_package(OpenCV REQUIRED) +find_package(InferenceEngine REQUIRED) +find_package(ngraph REQUIRED) + +include_directories( + ${OpenCV_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +) + +add_executable(tinypose_demo main.cpp picodet_openvino.cpp keypoint_detector.cpp keypoint_postprocess.cpp) + +target_link_libraries( + tinypose_demo + ${InferenceEngine_LIBRARIES} + ${NGRAPH_LIBRARIES} + ${OpenCV_LIBS} +) diff --git a/deploy/third_engine/demo_openvino_kpts/README.md b/deploy/third_engine/demo_openvino_kpts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d7d7ce0de80eabcfeffd580d920a25e1341f575b --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/README.md @@ -0,0 +1,227 @@ +# TinyPose OpenVINO Demo + +This fold provides TinyPose inference code using +[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html). Most of the implements in this fold are same as *demo_ncnn*. +**Recommand** +1. To use the xxx.tar.gz file to install instead of github method, [link](https://registrationcenter-download.intel.com/akdlm/irc_nas/18096/l_openvino_toolkit_p_2021.4.689.tgz). +2. 
You can also deploy OpenVINO with Docker; the command is:
+```
+docker pull openvino/ubuntu18_dev:2021.4.1
+```
+
+## Install OpenVINO Toolkit
+
+Go to [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html)
+
+Download a suitable version and install.
+
+Follow the official Get Started Guides: https://docs.openvinotoolkit.org/latest/get_started_guides.html
+
+## Set the Environment Variables
+
+### Windows:
+
+Run this command in cmd (every time before using OpenVINO):
+```cmd
+\openvino_2021\bin\setupvars.bat
+```
+
+Or set the system environment variables once and for all:
+
+Name |Value
+:--------------------:|:--------:
+INTEL_OPENVINO_DIR | \openvino_2021
+INTEL_CVSDK_DIR | %INTEL_OPENVINO_DIR%
+InferenceEngine_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share
+HDDL_INSTALL_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl
+ngraph_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake
+
+And add this to ```Path```
+```
+%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%HDDL_INSTALL_DIR%\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib
+```
+
+### Linux
+
+Run this command in a shell (every time before using OpenVINO):
+
+```shell
+source /opt/intel/openvino_2021/bin/setupvars.sh
+```
+
+Or edit .bashrc
+
+```shell
+vi ~/.bashrc
+```
+
+Add this line to the end of the file
+
+```shell
+source /opt/intel/openvino_2021/bin/setupvars.sh
+```
+
+## Convert model
+
+ **1. Convert to ONNX**
+
+ Create picodet_m_416_coco.onnx and tinypose256.onnx
+
+ example:
+
+ ```shell
+ modelName=picodet_m_416_coco
+ # export model
+ python tools/export_model.py \
+ -c configs/picodet/${modelName}.yml \
+ -o weights=${modelName}.pdparams \
+ --output_dir=inference_model
+ # convert to onnx
+ paddle2onnx --model_dir inference_model/${modelName} \
+ --model_filename model.pdmodel \
+ --params_filename model.pdiparams \
+ --opset_version 11 \
+ --save_file ${modelName}.onnx
+ # onnxsim
+ python -m onnxsim ${modelName}.onnx ${modelName}_sim.onnx
+ ```
+
+ **2. Convert to OpenVINO**
+
+ ``` shell
+ cd /openvino_2021/deployment_tools/model_optimizer
+ ```
+
+ Install the requirements for the conversion tool
+
+ ```shell
+ cd ./install_prerequisites
+ sudo install_prerequisites_onnx.sh
+
+ ```
+
+ Then convert the model. Note: mean_values and scale_values should be the same as the training settings in your YAML config file.
+ ```shell
+ mo_onnx.py --input_model --mean_values [103.53,116.28,123.675] --scale_values [57.375,57.12,58.395] --input_shape [1,3,256,192]
+ ```
+
+ **Note: Newer versions of the OpenVINO conversion tools may cause errors in the Resize op. If you have problems with this, please try version openvino_2021.4.689.**
+
+## Build
+
+### Windows
+
+```cmd
+\openvino_2021\bin\setupvars.bat
+mkdir -p build
+cd build
+cmake ..
+msbuild tinypose_demo.vcxproj /p:configuration=release /p:platform=x64
+```
+
+### Linux
+```shell
+source /opt/intel/openvino_2021/bin/setupvars.sh
+mkdir build
+cd build
+cmake ..
+make
+```
+
+
+## Run demo
+
+Download the PicoDet OpenVINO model: [PicoDet OpenVINO model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip).
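+
+The exact archive layout may differ between releases; as a rough sketch (assuming the zip unpacks directly to the `.xml`/`.bin` IR files), the model can be placed into the demo's `weight` folder as follows, and the TinyPose archive linked below is handled the same way:
+
+```shell
+# create the weight folder expected by the demo and unpack the PicoDet IR files into it
+mkdir -p weight
+wget https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip
+unzip picodet_m_416_openvino.zip -d weight/
+```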
+
+Download the TinyPose OpenVINO model: [TinyPose OpenVINO model download link](https://bj.bcebos.com/v1/paddledet/deploy/third_engine/demo_openvino_kpts.tar.gz). The original PaddlePaddle model is [Tinypose256](https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/tinypose_256x192.pdparams).
+
+Move the PicoDet and TinyPose OpenVINO model files to the demo's weight folder.
+
+Note:
+1. The model output node names may change with newer versions of paddle/paddle2onnx/onnxsim/openvino; please check your own model's output nodes if the code cannot find "conv2d_441.tmp_1"/"argmax_0.tmp_0".
+2. If you encounter the error "Cannot find blob with name: transpose_1.tmp_0", your PicoDet model is an old version. You can modify the code below to fix it.
+
+```
+#picodet_openvino.h line 50-54
+
+ std::vector heads_info_{
+ // cls_pred|dis_pred|stride
+ {"transpose_0.tmp_0", "transpose_1.tmp_0", 8},
+ {"transpose_2.tmp_0", "transpose_3.tmp_0", 16},
+ {"transpose_4.tmp_0", "transpose_5.tmp_0", 32},
+ {"transpose_6.tmp_0", "transpose_7.tmp_0", 64},
+ };
+
+ modify to:
+
+ std::vector heads_info_{
+ // cls_pred|dis_pred|stride
+ {"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
+ {"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
+ {"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
+ {"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
+ };
+```
+
+3. You can view your ONNX model with [Netron](https://netron.app/).
+
+### Edit file
+```
+step1:
+main.cpp
+#define image_size 416
+...
+ cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1));
+ std::vector center = {128, 96};
+ std::vector scale = {256, 192};
+...
+ auto detector = PicoDet("../weight/picodet_m_416.xml");
+ auto kpts_detector = new KeyPointDetector("../weight/tinypose256.xml", -1, 256, 192);
+...
+step2:
+picodet_openvino.h
+#define image_size 416
+```
+
+### Run
+
+Run command:
+``` shell
+./tinypose_demo [mode] [image_file]
+```
+| param | detail |
+| ---- | ---- |
+| --mode | input mode, 0: camera; 1: image; 2: video; 3: benchmark |
+| --image_file | input image path |
+
+#### Webcam
+
+```shell
+tinypose_demo 0 0
+```
+
+#### Inference images
+
+```shell
+tinypose_demo 1 IMAGE_FOLDER/*.jpg
+```
+
+#### Inference video
+
+```shell
+tinypose_demo 2 VIDEO_PATH
+```
+
+### Benchmark
+
+```shell
+tinypose_demo 3 0
+```
+
+Platform: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz x 24 cores
+Model: [Tinypose256_Openvino](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose_256_openvino.zip)
+
+| param | Min | Max | Avg |
+| ------------- | ----- | ----- | ----- |
+| infer time(s) | 0.018 | 0.062 | 0.028 |
+
diff --git a/deploy/third_engine/demo_openvino_kpts/keypoint_detector.cpp b/deploy/third_engine/demo_openvino_kpts/keypoint_detector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4200dd93b3375fa1d9c511aaacd4ebf4e0903189 --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/keypoint_detector.cpp @@ -0,0 +1,207 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include +#include "keypoint_detector.h" + +namespace PaddleDetection { + +// Visualiztion MaskDetector results +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold) { + const int edge[][2] = {{0, 1}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 5}, + {4, 6}, + {5, 7}, + {6, 8}, + {7, 9}, + {8, 10}, + {5, 11}, + {6, 12}, + {11, 13}, + {12, 14}, + {13, 15}, + {14, 16}, + {11, 12}}; + cv::Mat vis_img = img.clone(); + for (int batchid = 0; batchid < results.size(); batchid++) { + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[i * 3] > threshold) { + int x_coord = int(results[batchid].keypoints[i * 3 + 1]); + int y_coord = int(results[batchid].keypoints[i * 3 + 2]); + cv::circle(vis_img, + cv::Point2d(x_coord, y_coord), + 1, + cv::Scalar(0, 0, 255), + 2); + } + } + for (int i = 0; i < results[batchid].num_joints; i++) { + if (results[batchid].keypoints[edge[i][0] * 3] > threshold && + results[batchid].keypoints[edge[i][1] * 3] > threshold) { + int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]); + int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]); + int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]); + int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]); + cv::line(vis_img, + cv::Point2d(x_start, y_start), + cv::Point2d(x_end, y_end), + colormap[i], + 1); + } + } + } + return vis_img; +} + +void KeyPointDetector::Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center_bs, + std::vector>& scale_bs) { + std::vector preds(output_shape[1] * 3, 0); + for (int batchid = 0; batchid < output_shape[0]; batchid++) { + get_final_preds(output, + output_shape, + idxout, + idx_shape, + center_bs[batchid], + scale_bs[batchid], + preds, + batchid, + this->use_dark()); + KeyPointResult result_item; + result_item.num_joints = output_shape[1]; + result_item.keypoints.clear(); + for (int i = 0; i < output_shape[1]; i++) { + result_item.keypoints.emplace_back(preds[i * 3]); + result_item.keypoints.emplace_back(preds[i * 3 + 1]); + result_item.keypoints.emplace_back(preds[i * 3 + 2]); + } + result->push_back(result_item); + } +} + +void KeyPointDetector::Predict(const std::vector imgs, + std::vector>& center_bs, + std::vector>& scale_bs, + std::vector* result) { + int batch_size = imgs.size(); + auto insize = 3 * in_h * in_w; + + InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_); + // Preprocess image + InferenceEngine::MemoryBlob::Ptr mblob = + InferenceEngine::as(input_blob); + if (!mblob) { + THROW_IE_EXCEPTION + << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, " + << "but by fact we were not able to cast inputBlob to MemoryBlob"; + } + auto mblobHolder = mblob->wmap(); + float* blob_data = mblobHolder.as(); + + cv::Mat resized_im; + for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) { + cv::Mat im = imgs.at(bs_idx); + + cv::resize(im, resized_im, 
cv::Size(in_w, in_h)); + for (size_t c = 0; c < 3; c++) { + for (size_t h = 0; h < in_h; h++) { + for (size_t w = 0; w < in_w; w++) { + blob_data[c * in_w * in_h + h * in_w + w] = + (float)resized_im.at(h, w)[c]; + } + } + } + } + // Run predictor + auto inference_start = std::chrono::steady_clock::now(); + // do inference + infer_request_.Infer(); + + InferenceEngine::Blob::Ptr output_blob = + infer_request_.GetBlob("conv2d_441.tmp_1"); + auto output_shape = output_blob->getTensorDesc().getDims(); + InferenceEngine::MemoryBlob::Ptr moutput = + InferenceEngine::as(output_blob); + + if (moutput) { + // locked memory holder should be alive all time while access to its + // buffer happens + auto minputHolder = moutput->rmap(); + + auto data = minputHolder.as::value_type*>(); + + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + + output_data_.resize(output_size); + std::copy_n(data, output_size, output_data_.data()); + + } + + + InferenceEngine::Blob::Ptr output_blob2 = + infer_request_.GetBlob("argmax_0.tmp_0"); + auto idx_shape = output_blob2->getTensorDesc().getDims(); + InferenceEngine::MemoryBlob::Ptr moutput2 = + InferenceEngine::as(output_blob2); + + if (moutput2) { + // locked memory holder should be alive all time while access to its + // buffer happens + auto minputHolder = moutput2->rmap(); + // Original I64 precision was converted to I32 + auto data = minputHolder.as::value_type*>(); + + // Calculate output length + int output_size = 1; + for (int j = 0; j < idx_shape.size(); ++j) { + output_size *= idx_shape[j]; + } + + idx_data_.resize(output_size); + std::copy_n(data, output_size, idx_data_.data()); + } + + auto inference_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = inference_end - inference_start; + printf("keypoint inference time: %f s\n", elapsed.count()); + + // Postprocessing result + Postprocess(output_data_, + output_shape, + idx_data_, + idx_shape, + result, + center_bs, + scale_bs); +} + +} // namespace PaddleDetection diff --git a/deploy/third_engine/demo_openvino_kpts/keypoint_detector.h b/deploy/third_engine/demo_openvino_kpts/keypoint_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..e72e63dcc30bfacff21181b383ecbc23a580438d --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/keypoint_detector.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
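+//
+// KeyPointDetector wraps an OpenVINO ExecutableNetwork for the TinyPose model:
+// the constructor reads the model and loads it onto CPU, Predict() runs a batch
+// of cropped person images and gathers the heatmap and argmax outputs, and
+// Postprocess() converts them into KeyPointResult keypoints.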
+ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "keypoint_postprocess.h" + +namespace PaddleDetection { +// Object KeyPoint Result +struct KeyPointResult { + // Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf + std::vector keypoints; + int num_joints = -1; +}; + +// Visualiztion KeyPoint Result +cv::Mat VisualizeKptsResult(const cv::Mat& img, + const std::vector& results, + const std::vector& colormap, + float threshold = 0.2); + +class KeyPointDetector { + public: + explicit KeyPointDetector(const std::string& model_path, + int input_height = 256, + int input_width = 192, + float score_threshold = 0.3, + const int batch_size = 1, + bool use_dark = true) { + use_dark_ = use_dark; + + in_w = input_width; + in_h = input_height; + threshold_ = score_threshold; + + InferenceEngine::Core ie; + auto model = ie.ReadNetwork(model_path); + // prepare input settings + InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo()); + input_name_ = inputs_map.begin()->first; + InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second; + // prepare output settings + InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo()); + int idx = 0; + for (auto& output_info : outputs_map) { + if (idx == 0) { + output_info.second->setPrecision(InferenceEngine::Precision::FP32); + } else { + output_info.second->setPrecision(InferenceEngine::Precision::FP32); + } + idx++; + } + + // get network + network_ = ie.LoadNetwork(model, "CPU"); + infer_request_ = network_.CreateInferRequest(); + } + + // Load Paddle inference model + void LoadModel(std::string model_file, int num_theads); + + // Run predictor + void Predict(const std::vector imgs, + std::vector>& center, + std::vector>& scale, + std::vector* result = nullptr); + + bool use_dark() { return this->use_dark_; } + + inline float get_threshold() { return threshold_; }; + + int in_w = 128; + int in_h = 256; + + private: + // Postprocess result + void Postprocess(std::vector& output, + std::vector& output_shape, + std::vector& idxout, + std::vector& idx_shape, + std::vector* result, + std::vector>& center, + std::vector>& scale); + + std::vector output_data_; + std::vector idx_data_; + float threshold_; + bool use_dark_; + + InferenceEngine::ExecutableNetwork network_; + InferenceEngine::InferRequest infer_request_; + std::string input_name_; +}; + +} // namespace PaddleDetection diff --git a/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.cpp b/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.cpp new file mode 100644 index 0000000000000000000000000000000000000000..65430ab1f07c0690aad8a26d5d3abda52badd9c4 --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.cpp @@ -0,0 +1,273 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
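+//
+// Keypoint post-processing helpers: affine transforms between heatmap and
+// original-image coordinates, argmax decoding of heatmaps, the DARK
+// (Distribution-Aware coordinate Representation, Zhang et al., CVPR 2020)
+// refinement used by get_final_preds(), and CropImg(), which expands detection
+// boxes to a fixed 3:4 aspect ratio before cropping person patches.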
+ +#include "keypoint_postprocess.h" +#define PI 3.1415926535 +#define HALF_CIRCLE_DEGREE 180 + +cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) { + cv::Point2f direct{a.x - b.x, a.y - b.y}; + return cv::Point2f(a.x - direct.y, a.y + direct.x); +} + +std::vector get_dir(float src_point_x, + float src_point_y, + float rot_rad) { + float sn = sin(rot_rad); + float cs = cos(rot_rad); + std::vector src_result{0.0, 0.0}; + src_result[0] = src_point_x * cs - src_point_y * sn; + src_result[1] = src_point_x * sn + src_point_y * cs; + return src_result; +} + +void affine_tranform( + float pt_x, float pt_y, cv::Mat& trans, std::vector& preds, int p) { + double new1[3] = {pt_x, pt_y, 1.0}; + cv::Mat new_pt(3, 1, trans.type(), new1); + cv::Mat w = trans * new_pt; + preds[p * 3 + 1] = static_cast(w.at(0, 0)); + preds[p * 3 + 2] = static_cast(w.at(1, 0)); +} + +void get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + cv::Mat& trans, + int inv) { + float src_w = scale[0]; + float dst_w = static_cast(output_size[0]); + float dst_h = static_cast(output_size[1]); + float rot_rad = rot * PI / HALF_CIRCLE_DEGREE; + std::vector src_dir = get_dir(-0.5 * src_w, 0, rot_rad); + std::vector dst_dir{static_cast(-0.5) * dst_w, 0.0}; + cv::Point2f srcPoint2f[3], dstPoint2f[3]; + srcPoint2f[0] = cv::Point2f(center[0], center[1]); + srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]); + srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]); + + dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5); + dstPoint2f[1] = + cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]); + dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]); + if (inv == 0) { + trans = cv::getAffineTransform(srcPoint2f, dstPoint2f); + } else { + trans = cv::getAffineTransform(dstPoint2f, srcPoint2f); + } +} + +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine=false) { + if (affine) { + cv::Mat trans(2, 3, CV_64FC1); + get_affine_transform(center, scale, 0, output_size, trans, 1); + for (int p = 0; p < dim[1]; ++p) { + affine_tranform( + coords[p * 2], coords[p * 2 + 1], trans, target_coords, p); + } + } else { + float heat_w = static_cast(output_size[0]); + float heat_h = static_cast(output_size[1]); + float x_scale = scale[0] / heat_w; + float y_scale = scale[1] / heat_h; + float offset_x = center[0] - scale[0] / 2.; + float offset_y = center[1] - scale[1] / 2.; + for (int i = 0; i < dim[1]; i++) { + target_coords[i * 3 + 1] = x_scale * coords[i * 2] + offset_x; + target_coords[i * 3 + 2] = y_scale * coords[i * 2 + 1] + offset_y; + } + } +} + +// only for batchsize == 1 +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx) { + int num_joints = dim[1]; + int width = dim[3]; + std::vector idx; + idx.resize(num_joints * 2); + + for (int j = 0; j < dim[1]; j++) { + float* index = &( + heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]); + float* end = index + dim[2] * dim[3]; + float* max_dis = std::max_element(index, end); + auto max_id = std::distance(index, max_dis); + maxvals[j] = *max_dis; + if (*max_dis > 0) { + preds[j * 2] = static_cast(max_id % width); + preds[j * 2 + 1] = static_cast(max_id / width); + } + } +} + +void dark_parse(std::vector& heatmap, + std::vector& dim, + std::vector& coords, + int px, + 
int py, + int index, + int ch) { + /*DARK postpocessing, Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + 1) offset = - hassian.inv() * derivative + 2) dx = (heatmap[x+1] - heatmap[x-1])/2. + 3) dxx = (dx[x+1] - dx[x-1])/2. + 4) derivative = Mat([dx, dy]) + 5) hassian = Mat([[dxx, dxy], [dxy, dyy]]) + */ + std::vector::const_iterator first1 = heatmap.begin() + index; + std::vector::const_iterator last1 = + heatmap.begin() + index + dim[2] * dim[3]; + std::vector heatmap_ch(first1, last1); + cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]); + heatmap_mat.convertTo(heatmap_mat, CV_32FC1); + cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0); + heatmap_mat = heatmap_mat.reshape(1, 1); + heatmap_ch = std::vector(heatmap_mat.reshape(1, 1)); + + float epsilon = 1e-10; + // sample heatmap to get values in around target location + float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon)); + float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon)); + float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon)); + + float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon)); + float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon)); + float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon)); + float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon)); + float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon)); + float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon)); + float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon)); + float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon)); + float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon)); + float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon)); + + // compute dx/dy and dxx/dyy with sampled values + float dx = 0.5 * (xr - xl); + float dy = 0.5 * (yu - yd); + float dxx = 0.25 * (xr2 - 2 * xy + xl2); + float dxy = 0.25 * (xryu - xryd - xlyu + xlyd); + float dyy = 0.25 * (yu2 - 2 * xy + yd2); + + // finally get offset by derivative and hassian, which combined by dx/dy and + // dxx/dyy + if (dxx * dyy - dxy * dxy != 0) { + float M[2][2] = {dxx, dxy, dxy, dyy}; + float D[2] = {dx, dy}; + cv::Mat hassian(2, 2, CV_32F, M); + cv::Mat derivative(2, 1, CV_32F, D); + cv::Mat offset = -hassian.inv() * derivative; + coords[ch * 2] += offset.at(0, 0); + coords[ch * 2 + 1] += offset.at(1, 0); + } +} + +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK) { + std::vector coords; + coords.resize(dim[1] * 2); + int heatmap_height = dim[2]; + int heatmap_width = dim[3]; + + for (int j = 0; j < dim[1]; ++j) { + int index = (batchid * dim[1] + j) * dim[2] * dim[3]; + + int idx = int(idxout[batchid * dim[1] + j]); + preds[j * 3] = heatmap[index + idx]; + coords[j * 2] = idx % heatmap_width; + coords[j * 2 + 1] = idx / heatmap_width; + + int px = int(coords[j * 2] + 0.5); + int py = int(coords[j * 2 + 1] + 0.5); + + if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 && + py < heatmap_height - 2) { + dark_parse(heatmap, dim, coords, px, py, index, j); + } else { + if (px > 0 && px < heatmap_width - 1) { + float diff_x = heatmap[index + py * dim[3] + px + 1] - + heatmap[index + py * dim[3] + px - 1]; + coords[j * 2] += diff_x > 0 ? 
1 : -1 * 0.25; + } + if (py > 0 && py < heatmap_height - 1) { + float diff_y = heatmap[index + (py + 1) * dim[3] + px] - + heatmap[index + (py - 1) * dim[3] + px]; + coords[j * 2 + 1] += diff_y > 0 ? 1 : -1 * 0.25; + } + } + } + + std::vector img_size{heatmap_width, heatmap_height}; + transform_preds(coords, center, scale, img_size, dim, preds); +} + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio) { + int crop_x1 = std::max(0, area[0]); + int crop_y1 = std::max(0, area[1]); + int crop_x2 = std::min(img.cols - 1, area[2]); + int crop_y2 = std::min(img.rows - 1, area[3]); + + int center_x = (crop_x1 + crop_x2) / 2.; + int center_y = (crop_y1 + crop_y2) / 2.; + int half_h = (crop_y2 - crop_y1) / 2.; + int half_w = (crop_x2 - crop_x1) / 2.; + + if (half_h * 3 > half_w * 4) { + half_w = static_cast(half_h * 0.75); + } else { + half_h = static_cast(half_w * 4 / 3); + } + + crop_x1 = + std::max(0, center_x - static_cast(half_w * (1 + expandratio))); + crop_y1 = + std::max(0, center_y - static_cast(half_h * (1 + expandratio))); + crop_x2 = std::min(img.cols - 1, + static_cast(center_x + half_w * (1 + expandratio))); + crop_y2 = std::min(img.rows - 1, + static_cast(center_y + half_h * (1 + expandratio))); + crop_img = + img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); + + center.clear(); + center.emplace_back((crop_x1 + crop_x2) / 2); + center.emplace_back((crop_y1 + crop_y2) / 2); + scale.clear(); + scale.emplace_back((crop_x2 - crop_x1)); + scale.emplace_back((crop_y2 - crop_y1)); +} diff --git a/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.h b/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..b9bd743b772d226b4b02c4f411e8492fda220571 --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/keypoint_postprocess.h @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
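+//
+// Declarations for the keypoint post-processing utilities implemented in
+// keypoint_postprocess.cpp: coordinate transforms, heatmap decoding with
+// optional DARK refinement, and the person-crop helper CropImg().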
+ +#pragma once + +#include +#include +#include +#include + +std::vector get_3rd_point(std::vector& a, std::vector& b); +std::vector get_dir(float src_point_x, float src_point_y, float rot_rad); +void affine_tranform(float pt_x, + float pt_y, + cv::Mat& trans, + std::vector& x, + int p, + int num); +cv::Mat get_affine_transform(std::vector& center, + std::vector& scale, + float rot, + std::vector& output_size, + int inv); +void transform_preds(std::vector& coords, + std::vector& center, + std::vector& scale, + std::vector& output_size, + std::vector& dim, + std::vector& target_coords, + bool affine); +void box_to_center_scale(std::vector& box, + int width, + int height, + std::vector& center, + std::vector& scale); +void get_max_preds(std::vector& heatmap, + std::vector& dim, + std::vector& preds, + std::vector& maxvals, + int batchid, + int joint_idx); +void get_final_preds(std::vector& heatmap, + std::vector& dim, + std::vector& idxout, + std::vector& idxdim, + std::vector& center, + std::vector scale, + std::vector& preds, + int batchid, + bool DARK = true); + +void CropImg(cv::Mat& img, + cv::Mat& crop_img, + std::vector& area, + std::vector& center, + std::vector& scale, + float expandratio = 0.25); diff --git a/deploy/third_engine/demo_openvino_kpts/main.cpp b/deploy/third_engine/demo_openvino_kpts/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cc580e41db18fbd1a5f61302f1b633eb65254f8a --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/main.cpp @@ -0,0 +1,415 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
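+//
+// Demo entry point: PicoDet detects persons on a letterboxed (resize_uniform)
+// input, each person box is cropped and passed to KeyPointDetector (TinyPose)
+// for keypoint estimation; modes 0-3 select webcam, image, video or benchmark.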
+// reference from https://github.com/RangiLyu/nanodet + +#include +#include +#include +#include +#define image_size 416 + +#include "keypoint_detector.h" +#include "picodet_openvino.h" + +using namespace PaddleDetection; + +struct object_rect { + int x; + int y; + int width; + int height; +}; + +int resize_uniform(cv::Mat& src, + cv::Mat& dst, + cv::Size dst_size, + object_rect& effect_area) { + int w = src.cols; + int h = src.rows; + int dst_w = dst_size.width; + int dst_h = dst_size.height; + dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0)); + + float ratio_src = w * 1.0 / h; + float ratio_dst = dst_w * 1.0 / dst_h; + + int tmp_w = 0; + int tmp_h = 0; + if (ratio_src > ratio_dst) { + tmp_w = dst_w; + tmp_h = floor((dst_w * 1.0 / w) * h); + } else if (ratio_src < ratio_dst) { + tmp_h = dst_h; + tmp_w = floor((dst_h * 1.0 / h) * w); + } else { + cv::resize(src, dst, dst_size); + effect_area.x = 0; + effect_area.y = 0; + effect_area.width = dst_w; + effect_area.height = dst_h; + return 0; + } + cv::Mat tmp; + cv::resize(src, tmp, cv::Size(tmp_w, tmp_h)); + + if (tmp_w != dst_w) { + int index_w = floor((dst_w - tmp_w) / 2.0); + for (int i = 0; i < dst_h; i++) { + memcpy(dst.data + i * dst_w * 3 + index_w * 3, + tmp.data + i * tmp_w * 3, + tmp_w * 3); + } + effect_area.x = index_w; + effect_area.y = 0; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else if (tmp_h != dst_h) { + int index_h = floor((dst_h - tmp_h) / 2.0); + memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3); + effect_area.x = 0; + effect_area.y = index_h; + effect_area.width = tmp_w; + effect_area.height = tmp_h; + } else { + printf("error\n"); + } + return 0; +} + +const int color_list[80][3] = { + {216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47}, + {76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153}, + {255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0}, + {0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0}, + {84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0}, + {255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127}, + {0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127}, + {84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127}, + {170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127}, + {255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255}, + {0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255}, + {84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255}, + {170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255}, + {42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0}, + {212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0}, + {0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, + {0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170}, + {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36}, + {72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182}, + {218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0}, +}; + +void draw_bboxes(const cv::Mat& bgr, + const std::vector& bboxes, + object_rect effect_roi) { + static const char* class_names[] = { + "person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + 
"bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + + cv::Mat image = bgr.clone(); + int src_w = image.cols; + int src_h = image.rows; + int dst_w = effect_roi.width; + int dst_h = effect_roi.height; + float width_ratio = (float)src_w / (float)dst_w; + float height_ratio = (float)src_h / (float)dst_h; + + for (size_t i = 0; i < bboxes.size(); i++) { + const BoxInfo& bbox = bboxes[i]; + cv::Scalar color = cv::Scalar(color_list[bbox.label][0], + color_list[bbox.label][1], + color_list[bbox.label][2]); + cv::rectangle(image, + cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, + (bbox.y1 - effect_roi.y) * height_ratio), + cv::Point((bbox.x2 - effect_roi.x) * width_ratio, + (bbox.y2 - effect_roi.y) * height_ratio)), + color); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100); + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); + int x = (bbox.x1 - effect_roi.x) * width_ratio; + int y = + (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine; + if (y < 0) y = 0; + if (x + label_size.width > image.cols) x = image.cols - label_size.width; + + cv::rectangle( + image, + cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, label_size.height + baseLine)), + color, + -1); + + cv::putText(image, + text, + cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, + 0.4, + cv::Scalar(255, 255, 255)); + } + + cv::imwrite("../predict.jpg", image); +} + +std::vector coordsback(const cv::Mat image, + const object_rect effect_roi, + const std::vector& bboxes) { + int src_w = image.cols; + int src_h = image.rows; + int dst_w = effect_roi.width; + int dst_h = effect_roi.height; + float width_ratio = (float)src_w / (float)dst_w; + float height_ratio = (float)src_h / (float)dst_h; + + std::vector bboxes_oimg; + + for (int i = 0; i < bboxes.size(); i++) { + auto bbox = bboxes[i]; + bbox.x1 = (bbox.x1 - effect_roi.x) * width_ratio; + bbox.y1 = (bbox.y1 - effect_roi.y) * height_ratio; + bbox.x2 = (bbox.x2 - effect_roi.x) * width_ratio; + bbox.y2 = (bbox.y2 - effect_roi.y) * height_ratio; + bboxes_oimg.emplace_back(bbox); + } + return bboxes_oimg; +} + +void image_infer_kpts(KeyPointDetector* kpts_detector, + cv::Mat image, + const object_rect effect_roi, + const std::vector& results, + std::string img_name = "kpts_vis", + bool save_img = true) { + std::vector cropimgs; + std::vector> center_bs; + std::vector> scale_bs; + std::vector kpts_results; + auto results_oimg = coordsback(image, effect_roi, results); + + for (int i = 0; i < results_oimg.size(); i++) { + auto rect = results_oimg[i]; + if (rect.label == 0) { + cv::Mat cropimg; + std::vector center, scale; + std::vector area = {static_cast(rect.x1), + static_cast(rect.y1), + static_cast(rect.x2), + static_cast(rect.y2)}; + CropImg(image, cropimg, area, center, scale); + cropimgs.emplace_back(cropimg); + center_bs.emplace_back(center); + scale_bs.emplace_back(scale); + } + if (cropimgs.size() == 1 || + (cropimgs.size() > 0 && i == results_oimg.size() - 1)) { + kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results); + 
cropimgs.clear(); + center_bs.clear(); + scale_bs.clear(); + } + } + std::vector compression_params; + compression_params.push_back(cv::IMWRITE_JPEG_QUALITY); + compression_params.push_back(95); + std::string kpts_savepath = + "keypoint_" + img_name.substr(img_name.find_last_of('/') + 1); + cv::Mat kpts_vis_img = + VisualizeKptsResult(image, kpts_results, {0, 255, 0}, 0.1); + if (save_img) { + cv::imwrite(kpts_savepath, kpts_vis_img, compression_params); + printf("Visualized output saved as %s\n", kpts_savepath.c_str()); + } else { + cv::imshow("image", kpts_vis_img); + } +} + +int image_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + const char* imagepath) { + std::vector filenames; + cv::glob(imagepath, filenames, false); + + for (auto img_name : filenames) { + cv::Mat image = cv::imread(img_name); + if (image.empty()) { + return -1; + } + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform( + image, resized_img, cv::Size(image_size, image_size), effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, img_name); + } + } + return 0; +} + +int webcam_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + int cam_id) { + cv::Mat image; + cv::VideoCapture cap(cam_id); + + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform( + image, resized_img, cv::Size(image_size, image_size), effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, "", false); + } + } + return 0; +} + +int video_demo(PicoDet& detector, + KeyPointDetector* kpts_detector, + const char* path) { + cv::Mat image; + cv::VideoCapture cap(path); + + while (true) { + cap >> image; + object_rect effect_roi; + cv::Mat resized_img; + resize_uniform( + image, resized_img, cv::Size(image_size, image_size), effect_roi); + auto results = detector.detect(resized_img, 0.4, 0.5); + if (kpts_detector) { + image_infer_kpts(kpts_detector, image, effect_roi, results, "", false); + } + } + return 0; +} + +int benchmark(KeyPointDetector* kpts_detector) { + int loop_num = 100; + int warm_up = 8; + + double time_min = DBL_MAX; + double time_max = -DBL_MAX; + double time_avg = 0; + cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1)); + std::vector center = {128, 96}; + std::vector scale = {256, 192}; + std::vector cropimgs = {image}; + std::vector> center_bs = {center}; + std::vector> scale_bs = {scale}; + std::vector kpts_results; + + for (int i = 0; i < warm_up + loop_num; i++) { + auto start = std::chrono::steady_clock::now(); + std::vector results; + kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results); + auto end = std::chrono::steady_clock::now(); + + std::chrono::duration elapsed = end - start; + double time = elapsed.count(); + if (i >= warm_up) { + time_min = (std::min)(time_min, time); + time_max = (std::max)(time_max, time); + time_avg += time; + } + } + time_avg /= loop_num; + fprintf(stderr, + "%20s min = %7.4f max = %7.4f avg = %7.4f\n", + "tinypose", + time_min, + time_max, + time_avg); + return 0; +} + +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stderr, + "usage: %s [mode] [path]. 
\n For webcam mode=0, path is cam id; \n " + "For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; " + "\n For benchmark, mode=3 path=0.\n", + argv[0]); + return -1; + } + std::cout << "start init model" << std::endl; + auto detector = PicoDet("./weight/picodet_m_416.xml"); + auto kpts_detector = + new KeyPointDetector("./weight/tinypose256_git2-sim.xml", 256, 192); + std::cout << "success" << std::endl; + + int mode = atoi(argv[1]); + switch (mode) { + case 0: { + int cam_id = atoi(argv[2]); + webcam_demo(detector, kpts_detector, cam_id); + break; + } + case 1: { + const char* images = argv[2]; + image_demo(detector, kpts_detector, images); + break; + } + case 2: { + const char* path = argv[2]; + video_demo(detector, kpts_detector, path); + break; + } + case 3: { + benchmark(kpts_detector); + break; + } + default: { + fprintf(stderr, + "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; " + "\n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, " + "mode=2; \n For benchmark, mode=3 path=0.\n", + argv[0]); + break; + } + } + delete kpts_detector; + kpts_detector = nullptr; +} diff --git a/deploy/third_engine/demo_openvino_kpts/picodet_openvino.cpp b/deploy/third_engine/demo_openvino_kpts/picodet_openvino.cpp new file mode 100644 index 0000000000000000000000000000000000000000..14ddab3baf1bf059e30d82c415d0c9e5da0034fc --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/picodet_openvino.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino + +#include "picodet_openvino.h" + +inline float fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); } + +template +int activation_function_softmax(const _Tp* src, _Tp* dst, int length) { + const _Tp alpha = *std::max_element(src, src + length); + _Tp denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + + return 0; +} + +PicoDet::PicoDet(const char* model_path) { + InferenceEngine::Core ie; + InferenceEngine::CNNNetwork model = ie.ReadNetwork(model_path); + // prepare input settings + InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo()); + input_name_ = inputs_map.begin()->first; + InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second; + // prepare output settings + InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo()); + for (auto& output_info : outputs_map) { + output_info.second->setPrecision(InferenceEngine::Precision::FP32); + } + + // get network + network_ = ie.LoadNetwork(model, "CPU"); + infer_request_ = network_.CreateInferRequest(); +} + +PicoDet::~PicoDet() {} + +void PicoDet::preprocess(cv::Mat& image, InferenceEngine::Blob::Ptr& blob) { + int img_w = image.cols; + int img_h = image.rows; + int channels = 3; + + InferenceEngine::MemoryBlob::Ptr mblob = + InferenceEngine::as(blob); + if (!mblob) { + THROW_IE_EXCEPTION + << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, " + << "but by fact we were not able to cast inputBlob to MemoryBlob"; + } + auto mblobHolder = mblob->wmap(); + float* blob_data = mblobHolder.as(); + + for (size_t c = 0; c < channels; c++) { + for (size_t h = 0; h < img_h; h++) { + for (size_t w = 0; w < img_w; w++) { + blob_data[c * img_w * img_h + h * img_w + w] = + (float)image.at(h, w)[c]; + } + } + } +} + +std::vector PicoDet::detect(cv::Mat image, + float score_threshold, + float nms_threshold) { + InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_); + preprocess(image, input_blob); + + // do inference + infer_request_.Infer(); + + // get output + std::vector> results; + results.resize(this->num_class_); + + for (const auto& head_info : this->heads_info_) { + const InferenceEngine::Blob::Ptr dis_pred_blob = + infer_request_.GetBlob(head_info.dis_layer); + const InferenceEngine::Blob::Ptr cls_pred_blob = + infer_request_.GetBlob(head_info.cls_layer); + + auto mdis_pred = + InferenceEngine::as(dis_pred_blob); + auto mdis_pred_holder = mdis_pred->rmap(); + const float* dis_pred = mdis_pred_holder.as(); + + auto mcls_pred = + InferenceEngine::as(cls_pred_blob); + auto mcls_pred_holder = mcls_pred->rmap(); + const float* cls_pred = mcls_pred_holder.as(); + this->decode_infer( + cls_pred, dis_pred, head_info.stride, score_threshold, results); + } + + std::vector dets; + for (int i = 0; i < (int)results.size(); i++) { + this->nms(results[i], nms_threshold); + + for (auto& box : results[i]) { + dets.push_back(box); + } + } + return dets; +} + +void PicoDet::decode_infer(const float*& cls_pred, + const float*& dis_pred, + int stride, + float threshold, + std::vector>& results) { + int feature_h = input_size_ / stride; + int feature_w = input_size_ / stride; + for (int idx = 0; idx < feature_h * feature_w; idx++) 
{ + int row = idx / feature_w; + int col = idx % feature_w; + float score = 0; + int cur_label = 0; + + for (int label = 0; label < num_class_; label++) { + if (cls_pred[idx * num_class_ + label] > score) { + score = cls_pred[idx * num_class_ + label]; + cur_label = label; + } + } + if (score > threshold) { + const float* bbox_pred = dis_pred + idx * (reg_max_ + 1) * 4; + results[cur_label].push_back( + this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride)); + } + } +} + +BoxInfo PicoDet::disPred2Bbox( + const float*& dfl_det, int label, float score, int x, int y, int stride) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + float* dis_after_sm = new float[reg_max_ + 1]; + activation_function_softmax( + dfl_det + i * (reg_max_ + 1), dis_after_sm, reg_max_ + 1); + for (int j = 0; j < reg_max_ + 1; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + delete[] dis_after_sm; + } + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size_); + float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size_); + return BoxInfo{xmin, ymin, xmax, ymax, score, label}; +} + +void PicoDet::nms(std::vector& input_boxes, float NMS_THRESH) { + std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { + return a.score > b.score; + }); + std::vector vArea(input_boxes.size()); + for (int i = 0; i < int(input_boxes.size()); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * + (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < int(input_boxes.size()); ++i) { + for (int j = i + 1; j < int(input_boxes.size());) { + float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2); + float w = (std::max)(float(0), xx2 - xx1 + 1); + float h = (std::max)(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= NMS_THRESH) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} diff --git a/deploy/third_engine/demo_openvino_kpts/picodet_openvino.h b/deploy/third_engine/demo_openvino_kpts/picodet_openvino.h new file mode 100644 index 0000000000000000000000000000000000000000..7bd3d79c44a2f6ae62eaba82bcafcae45a84254f --- /dev/null +++ b/deploy/third_engine/demo_openvino_kpts/picodet_openvino.h @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
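`disPred2Bbox` above decodes the distribution-focal-loss head: for each of the four box sides it takes a softmax over `reg_max + 1` bins, computes the expected bin index, scales it back to pixels by the stride, and clamps the resulting box to the input size around the anchor-point center `((x + 0.5) * stride, (y + 0.5) * stride)`. A hedged numpy sketch of the same decoding, with illustrative function names:

```python
import numpy as np

def softmax(v: np.ndarray) -> np.ndarray:
    e = np.exp(v - v.max())
    return e / e.sum()

def dfl_to_bbox(dfl_logits, x, y, stride, reg_max=7, input_size=416):
    """Decode one anchor point's 4 * (reg_max + 1) DFL logits into an xyxy box,
    mirroring disPred2Bbox: one distribution per side (left, top, right, bottom)."""
    dfl_logits = np.asarray(dfl_logits, dtype=np.float32).reshape(4, reg_max + 1)
    bins = np.arange(reg_max + 1, dtype=np.float32)
    # Expected bin index per side, converted back to pixel distances.
    dists = np.array([(softmax(row) * bins).sum() for row in dfl_logits]) * stride
    ct_x, ct_y = (x + 0.5) * stride, (y + 0.5) * stride
    xmin = max(ct_x - dists[0], 0.0)
    ymin = max(ct_y - dists[1], 0.0)
    xmax = min(ct_x + dists[2], float(input_size))
    ymax = min(ct_y + dists[3], float(input_size))
    return xmin, ymin, xmax, ymax
```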
+// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino + +#ifndef _PICODET_OPENVINO_H_ +#define _PICODET_OPENVINO_H_ + +#include +#include +#include + +#define image_size 416 + +typedef struct HeadInfo { + std::string cls_layer; + std::string dis_layer; + int stride; +} HeadInfo; + +typedef struct BoxInfo { + float x1; + float y1; + float x2; + float y2; + float score; + int label; +} BoxInfo; + +class PicoDet { +public: + PicoDet(const char *param); + + ~PicoDet(); + + InferenceEngine::ExecutableNetwork network_; + InferenceEngine::InferRequest infer_request_; + + std::vector heads_info_{ + // cls_pred|dis_pred|stride + {"transpose_0.tmp_0", "transpose_1.tmp_0", 8}, + {"transpose_2.tmp_0", "transpose_3.tmp_0", 16}, + {"transpose_4.tmp_0", "transpose_5.tmp_0", 32}, + {"transpose_6.tmp_0", "transpose_7.tmp_0", 64}, + }; + + std::vector detect(cv::Mat image, float score_threshold, + float nms_threshold); + +private: + void preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob); + void decode_infer(const float *&cls_pred, const float *&dis_pred, int stride, + float threshold, + std::vector> &results); + BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x, + int y, int stride); + static void nms(std::vector &result, float nms_threshold); + std::string input_name_; + int input_size_ = image_size; + int num_class_ = 80; + int reg_max_ = 7; +}; + +#endif diff --git a/deploy/third_engine/onnx/infer.py b/deploy/third_engine/onnx/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..1d34cdd2f18b4c3857b810cb2f0d0cf26464cefc --- /dev/null +++ b/deploy/third_engine/onnx/infer.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
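The `PredictConfig` class defined below reads a fixed set of keys from the exported `infer_cfg.yml`. A hypothetical minimal config, written out via PyYAML so the key names line up with what the code expects; the arch, labels, sizes, and normalization values are placeholders for illustration, and real exported configs carry additional fields:

```python
import yaml

minimal_infer_cfg = {
    "arch": "PPYOLOE",            # must contain one of the SUPPORT_MODELS names
    "min_subgraph_size": 3,
    "use_dynamic_shape": False,
    "draw_threshold": 0.5,        # optional; PredictConfig defaults it to 0.5
    "label_list": ["person", "bicycle", "car"],
    "Preprocess": [               # op types and arguments match preprocess.py
        {"type": "Resize", "target_size": [640, 640], "keep_ratio": False},
        {"type": "NormalizeImage", "mean": [0.485, 0.456, 0.406],
         "std": [0.229, 0.224, 0.225], "is_scale": True},
        {"type": "Permute"},
    ],
}

with open("infer_cfg.yml", "w") as f:
    yaml.safe_dump(minimal_infer_cfg, f, sort_keys=False)
```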
+ +import os +import yaml +import argparse +import numpy as np +import glob +from onnxruntime import InferenceSession + +from preprocess import Compose + +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'RTMDet' +} + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml") +parser.add_argument( + '--onnx_file', type=str, default="model.onnx", help="onnx model file path") +parser.add_argument("--image_dir", type=str) +parser.add_argument("--image_file", type=str) + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +class PredictConfig(object): + """set config of preprocess, postprocess and visualize + Args: + infer_config (str): path of infer_cfg.yml + """ + + def __init__(self, infer_config): + # parsing Yaml config for Preprocess + with open(infer_config) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.label_list = yml_conf['label_list'] + self.use_dynamic_shape = yml_conf['use_dynamic_shape'] + self.draw_threshold = yml_conf.get("draw_threshold", 0.5) + self.mask = yml_conf.get("mask", False) + self.tracker = yml_conf.get("tracker", None) + self.nms = yml_conf.get("NMS", None) + self.fpn_stride = yml_conf.get("fpn_stride", None) + if self.arch == 'RCNN' and yml_conf.get('export_onnx', False): + print( + 'The RCNN export model is used for ONNX and it only supports batch_size = 1' + ) + self.print_config() + + def check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +def predict_image(infer_config, predictor, img_list): + # load preprocess transforms + transforms = Compose(infer_config.preprocess_infos) + # predict image + for img_path in img_list: + inputs = transforms(img_path) + inputs_name = [var.name for var in predictor.get_inputs()] + inputs = {k: inputs[k][None, ] for k in inputs_name} + + outputs = 
predictor.run(output_names=None, input_feed=inputs) + + print("ONNXRuntime predict: ") + if infer_config.arch in ["HRNet"]: + print(np.array(outputs[0])) + else: + bboxes = np.array(outputs[0]) + for bbox in bboxes: + if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold: + print(f"{int(bbox[0])} {bbox[1]} " + f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}") + + +if __name__ == '__main__': + FLAGS = parser.parse_args() + # load image list + img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) + # load predictor + predictor = InferenceSession(FLAGS.onnx_file) + # load infer config + infer_config = PredictConfig(FLAGS.infer_cfg) + + predict_image(infer_config, predictor, img_list) diff --git a/deploy/third_engine/onnx/preprocess.py b/deploy/third_engine/onnx/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..3554b7f81250bdffbbd78d1ec4905bcf722943e6 --- /dev/null +++ b/deploy/third_engine/onnx/preprocess.py @@ -0,0 +1,494 @@ +import numpy as np +import cv2 +import copy + + +def decode_image(img_path): + with open(img_path, 'rb') as f: + im_read = f.read() + data = np.frombuffer(im_read, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + img_info = { + "im_shape": np.array( + im.shape[:2], dtype=np.float32), + "scale_factor": np.array( + [1., 1.], dtype=np.float32) + } + return im, img_info + + +class Resize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + +class NormalizeImage(object): + """normalize image + Args: + mean (list): im - mean + std (list): im / std + is_scale (bool): whether need im / 255 + norm_type (str): type in 
['mean_std', 'none'] + """ + + def __init__(self, mean, std, is_scale=True, norm_type='mean_std'): + self.mean = mean + self.std = std + self.is_scale = is_scale + self.norm_type = norm_type + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std + return im, im_info + + +class Permute(object): + """permute image + Args: + to_bgr (bool): whether convert RGB to BGR + channel_first (bool): whether convert HWC to CHW + """ + + def __init__(self, ): + super(Permute, self).__init__() + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.transpose((2, 0, 1)).copy() + return im, im_info + + +class PadStride(object): + """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im, im_info + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + + +class LetterBoxResize(object): + def __init__(self, target_size): + """ + Resize image to target size, convert normalized xywh to pixel xyxy + format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). + Args: + target_size (int|list): image target size. 
+ """ + super(LetterBoxResize, self).__init__() + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + + def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): + # letterbox: resize a rectangular image to a padded rectangular + shape = img.shape[:2] # [height, width] + ratio_h = float(height) / shape[0] + ratio_w = float(width) / shape[1] + ratio = min(ratio_h, ratio_w) + new_shape = (round(shape[1] * ratio), + round(shape[0] * ratio)) # [width, height] + padw = (width - new_shape[0]) / 2 + padh = (height - new_shape[1]) / 2 + top, bottom = round(padh - 0.1), round(padh + 0.1) + left, right = round(padw - 0.1), round(padw + 0.1) + + img = cv2.resize( + img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=color) # padded rectangular + return img, ratio, padw, padh + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + height, width = self.target_size + h, w = im.shape[:2] + im, ratio, padw, padh = self.letterbox(im, height=height, width=width) + + new_shape = [round(h * ratio), round(w * ratio)] + im_info['im_shape'] = np.array(new_shape, dtype=np.float32) + im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32) + return im, im_info + + +class Pad(object): + def __init__(self, size, fill_value=[114.0, 114.0, 114.0]): + """ + Pad image to a specified size. + Args: + size (list[int]): image target size + fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0) + """ + super(Pad, self).__init__() + if isinstance(size, int): + size = [size, size] + self.size = size + self.fill_value = fill_value + + def __call__(self, im, im_info): + im_h, im_w = im.shape[:2] + h, w = self.size + if h == im_h and w == im_w: + im = im.astype(np.float32) + return im, im_info + + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = im.astype(np.float32) + im = canvas + return im, im_info + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def get_affine_transform(center, + input_size, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. 
+ + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + if not isinstance(input_size, (np.ndarray, list)): + input_size = np.array([input_size, input_size], dtype=np.float32) + scale_tmp = input_size + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +class WarpAffine(object): + """Warp affine the image + """ + + def __init__(self, + keep_res=False, + pad=31, + input_h=512, + input_w=512, + scale=0.4, + shift=0.1): + self.keep_res = keep_res + self.pad = pad + self.input_h = input_h + self.input_w = input_w + self.scale = scale + self.shift = shift + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + + h, w = img.shape[:2] + + if self.keep_res: + input_h = (h | self.pad) + 1 + input_w = (w | self.pad) + 1 + s = np.array([input_w, input_h], dtype=np.float32) + c = np.array([w // 2, h // 2], dtype=np.float32) + + else: + s = max(h, w) * 1.0 + input_h, input_w = self.input_h, self.input_w + c = np.array([w / 2., h / 2.], dtype=np.float32) + + trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) + img = cv2.resize(img, (w, h)) + inp = cv2.warpAffine( + img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) + return inp, im_info + + +# keypoint preprocess +def get_warp_matrix(theta, size_input, size_dst, size_target): + """This code is based on + https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py + + Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + matrix (np.ndarray): A matrix for transformation. 
+ """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = np.cos(theta) * scale_x + matrix[0, 1] = -np.sin(theta) * scale_x + matrix[0, 2] = scale_x * ( + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * + np.sin(theta) + 0.5 * size_target[0]) + matrix[1, 0] = np.sin(theta) * scale_y + matrix[1, 1] = np.cos(theta) * scale_y + matrix[1, 2] = scale_y * ( + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * + np.cos(theta) + 0.5 * size_target[1]) + return matrix + + +class TopDownEvalAffine(object): + """apply affine transform to image and coords + + Args: + trainsize (list): [w, h], the standard size used to train + use_udp (bool): whether to use Unbiased Data Processing. + records(dict): the dict contained the image and coords + + Returns: + records (dict): contain the image and coords after tranformed + + """ + + def __init__(self, trainsize, use_udp=False): + self.trainsize = trainsize + self.use_udp = use_udp + + def __call__(self, image, im_info): + rot = 0 + imshape = im_info['im_shape'][::-1] + center = im_info['center'] if 'center' in im_info else imshape / 2. + scale = im_info['scale'] if 'scale' in im_info else imshape + if self.use_udp: + trans = get_warp_matrix( + rot, center * 2.0, + [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + else: + trans = get_affine_transform(center, scale, rot, self.trainsize) + image = cv2.warpAffine( + image, + trans, (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR) + + return image, im_info + + +class Compose: + def __init__(self, transforms): + self.transforms = [] + for op_info in transforms: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + self.transforms.append(eval(op_type)(**new_op_info)) + + def __call__(self, img_path): + img, im_info = decode_image(img_path) + for t in self.transforms: + img, im_info = t(img, im_info) + inputs = copy.deepcopy(im_info) + inputs['image'] = img + return inputs diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..55760f5efd6a9c647db0f700a453b61781585d07 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,5 @@ +FROM image.sourcefind.cn:5000/dcu/admin/base/paddlepaddle:2.4.2-centos7.6-dtk-23.04-py38-latest +RUN source /opt/dtk/env.sh +COPY requirements.txt requirements.txt +COPY requirements/ requirements/ +RUN pip3 install -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -r requirements.txt diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d281b346b328ac407c84846c5a63419eed66f3ad --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,12 @@ +numpy < 1.24 +tqdm +typeguard +visualdl>=2.2.0 +opencv-python <= 4.6.0 +PyYAML +shapely +scipy +terminaltables +Cython +pycocotools +setuptools diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..86b1bf0f7463e08861a95c770d49d2c89785953f --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1,373 @@ +简体中文 | [English](./CHANGELOG_en.md) + +# 版本更新信息 + +## 最新版本信息 + +### 2.5(08.26/2022) + +- 特色模型 + - PP-YOLOE+: + - 发布PP-YOLOE+模型,COCO test2017数据集精度提升0.7%-2.4% mAP,模型训练收敛速度提升3.75倍,端到端预测速度提升1.73-2.3倍 + - 
发布智慧农业,夜间安防检测,工业质检场景预训练模型,精度提升1.3%-8.1% mAP + - 支持分布式训练、在线量化、serving部署等10大高性能训练部署能力,新增C++/Python Serving、TRT原生推理、ONNX Runtime等5+部署demo教程 + - PP-PicoDet: + - 发布PicoDet-NPU模型,支持模型全量化部署 + - 新增PicoDet版面分析模型,基于FGD蒸馏算法精度提升0.5% mAP + - PP-TinyPose + - 发布PP-TinyPose增强版,在健身、舞蹈等场景的业务数据集端到端AP提升9.1% AP + - 覆盖侧身、卧躺、跳跃、高抬腿等非常规动作 + - 新增滤波稳定模块,关键点稳定性显著增强 + +- 场景能力 + - PP-Human v2 + - 发布PP-Human v2,支持四大产业特色功能:多方案行为识别案例库、人体属性识别、人流检测与轨迹留存以及高精度跨镜跟踪 + - 底层算法能力升级,行人检测精度提升1.5% mAP;行人跟踪精度提升10.2% MOTA,轻量级模型速度提升34%;属性识别精度提升0.6% ma,轻量级模型速度提升62.5% + - 提供全流程教程,覆盖数据采集标注,模型训练优化和预测部署,及pipeline中后处理代码修改 + - 新增在线视频流输入支持 + - 易用性提升,一行代码执行功能,执行流程判断、模型下载背后自动完成。 + - PP-Vehicle + - 全新发布PP-Vehicle,支持四大交通场景核心功能:车牌识别、属性识别、车流量统计、违章检测 + - 车牌识别支持基于PP-OCR v3的轻量级车牌识别模型 + - 车辆属性识别支持基于PP-LCNet多标签分类模型 + - 兼容图片、视频、在线视频流等各类数据输入格式 + - 易用性提升,一行代码执行功能,执行流程判断、模型下载背后自动完成。 + +- 前沿算法 + - YOLO家族全系列模型 + - 发布YOLO家族全系列模型,覆盖前沿检测算法YOLOv5、YOLOv6及YOLOv7 + - 基于ConvNext骨干网络,YOLO各算法训练周期缩5-8倍,精度普遍提升1%-5% mAP;使用模型压缩策略实现精度无损的同时速度提升30%以上 + - 新增基于ViT骨干网络高精度检测模型,COCO数据集精度达到55.7% mAP + - 新增OC-SORT多目标跟踪模型 + - 新增ConvNeXt骨干网络 + +- 产业实践范例教程 + - 基于PP-TinyPose增强版的智能健身动作识别 + - 基于PP-Human的打架识别 + - 基于PP-Human的营业厅来客分析 + - 基于PP-Vehicle的车辆结构化分析 + - 基于PP-YOLOE+的PCB电路板缺陷检测 + +- 框架能力 + - 功能新增 + - 新增自动压缩工具支持并提供demo,PP-YOLOE l版本精度损失0.3% mAP,V100速度提升13% + - 新增PaddleServing python/C++和ONNXRuntime部署demo + - 新增PP-YOLOE 端到端TensorRT部署demo + - 新增FGC蒸馏算法,RetinaNet精度提升3.3% + - 新增分布式训练文档 + - 功能完善/Bug修复 + - 修复Windows c++部署编译问题 + - 修复VOC格式数据预测时保存结果问题 + - 修复FairMOT c++部署检测框输出 + - 旋转框检测模型S2ANet支持batch size>1部署 + +### 2.4(03.24/2022) + +- PP-YOLOE: + - 发布PP-YOLOE特色模型,l版本COCO test2017数据集精度51.6%,V100预测速度78.1 FPS,精度速度服务器端SOTA + - 发布s/m/l/x系列模型,打通TensorRT、ONNX部署能力 + - 支持混合精度训练,训练较PP-YOLOv2加速33% + +- PP-PicoDet: + - 发布PP-PicoDet优化模型,精度提升2%左右,CPU预测速度提升63%。 + - 新增参数量0.7M的PicoDet-XS模型 + - 后处理集成到网络中,优化端到端部署成本 + +- 行人分析Pipeline: + - 发布PP-Human行人分析Pipeline,覆盖行人检测、属性识别、行人跟踪、跨镜跟踪、人流量统计、动作识别多种功能,打通TensorRT部署 + - 属性识别支持StrongBaseline模型 + - ReID支持Centroid模型 + - 动作识别支持ST-GCN摔倒检测 + +- 模型丰富度: + - 发布YOLOX,支持nano/tiny/s/m/l/x版本,x版本COCO val2017数据集精度51.8% + +- 框架功能优化: + - EMA训练速度优化20%,优化EMA训练模型保存方式 + - 支持infer预测结果保存为COCO格式 + +- 部署优化: + - RCNN全系列模型支持Paddle2ONNX导出ONNX模型 + - SSD模型支持导出时融合解码OP,优化边缘端部署速度 + - 支持NMS导出TensorRT,TensorRT部署端到端速度提升 + +### 2.3(11.03/2021) + +- 特色模型: + - 检测: 轻量级移动端检测模型PP-PicoDet,精度速度达到移动端SOTA + - 关键点: 轻量级移动端关键点模型PP-TinyPose + +- 模型丰富度: + - 检测: + - 新增Swin-Transformer目标检测模型 + - 新增TOOD(Task-aligned One-stage Object Detection)模型 + - 新增GFL(Generalized Focal Loss)目标检测模型 + - 发布Sniper小目标检测优化方法,支持Faster RCNN及PP-YOLO系列模型 + - 发布针对EdgeBoard优化的PP-YOLO-EB模型 + + - 跟踪 + - 发布实时跟踪系统PP-Tracking + - 发布FairMot高精度模型、小尺度模型和轻量级模型 + - 发布行人、人头和车辆实跟踪垂类模型库,覆盖航拍监控、自动驾驶、密集人群、极小目标等场景 + - DeepSORT模型适配PP-YOLO, PP-PicoDet等更多检测器 + + - 关键点 + - 新增Lite HRNet模型 + +- 预测部署: + - YOLOv3系列模型支持NPU预测部署 + - FairMot模型C++预测部署打通 + - 关键点系列模型C++预测部署打通, Paddle Lite预测部署打通 + +- 文档: + - 新增各系列模型英文文档 + +### 2.2(08.10/2021) + +- 模型丰富度: + - 发布Transformer检测模型:DETR、Deformable DETR、Sparse RCNN + - 关键点检测新增Dark模型,发布Dark HRNet模型 + - 发布MPII数据集HRNet关键点检测模型 + - 发布人头、车辆跟踪垂类模型 + +- 模型优化: + - 旋转框检测模型S2ANet发布Align Conv优化模型,DOTA数据集mAP优化至74.0 + +- 预测部署 + - 主流模型支持batch size>1预测部署,包含YOLOv3,PP-YOLO,Faster RCNN,SSD,TTFNet,FCOS + - 新增多目标跟踪模型(JDE, FairMot, DeepSort) Python端预测部署支持,并支持TensorRT预测 + - 新增多目标跟踪模型FairMot联合关键点检测模型部署Python端预测部署支持 + - 新增关键点检测模型联合PP-YOLO预测部署支持 + +- 文档: + - Windows预测部署文档新增TensorRT版本说明 + - FAQ文档更新发布 + +- 问题修复: + - 修复PP-YOLO系列模型训练收敛性问题 + - 修复batch size>1时无标签数据训练问题 + + +### 2.1(05.20/2021) +- 模型丰富度提升: + - 发布关键点模型HRNet,HigherHRNet + - 发布多目标跟踪模型DeepSort, FairMot, 
JDE + +- 框架基础能力: + - 支持无标注框训练 + +- 预测部署: + - Paddle Inference YOLOv3系列模型支持batch size>1预测 + - 旋转框检测S2ANet模型预测部署打通 + - 增加量化模型Benchmark + - 增加动态图模型与静态图模型Paddle-Lite demo + +- 检测模型压缩: + - 发布PPYOLO系列模型压缩模型 + +- 文档: + - 更新快速开始,预测部署等教程文档 + - 新增ONNX模型导出教程 + - 新增移动端部署文档 + + +### 2.0(04.15/2021) + + **说明:** 自2.0版本开始,动态图作为PaddleDetection默认版本,原`dygraph`目录切换为根目录,原静态图实现移动到`static`目录下。 + + - 动态图模型丰富度提升: + - 发布PP-YOLOv2及PP-YOLO tiny模型,PP-YOLOv2 COCO test数据集精度达到49.5%,V100预测速度达到68.9 FPS + - 发布旋转框检测模型S2ANet + - 发布两阶段实用模型PSS-Det + - 发布人脸检测模型Blazeface + + - 新增基础模块: + - 新增SENet,GhostNet,Res2Net骨干网络 + - 新增VisualDL训练可视化支持 + - 新增单类别精度计算及PR曲线绘制功能 + - YOLO系列模型支持NHWC数据格式 + + - 预测部署: + - 发布主要模型的预测benchmark数据 + - 适配TensorRT6,支持TensorRT动态尺寸输入,支持TensorRT int8量化预测 + - PP-YOLO, YOLOv3, SSD, TTFNet, FCOS, Faster RCNN等7类模型在Linux、Windows、NV Jetson平台下python/cpp/TRT预测部署打通: + + - 检测模型压缩: + - 蒸馏:新增动态图蒸馏支持,并发布YOLOv3-MobileNetV1蒸馏模型 + - 联合策略:新增动态图剪裁+蒸馏联合策略压缩方案,并发布YOLOv3-MobileNetV1的剪裁+蒸馏压缩模型 + - 问题修复:修复动态图量化模型导出问题 + + - 文档: + - 新增动态图英文文档:包含首页文档,入门使用,快速开始,模型算法、新增数据集等 + - 新增动态图中英文安装文档 + - 新增动态图RCNN系列和YOLO系列配置文件模板及配置项说明文档 + + +## 历史版本信息 + +### 2.0-rc(02.23/2021) + - 动态图模型丰富度提升: + - 优化RCNN模型组网及训练方式,RCNN系列模型精度提升(依赖Paddle develop或2.0.1版本) + - 新增支持SSDLite,FCOS,TTFNet,SOLOv2系列模型 + - 新增行人和车辆垂类目标检测模型 + + - 新增动态图基础模块: + - 新增MobileNetV3,HRNet骨干网络 + - 优化RoIAlign计算逻辑,RCNN系列模型精度提升(依赖Paddle develop或2.0.1版本) + - 新增支持Synchronized Batch Norm + - 新增支持Modulated Deformable Convolution + + - 预测部署: + - 发布动态图python、C++、Serving部署解决方案及文档,支持Faster RCNN,Mask RCNN,YOLOv3,PP-YOLO,SSD,TTFNet,FCOS,SOLOv2等系列模型预测部署 + - 动态图预测部署支持TensorRT模式FP32,FP16推理加速 + + - 检测模型压缩: + - 裁剪:新增动态图裁剪支持,并发布YOLOv3-MobileNetV1裁剪模型 + - 量化:新增动态图量化支持,并发布YOLOv3-MobileNetV1和YOLOv3-MobileNetV3量化模型 + + - 文档: + - 新增动态图入门教程文档:包含安装说明,快速开始,准备数据,训练/评估/预测流程文档 + - 新增动态图进阶教程文档:包含模型压缩、推理部署文档 + - 新增动态图模型库文档 + +### v2.0-beta(12.20/2020) + - 动态图支持: + - 支持Faster-RCNN, Mask-RCNN, FPN, Cascade Faster/Mask RCNN, YOLOv3和SSD模型,试用版本。 + - 模型提升: + - 更新PP-YOLO MobileNetv3 large和small模型,精度提升,并新增裁剪和蒸馏后的模型。 + - 新功能: + - 支持VisualDL可视化数据预处理图片。 + + - Bug修复: + - 修复BlazeFace人脸关键点预测bug。 + + +### v0.5.0(11/2020) + - 模型丰富度提升: + - 发布SOLOv2系列模型,其中SOLOv2-Light-R50-VD-DCN-FPN 模型在单卡V100上达到 38.6 FPS,加速24% ,COCO验证集精度达到38.8%, 提升2.4绝对百分点。 + - 新增Android移动端检测demo,包括SSD、YOLO系列模型,可直接扫码安装体验。 + + - 移动端模型优化: + - 新增PACT新量化策略,YOLOv3-Mobilenetv3在COCO数据集上比普通量化相比提升0.7%。 + + - 易用性提升及功能组件: + - 增强generate_proposal_labels算子功能,规避模型出nan风险。 + - 修复deploy下python与C++预测若干问题。 + - 统一COCO与VOC数据集下评估流程,支持输出单类AP和P-R曲线。 + - PP-YOLO支持矩形输入图像。 + + - 文档: + - 新增目标检测全流程教程,新增Jetson平台部署教程。 + + +### v0.4.0(07/2020) + - 模型丰富度提升: + - 发布PPYOLO模型,COCO数据集精度达到45.2%,单卡V100预测速度达到72.9 FPS,精度和预测速度优于YOLOv4模型。 + - 新增TTFNet模型,base版本对齐竞品,COCO数据集精度达到32.9%。 + - 新增HTC模型,base版本对齐竞品,COCO数据集精度达到42.2%。 + - 新增BlazeFace人脸关键点检测模型,在Wider-Face数据集的Easy-Set精度达到85.2%。 + - 新增ACFPN模型, COCO数据集精度达到39.6%。 + - 发布服务器端通用目标检测模型(包含676类),相同策略在COCO数据集上,V100为19.5FPS时,COCO mAP可以达到49.4%。 + + - 移动端模型优化: + - 新增SSDLite系列优化模型,包括新增GhostNet的Backbone,新增FPN组件等,精度提升0.5%-1.5%。 + + - 易用性提升及功能组件: + - 新增GridMask, RandomErasing数据增强方法。 + - 新增Matrix NMS支持。 + - 新增EMA(Exponential Moving Average)训练支持。 + - 新增多机训练方法,两机相对于单机平均加速比80%,多机训练支持待进一步验证。 + +### v0.3.0(05/2020) + - 模型丰富度提升: + - 添加Efficientdet-D0模型,速度与精度优于竞品。 + - 新增YOLOv4预测模型,精度对齐竞品;新增YOLOv4在Pascal VOC数据集上微调训练,精度达到85.5%。 + - YOLOv3新增MobileNetV3骨干网络,COCO数据集精度达到31.6%。 + - 添加Anchor-free模型FCOS,精度优于竞品。 + - 添加Anchor-free模型CornernetSqueeze,精度优于竞品,优化模型的COCO数据集精度38.2%, +3.7%,速度较YOLOv3-Darknet53快5%。 + - 添加服务器端实用目标检测模型CascadeRCNN-ResNet50vd模型,速度与精度优于竞品EfficientDet。 + + - 移动端推出3种模型: + - 
SSDLite系列模型:SSDLite-Mobilenetv3 small/large模型,精度优于竞品。 + - YOLOv3移动端方案: YOLOv3-MobileNetv3模型压缩后加速3.5倍,速度和精度均领先于竞品的SSDLite模型。 + - RCNN移动端方案:CascadeRCNN-MobileNetv3经过系列优化, 推出输入图像分别为320x320和640x640的模型,速度与精度具有较高性价比。 + + - 预测部署重构: + - 新增Python预测部署流程,支持RCNN,YOLO,SSD,RetinaNet,人脸系列模型,支持视频预测。 + - 重构C++预测部署,提高易用性。 + + - 易用性提升及功能组件: + - 增加AutoAugment数据增强。 + - 升级检测库文档结构。 + - 支持迁移学习自动进行shape匹配。 + - 优化mask分支评估阶段内存占用。 + +### v0.2.0(02/2020) + - 新增模型: + - 新增基于CBResNet模型。 + - 新增LibraRCNN模型。 + - 进一步提升YOLOv3模型精度,基于COCO数据精度达到43.2%,相比上个版本提升1.4%。 + - 新增基础模块: + - 主干网络: 新增CBResNet。 + - loss模块: YOLOv3的loss支持细粒度op组合。 + - 正则模块: 新增DropBlock模块。 + - 功能优化和改进: + - 加速YOLOv3数据预处理,整体训练提速40%。 + - 优化数据预处理逻辑,提升易用性。 + - 增加人脸检测预测benchmark数据。 + - 增加C++预测引擎Python API预测示例。 + - 检测模型压缩 : + - 裁剪: 发布MobileNet-YOLOv3裁剪方案和模型,基于VOC数据FLOPs - 69.6%, mAP + 1.4%,基于COCO数据FLOPS-28.8%, mAP + 0.9%; 发布ResNet50vd-dcn-YOLOv3裁剪方案和模型,基于COCO数据集FLOPS - 18.4%, mAP + 0.8%。 + - 蒸馏: 发布MobileNet-YOLOv3蒸馏方案和模型,基于VOC数据mAP + 2.8%,基于COCO数据mAP + 2.1%。 + - 量化: 发布YOLOv3-MobileNet和BlazeFace的量化模型。 + - 裁剪+蒸馏: 发布MobileNet-YOLOv3裁剪+蒸馏方案和模型,基于COCO数据FLOPS - 69.6%,基于TensorRT预测加速64.5%,mAP - 0.3 %; 发布ResNet50vd-dcn-YOLOv3裁剪+蒸馏方案和模型,基于COCO数据FLOPS - 43.7%,基于TensorRT预测加速24.0%,mAP + 0.6 %。 + - 搜索: 开源BlazeFace-Nas的完成搜索方案。 + - 预测部署: + - 集成 TensorRT,支持FP16、FP32、INT8量化推理加速。 + - 文档: + - 增加详细的数据预处理模块介绍文档以及实现自定义数据Reader文档。 + - 增加如何新增算法模型的文档。 + - 文档部署到网站: https://paddledetection.readthedocs.io + +### 12/2019 +- 增加Res2Net模型。 +- 增加HRNet模型。 +- 增加GIOU loss和DIOU loss。 + + +### 21/11/2019 +- 增加CascadeClsAware RCNN模型。 +- 增加CBNet,ResNet200和Non-local模型。 +- 增加SoftNMS。 +- 增加Open Image V5数据集和Objects365数据集模型。 + +### 10/2019 +- 增加增强版YOLOv3模型,精度高达41.4%。 +- 增加人脸检测模型BlazeFace、Faceboxes。 +- 丰富基于COCO的模型,精度高达51.9%。 +- 增加Objects365 2019 Challenge上夺冠的最佳单模型之一CACascade-RCNN。 +- 增加行人检测和车辆检测预训练模型。 +- 支持FP16训练。 +- 增加跨平台的C++推理部署方案。 +- 增加模型压缩示例。 + + +### 2/9/2019 +- 增加GroupNorm模型。 +- 增加CascadeRCNN+Mask模型。 + +### 5/8/2019 +- 增加Modulated Deformable Convolution系列模型。 + +### 29/7/2019 + +- 增加检测库中文文档 +- 修复R-CNN系列模型训练同时进行评估的问题 +- 新增ResNext101-vd + Mask R-CNN + FPN模型 +- 新增基于VOC数据集的YOLOv3模型 + +### 3/7/2019 + +- 首次发布PaddleDetection检测库和检测模型库 +- 模型包括:Faster R-CNN, Mask R-CNN, Faster R-CNN+FPN, Mask + R-CNN+FPN, Cascade-Faster-RCNN+FPN, RetinaNet, YOLOv3, 和SSD. diff --git a/docs/CHANGELOG_en.md b/docs/CHANGELOG_en.md new file mode 100644 index 0000000000000000000000000000000000000000..15b8321e941ef0e4e39dd050e1c96afe09b96356 --- /dev/null +++ b/docs/CHANGELOG_en.md @@ -0,0 +1,377 @@ +English | [简体中文](./CHANGELOG.md) + +# Version Update Information + +## Last Version Information + +### 2.5(08.26/2022) + +- Featured model + + - PP-YOLOE+: + - Released PP-YOLOE+ model, with a 0.7%-2.4% mAP improvement on COCO test2017. 3.75 times faster model training convergence rate and 1.73-2.3 times faster end-to-end inference speed + - Released pre-trained models for smart agriculture, night security detection, and industrial quality inspection with 1.3%-8.1% mAP accuracy improvement + - supports 10 high-performance training deployment capabilities, including distributed training, online quantization, and serving deployment. 
We also provide more than five new deployment demos, such as C++/Python Serving, TRT native inference, and ONNX Runtime + - PP-PicoDet: + - Release the PicoDet-NPU model to support full quantization of model deployment + - Add PicoDet layout analysis model with 0.5% mAP accuracy improvement due to FGD distillation algorithm + - PP-TinyPose + - Release PP-TinyPose Plus with 9.1% end-to-end AP improvement for business data sets such as physical exercises, dance, and other scenarios + - Covers unconventional movements such as turning to one side, lying down, jumping, high lift + - Add stabilization module (via filter) to significantly improve the stability at key points + +- Functions in different scenarios + + - PP-Human v2 + - Release PP-Human v2, which supports four industrial features: behavioral recognition case zoo for multiple solutions, human attribute recognition, human traffic detection and trajectory retention, as well as high precision multi-camera tracking + - Upgraded underlying algorithm capabilities: 1.5% mAP improvement in pedestrian detection accuracy; 10.2% MOTA improvement in pedestrian tracking accuracy, 34% speed improvement in the lightweight model; 0.6% ma improvement in attribute recognition accuracy, 62.5% speed improvement in the lightweight model + - Provides comprehensive tutorials covering data collection and annotation, model training optimization and prediction deployment, and post-processing code modification in the pipeline + - Supports online video streaming input + - Become more user-friendly with a one-line code execution function that automates the process determination and model download + - PP-Vehicle + - Launch PP-Vehicle, which supports four core functions for traffic application: license plate recognition, attribute recognition, traffic flow statistics, and violation detection + - License plate recognition supports a lightweight model based on PP-OCR v3 + - Vehicle attribute recognition supports a multi-label classification model based on PP-LCNet + - Compatible with various data input formats such as pictures, videos and online video streaming + - Become more user-friendly with a one-line code execution function that automates the process determination and model download + +- Cutting-edge algorithms + + - YOLO Family + - Release the full range of YOLO family models covering the cutting-edge detection algorithms YOLOv5, YOLOv6 and YOLOv7 + - Based on the ConvNext backbone network, YOLO's algorithm training periods are reduced by 5-8 times with accuracy generally improving by 1%-5% mAP; Thanks to the model compression strategy, its speed increased by over 30% with no loss of precision. + - Newly add high precision detection model based on [ViT](configs/vitdet) backbone network, with a 55.7% mAP accuracy on the COCO dataset + - Newly add multi-object tracking model [OC-SORT](configs/mot/ocsort) + - Newly add [ConvNeXt](configs/convnext) backbone network. 
+ +- Industrial application + + - Intelligent physical exercise recognition based on PP-TinyPose Plus + - Fighting recognition based on PP-Human + - Business hall visitor analysis based on PP-Human + - Vehicle structuring analysis based on PP-Vehicle + - PCB board defect detection based on PP-YOLOE+ + +- Framework capabilities + + - New functions + - Release auto-compression tools and demos, 0.3% mAP accuracy loss for PP-YOLOE l version, while 13% speed increase for V100 + - Release PaddleServing python/C++ and ONNXRuntime deployment demos + - Release PP-YOLOE end-to-end TensorRT deployment demo + - Release FGC distillation algorithm with RetinaNet accuracy improved by 3.3% + - Release distributed training documentation + - Improvement and fixes + - Fix compilation problem with Windows c++ deployment + - Fix problems when saving results of inference data in VOC format + - Fix the detection box output of FairMOT c++ deployment + - Rotating frame detection model S2ANet supports batch size>1 deployment + +### 2.4(03.24/2022) + +- PP-YOLOE: + - Release PP-YOLOE object detection models, achieve mAP as 51.6% on COCO test dataset and 78.1 FPS on Nvidia V100 by PP-YOLOE-l, reach SOTA performance for object detection on GPU`` + - Release series models: s/m/l/x, and support deployment base on TensorRT & ONNX + - Spport AMP training and training speed is 33% faster than PP-YOLOv2 + +- PP-PicoDet: + - Release enhanced models of PP-PicoDet, mAP promoted ~2% on COCO and inference speed accelerated 63% on CPU + - Release PP-PicoDet-XS model with 0.7M parameters + - Post-processing integrated into the network to optimize deployment pipeline + +- PP-Human: + - Release PP-Human human analysis pipeline,including pedestrian detection, attribute recognition, human tracking, multi-camera tracking, human statistics, action recognition. 
Supporting deployment with TensorRT + - Release StrongBaseline model for attribute recognition + - Release Centroid model for ReID + - Release ST-GCN model for falldown action recognition + +- Model richness: + - Publish YOLOX object detection model, release series models: nano/tiny/s/m/l/x, and YOLOX-x achieves mAP as 51.8% on COCO val2017 dataset + +- Function Optimize: + - Optimize 20% training speed when training with EMA, improve saving method of EMA weights + - Support saving inference results in COCO format + +- Deployment Optimize: + - Support export ONNX model by Paddle2ONNX for all RCNN models + - Supoort export model with fused decode OP for SSD models to enhance inference speed in edge side + - Support export NMS to TensorRT model, optmize inference speed on TensorRT + +### 2.3(11.03/2021) + +- Feature models: + - Object detection: The lightweight object detection model PP-PicoDet, performace and inference speed reaches SOTA on mobile side + - Keypoint detection: The lightweight keypoint detection model PP-TinyPose for mobile side + +- Model richness: + - Object detection: + - Publish Swin-Transformer object detection model + - Publish TOOD(Task-aligned One-stage Object Detection) model + - Publish GFL(Generalized Focal Loss) object detection model + - Publish Sniper optimization method for tiny object detection, supporting Faster RCNN and PP-YOLO series models + - Publish PP-YOLO optimized model PP-YOLO-EB for EdgeBoard + - Multi-object tracking: + - Publish Real-time tracking system PP-Tracking + - Publish high-precision, small-scale and lightweight model based on FairMot + - Publish real-time tracking model zoo for pedestrian, head and vehicle tracking, including scenarios such as aerial surveillance, autonomous driving, dense crowds, and tiny object tracking + - DeepSort support PP-YOLO, PP-PicoDet as object detector + - Keypoint detection: + - Publish Lite HRNet model + +- Inference deployment: + - Support NPU deployment for YOLOv3 series + - Support C++ deployment for FairMot + - Support C++ and PaddleLite deployment for keypoint detection series model + +- Documents: + - Add series English documents + + +### 2.2(08.10/2021) + +- Model richness: + - Publish the Transformer test model: DETR, Deformable DETR, Sparse RCNN + - Key point test new Dark model, release Dark HRNet model + - Publish the MPII dataset HRNet keypoint detection model + - Release head and vehicle tracking vertical model + +- Model optimization: + - AlignConv optimization model was released by S2ANet, and DOTA dataset mAP was optimized to 74.0 + +- Inference deployment + - Mainstream models support batch size>1 predictive deployment, including YOLOv3, PP-YOLO, Faster RCNN, SSD, TTFNet, FCOS + - New addition of target tracking models (JDE, Fair Mot, Deep Sort) Python side prediction deployment support, and support for TensorRT prediction + - FairMot joint key point detection model deployment Python side predictive deployment support + - Added support for key point detection model combined with PP-YOLO prediction deployment + +- Documents: + - New TensorRT version notes to Windows Predictive Deployment documentation + - FAQ documents are updated + +- Bug fixes: + - Fixed PP-YOLO series model training convergence problem + - Fixed the problem of no label data training when batch_size > 1 + + +### 2.1(05.20/2021) +- Model richness enhancement: + - Key point model: HRNet, HigherHRNet + - Publish the multi-target tracking model: DeepSort, FairMot, JDE + +- Basic framework Capabilities: + - Supports training 
without labels + +- Forecast deployment: + - Paddle Inference YOLOv3 series model support batch_size>1 prediction + - Rotating frame detection S2ANet model prediction deployment is open + - Incremental quantization model benchmark + - Add dynamic graph model and static graph model: Paddle-Lite demo + +- Detection model compression: + - Release PP-YOLO series model compression model + +- Documents: + - Update quick start, forecast deployment and other tutorial documentation + - Added ONNX model export tutorial + - Added the mobile deployment document + + +### 2.0(04.15/2021) + + **Description:** Since version 2.0, dynamic graphs are used as the default version of Paddle Detection, the original `dygraph` directory is switched to the root directory, and the original static graph implementation is moved to the `static` directory. + + - Enhancement of dynamic graph model richness: + - PP-YOLOv2 and PP-YOLO tiny models were published. The accuracy of PP-YOLOv2 COCO Test dataset reached 49.5%, and the prediction speed of V100 reached 68.9 FPS + - Release the rotary frame detection model S2ANet + - Release the two-phase utility model PSS-Det + - Publish the face detection model Blazeface + + - New basic module: + - Added SENet, GhostNet, and Res2Net backbone networks + - Added VisualDL training visualization support + - Added single precision calculation and PR curve drawing function + - The YOLO models support THE NHWC data format + + - Forecast deployment: + - Publish forecast benchmark data for major models + - Adaptive to TensorRT6, support TensorRT dynamic size input, support TensorRT int8 quantitative prediction + - 7 types of models including PP-YOLO, YOLOv3, SSD, TTFNet, FCOS, Faster RCNN are deployed in Python/CPP/TRT prediction on Linux, Windows and NV Jetson platforms + + - Detection model compression: + - Distillation: Added dynamic map distillation support and released YOLOv3-MobileNetV1 distillation model + - Joint strategy: new dynamic graph prunning + distillation joint strategy compression scheme, and release YOLOv3-MobileNetV1 prunning + distillation compression model + - Problem fix: Fixed dynamic graph quantization model export problem + + - Documents: + - New English document of dynamic graph: including homepage document, getting started, quick start, model algorithm, new dataset, etc + - Added both English and Chinese installation documents of dynamic diagrams + - Added configuration file templates and description documents of dynamic graph RCNN series and YOLO series + + +## Historical Version Information + +### 2.0-rc(02.23/2021) + - Enhancement of dynamic graph model richness: + - Optimize networking and training mode of RCNN models, and improve accuracy of RCNN series models (depending on Paddle Develop or version 2.0.1) + - Added support for SSDLite, FCOS, TTFNet, SOLOv2 series models + - Added pedestrian and vehicle vertical object detection models + + - New dynamic graph basic module: + - Added MobileNetV3 and HRNet backbone networks + - Improved roi-align calculation logic for RCNN series models (depending on Paddle Develop or version 2.0.1) + - Added support for Synchronized Batch Norm + - Added support for Modulated Deformable Convolution + + - Forecast deployment: + - Publish dynamic diagrams in python, C++, and Serving deployment solution and documentation. 
Support Faster RCNN, Mask RCNN, YOLOv3, PPYOLO, SSD, TTFNet, FCOS, SOLOv2 and other models to predict deployment + - Dynamic graph prediction deployment supports TensorRT mode FP32, FP16 inference acceleration + + - Detection model compression: + - Prunning: Added dynamic graph prunning support, and released YOLOv3-MobileNetV1 prunning model + - Quantization: Added quantization support of dynamic graph, and released quantization models of YOLOv3-MobileNetV1 and YOLOv3-MobileNetV3 + + - Documents: + - New Dynamic Diagram tutorial documentation: includes installation instructions, quick start, data preparation, and training/evaluation/prediction process documentation + - New advanced tutorial documentation for dynamic diagrams: includes documentation for model compression and inference deployment + - Added dynamic graph model library documentation + +### v2.0-beta(12.20/2020) + - Dynamic graph support: + - Support for Faster-RCNN, Mask-RCNN, FPN, Cascade Faster/Mask RCNN, YOLOv3 and SSD models, trial version. + - Model upgrade: + - Updated PP-YOLO Mobile-Netv3 large and small models with improved accuracy, and added prunning and distillation models. + - New features: + - Support VisualDL visual data preprocessing pictures. + + - Bug fix: + - Fix Blaze Face keypoint prediction bug. + + +### v0.5.0(11/2020) + - Model richness enhancement: + - SOLOv2 series models were released, in which the SOLOv2-Light-R50-VD-DCN-FPN model achieved 38.6 FPS on a single gpu V100, accelerating by 24%, and the accuracy of COCO verification set reached 38.8%, improving by 2.4 absolute percentage points. + - Added Android mobile terminal detection demo, including SSD, YOLO series model, can directly scan code installation experience. + + - Mobile terminal model optimization: + - Added to PACT's new quantization strategy, YOLOv3 Mobilenetv3 is 0.7% better than normal quantization on COCO datasets. + + - Ease of use and functional components: + - Enhance the function of generate_proposal_labels operator to avoid nan risk of the model. + - Fixed several problems with deploy python and C++ prediction. + - Unified COCO and VOC datasets under the evaluation process, support the output of a single class of AP and P-R curves. + - PP-YOLO supports rectangular input images. + + - Documents: + - Added object detection whole process tutorial, added Jetson platform deployment tutorial. + + +### v0.4.0(07/2020) + - Model richness enhancement: + - The PPYOLO model was released. The accuracy of COCO dataset reached 45.2%, and the prediction speed of single gpu V100 reached 72.9 FPS, which was better than that of YOL Ov4 model. + - New TTFNet model, base version aligned with competing products, COCO dataset accuracy up to 32.9%. + - New HTC model, base version aligned with competing products, COCO dataset accuracy up to 42.2%. + - BlazeFace key point detection model was added, with an accuracy of 85.2% in Wider-Face's Easy-Set. + - ACFPN model was added, and the accuracy of COCO dataset reached 39.6%. + - General object detection model (including 676 classes) on the publisher side. On the COCO dataset with the same strategy, when V100 is 19.5FPS, the COCO mAP can reach 49.4%. + + - Mobile terminal model optimization: + - Added SSD Lite series optimization models, including Ghost Net Backbone, FPN components, etc., with accuracy improved by 0.5% and 1.5%. + + - Ease of use and functional components: + - Add GridMask, Random Erasing data enhancement method. + - Added support for Matrix NMS. 
+ - EMA(Exponential Moving Average) training support. + - The new multi-machine training method, the average acceleration ratio of two machines to single machine is 80%, multi-machine training support needs to be further verified. + +### v0.3.0(05/2020) + - Model richness enhancement: + - Efficientdet-D0 model added, speed and accuracy is better than competing products. + - Added YOLOv4 prediction model, precision aligned with competing products; Added YOLOv4 fine tuning training on Pascal VOC datasets with accuracy of 85.5%. + - YOLOv3 added MobileNetV3 backbone network, COCO dataset accuracy reached 31.6%. + - Add Anchor-free model FCOS, the accuracy is better than competing products. + - Anchor-free model Cornernet Squeeze was added, the accuracy was better than competing products, and the accuracy of COCO dataset of optimized model was 38.2% and +3.7%, 5% faster than YOL Ov3 Darknet53. + - The CascadeRCNN-ResNet50vd model, which is a practical object detection model on the server side, is added, and its speed and accuracy are better than that of the competitive EfficientDet. + + - Mobile terminal launched three models: + - SSSDLite model: SSDLite-Mobilenetv3 small/large model, with better accuracy than competitors. + - YOLOv3 Mobile solution: The YOLOv3-MobileNetv3 model accelerates 3.5 times after compression, which is faster and more accurate than the SSD Lite model of competing products. + - RCNN Mobile terminal scheme: CascadeRCNN-MobileNetv3, after series optimization, launched models with input images of 320x320 and 640x640 respectively, with high cost performance for speed and accuracy. + + - Anticipate deployment refactoring: + - New Python prediction deployment process, support for RCNN, YOLO, SSD, Retina Net, face models, support for video prediction. + - Refactoring C++ predictive deployment to improve ease of use. + + - Ease of use and functional components: + - Added Auto Augment data enhancement. + - Upgrade the detection library document structure. + - Support shape matching automatically by transfer learning. + - Optimize memory footprint during mask branch evaluation. + +### v0.2.0(02/2020) + - The new model: + - Added CBResNet model. + - Added LibraRCNN model. + - The accuracy of YOLOv3 model was further improved, and the accuracy based on COCO data reached 43.2%, 1.4% higher than the previous version. + - New Basic module: + - Trunk network: CBResNet is added. + - Loss module: Loss of YOLOv3 supports fine-grained OP combinations. + - Regular module: Added the Drop Block module. + - Function optimization and improvement: + - Accelerate YOLOv3 data preprocessing and increase the overall training speed by 40%. + - Optimize data preprocessing logic to improve ease of use. + - dd face detection prediction benchmark data. + - Added C++ prediction engine Python API prediction example. + - Detection model compression: + - prunning: Release MobileNet-YOLOv3 prunning scheme and model, based on VOC data FLOPs 69.6%, mAP + 1.4%, based on COCO DATA FLOPS 28.8%, mAP + 0.9%; Release ResNet50vd-DCN-YOLOv3 clipped solution and model based on COCO datasets 18.4%, mAP + 0.8%. + - Distillation: Release MobileNet-YOLOv3 distillation scheme and model, based on VOC data mAP + 2.8%, COCO data mAP + 2.1%. + - Quantification: Release quantification models of YOLOv3 Mobile Net and Blaze Face. 
+    - Pruning + distillation: released the MobileNet-YOLOv3 pruning + distillation scheme and model (on COCO data: FLOPs -69.6%, 64.5% faster with TensorRT inference, with only a 0.3% change in mAP); released the ResNet50vd-DCN-YOLOv3 pruning + distillation scheme and model (on COCO data: FLOPs -43.7%, 24.0% faster with TensorRT inference, mAP +0.6%).
+    - NAS: open-sourced the complete BlazeFace-NAS search solution.
+  - Inference deployment:
+    - Integrated TensorRT, supporting FP32, FP16 and INT8 quantized inference acceleration.
+  - Documents:
+    - Added detailed documentation for the data preprocessing module and for implementing a custom data Reader.
+    - Added documentation on how to add new algorithm models.
+    - Documentation is now hosted at: https://paddledetection.readthedocs.io
+
+### 12/2019
+- Added the Res2Net model.
+- Added the HRNet model.
+- Added GIoU loss and DIoU loss.
+
+
+### 21/11/2019
+- Added the CascadeClsAware RCNN model.
+- Added CBNet, ResNet200 and Non-local models.
+- Added Soft-NMS.
+- Added Open Images V5 and Objects365 dataset models.
+
+### 10/2019
+- Added an enhanced YOLOv3 model with accuracy up to 41.4%.
+- Added the face detection models BlazeFace and FaceBoxes.
+- Enriched COCO-based models, with accuracy up to 51.9%.
+- Added CA-Cascade-RCNN, one of the best single models in the Objects365 2019 Challenge.
+- Added pedestrian detection and vehicle detection pre-trained models.
+- Support FP16 training.
+- Added a cross-platform C++ inference deployment scheme.
+- Added model compression examples.
+
+
+### 2/9/2019
+- Added the GroupNorm model.
+- Added the CascadeRCNN+Mask model.
+
+### 5/8/2019
+- Added Modulated Deformable Convolution series models.
+
+### 29/7/2019
+
+- Added Chinese documentation for the detection library.
+- Fixed an issue with simultaneous evaluation during R-CNN series model training.
+- Added ResNeXt101-vd + Mask R-CNN + FPN models.
+- Added a YOLOv3 model based on the VOC dataset.
+
+### 3/7/2019
+
+- First release of the PaddleDetection library and its model zoo.
+- Models: Faster R-CNN, Mask R-CNN, Faster R-CNN+FPN, Mask R-CNN+FPN, Cascade-Faster-RCNN+FPN, RetinaNet, YOLOv3, and SSD.
diff --git a/docs/MODEL_ZOO_cn.md b/docs/MODEL_ZOO_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e4c08b9f52b31e26e46bc6345c15d406c09cbe9
--- /dev/null
+++ b/docs/MODEL_ZOO_cn.md
@@ -0,0 +1,440 @@
+简体中文 | [English](MODEL_ZOO_en.md)
+
+# [**PaddleYOLO**](https://github.com/PaddlePaddle/PaddleYOLO)
+
+ + +
+
+## 内容
+- [简介](#简介)
+- [模型库](#模型库)
+  - [PP-YOLOE](#PP-YOLOE)
+  - [YOLOX](#YOLOX)
+  - [YOLOv5](#YOLOv5)
+  - [YOLOv6](#YOLOv6)
+  - [YOLOv7](#YOLOv7)
+  - [YOLOv8](#YOLOv8)
+  - [RTMDet](#RTMDet)
+  - [VOC](#VOC)
+- [使用指南](#使用指南)
+  - [一键运行全流程](#一键运行全流程)
+  - [自定义数据集](#自定义数据集)
+- [FastDeploy多硬件快速部署](#FastDeploy多硬件快速部署)
+
+## 简介
+
+**PaddleYOLO**是基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的YOLO系列模型库,**只包含YOLO系列模型的相关代码**,支持`YOLOv3`,`PP-YOLO`,`PP-YOLOv2`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`,`YOLOv8`,`RTMDet`等模型,欢迎一起使用和建设!
+
+## 更新日志
+
+* 【2023/05/21】支持[RT-DETR](configs/rtdetr)、[YOLOv8](configs/yolov8)、[YOLOv5u](configs/yolov5/yolov5u)和[YOLOv7u](configs/yolov7/yolov7u)训练全流程,支持[YOLOv6Lite](configs/yolov6/yolov6lite)预测和部署;
+* 【2023/03/13】支持[YOLOv5u](configs/yolov5/yolov5u)和[YOLOv7u](configs/yolov7/yolov7u)预测和部署;
+* 【2023/01/10】支持[YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8)预测和部署;
+* 【2022/09/29】支持[RTMDet](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet)预测和部署;
+* 【2022/09/26】发布[`PaddleYOLO`](https://github.com/PaddlePaddle/PaddleYOLO)模型套件;
+* 【2022/09/19】支持[`YOLOv6`](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6)新版,包括n/t/s/m/l模型;
+* 【2022/08/23】发布`YOLOSeries`代码库: 支持`YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`等YOLO模型,支持`ConvNeXt`骨干网络高精度版`PP-YOLOE`,`YOLOX`和`YOLOv5`等模型,支持PaddleSlim无损加速量化训练`PP-YOLOE`,`YOLOv5`,`YOLOv6`和`YOLOv7`等模型,详情可阅读[此文章](https://mp.weixin.qq.com/s/Hki01Zs2lQgvLSLWS0btrA);
+
+
+**注意:**
+ - **PaddleYOLO**代码库协议为**GPL 3.0**,[YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5),[YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6),[YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7)和[YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8)这几类模型代码不合入[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection),其余YOLO模型推荐在[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)中使用,**会最先发布PP-YOLO系列特色检测模型的最新进展**;
+ - **PaddleYOLO**代码库**推荐使用paddlepaddle-2.3.2以上的版本**,请参考[官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载对应适合版本,**Windows平台请安装paddle develop版本**;
+ - PaddleYOLO 的[Roadmap](https://github.com/PaddlePaddle/PaddleYOLO/issues/44) issue用于收集用户的需求,欢迎提出您的建议和需求。
+ - 训练**自定义数据集**请参照[文档](#自定义数据集)和[issue](https://github.com/PaddlePaddle/PaddleYOLO/issues/43)。请首先**确保加载了COCO权重作为预训练**,YOLO检测模型建议**总`batch_size`至少大于`64`**去训练,如果资源不够请**换小模型**或**减小模型的输入尺度**,为了保障较高检测精度,**尽量不要尝试单卡训和总`batch_size`小于`32`训**;
+
+
+## 模型库
+
+### [PP-YOLOE](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe)
+
+ 基础模型 + +| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| PP-YOLOE-s | 640 | 32 | 400e | 2.9 | 43.4 | 60.0 | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_400e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_s_400e_coco.yml) | +| PP-YOLOE-s | 640 | 32 | 300e | 2.9 | 43.0 | 59.6 | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml) | +| PP-YOLOE-m | 640 | 28 | 300e | 6.0 | 49.0 | 65.9 | 23.43 | 49.91 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_m_300e_coco.yml) | +| PP-YOLOE-l | 640 | 20 | 300e | 8.7 | 51.4 | 68.6 | 52.20 | 110.07 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_l_300e_coco.yml) | +| PP-YOLOE-x | 640 | 16 | 300e | 14.9 | 52.3 | 69.5 | 98.42 | 206.59 |[model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_x_300e_coco.yml) | +| PP-YOLOE-tiny ConvNeXt| 640 | 16 | 36e | - | 44.6 | 63.3 | 33.04 | 13.87 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_convnext_tiny_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/convnext/ppyoloe_convnext_tiny_36e_coco.yml) | +| **PP-YOLOE+_s** | 640 | 8 | 80e | 2.9 | **43.7** | **60.6** | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml) | +| **PP-YOLOE+_m** | 640 | 8 | 80e | 6.0 | **49.8** | **67.1** | 23.43 | 49.91 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml) | +| **PP-YOLOE+_l** | 640 | 8 | 80e | 8.7 | **52.9** | **70.1** | 52.20 | 110.07 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_l_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | +| **PP-YOLOE+_x** | 640 | 8 | 80e | 14.9 | **54.7** | **72.0** | 98.42 | 206.59 |[model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_x_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_x_80e_coco.yml) | + +
+ +
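+上表中的`model`权重可直接下载用于评估或预测。下面给出一个示意脚本(以 **PP-YOLOE+_s** 为例,命令取自下文[一键运行全流程](#一键运行全流程);假设已按下文说明准备好COCO数据集,配置与权重链接取自上表,仅作示例):
+
+```bash
+# 示意:直接加载上表 PP-YOLOE+_s 的权重进行评估与预测(命令与下文"一键运行全流程"一致)
+config=configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml
+weights=https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams
+
+# 评估,加 --classwise 表示输出每一类mAP
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise
+
+# 预测单张图片
+CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5
+```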
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| PP-YOLOE-s(400epoch) | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_wo_nms.onnx) | +| PP-YOLOE-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_wo_nms.onnx) | +| PP-YOLOE-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_wo_nms.onnx) | +| PP-YOLOE-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_wo_nms.onnx) | +| PP-YOLOE-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_wo_nms.onnx) | +| **PP-YOLOE+_s** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_m** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_l** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_w_nms.zip) | [( w/o 
nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_x** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_wo_nms.onnx) | + +
+ +### [YOLOX](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox) + +
+ 基础模型 + +| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOX-nano | 416 | 8 | 300e | 2.3 | 26.1 | 42.0 | 0.91 | 1.08 | [model](https://paddledet.bj.bcebos.com/models/yolox_nano_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_nano_300e_coco.yml) | +| YOLOX-tiny | 416 | 8 | 300e | 2.8 | 32.9 | 50.4 | 5.06 | 6.45 | [model](https://paddledet.bj.bcebos.com/models/yolox_tiny_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_tiny_300e_coco.yml) | +| YOLOX-s | 640 | 8 | 300e | 3.0 | 40.4 | 59.6 | 9.0 | 26.8 | [model](https://paddledet.bj.bcebos.com/models/yolox_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_s_300e_coco.yml) | +| YOLOX-m | 640 | 8 | 300e | 5.8 | 46.9 | 65.7 | 25.3 | 73.8 | [model](https://paddledet.bj.bcebos.com/models/yolox_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_m_300e_coco.yml) | +| YOLOX-l | 640 | 8 | 300e | 9.3 | 50.1 | 68.8 | 54.2 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/yolox_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_l_300e_coco.yml) | +| YOLOX-x | 640 | 8 | 300e | 16.6 | **51.8** | **70.6** | 99.1 | 281.9 | [model](https://paddledet.bj.bcebos.com/models/yolox_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_x_300e_coco.yml) | + YOLOX-cdn-tiny | 416 | 8 | 300e | 1.9 | 32.4 | 50.2 | 5.03 | 6.33 | [model](https://paddledet.bj.bcebos.com/models/yolox_cdn_tiny_300e_coco.pdparams) | [config](c../../onfigs/yolox/yolox_cdn_tiny_300e_coco.yml) | +| YOLOX-crn-s | 640 | 8 | 300e | 3.0 | 40.4 | 59.6 | 7.7 | 24.69 | [model](https://paddledet.bj.bcebos.com/models/yolox_crn_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_crn_s_300e_coco.yml) | +| YOLOX-s ConvNeXt| 640 | 8 | 36e | - | 44.6 | 65.3 | 36.2 | 27.52 | [model](https://paddledet.bj.bcebos.com/models/yolox_convnext_s_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/convnext/yolox_convnext_s_36e_coco.yml) | + +
+ +
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOx-nano | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_wo_nms.onnx) | +| YOLOx-tiny | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_wo_nms.onnx) | +| YOLOx-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_wo_nms.onnx) | +| YOLOx-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_wo_nms.onnx) | +| YOLOx-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_wo_nms.onnx) | +| YOLOx-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_wo_nms.onnx) | + +
+ +### [YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5) + +
+ 基础模型 + +| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5-n | 640 | 16 | 300e | 1.5 | 28.0 | 45.7 | 1.87 | 4.52 | [model](https://paddledet.bj.bcebos.com/models/yolov5_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_n_300e_coco.yml) | +| YOLOv5-s | 640 | 16 | 300e | 2.6 | 37.6 | 56.7 | 7.24 | 16.54 | [model](https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_s_300e_coco.yml) | +| YOLOv5-m | 640 | 16 | 300e | 5.2 | 45.4 | 64.1 | 21.19 | 49.08 | [model](https://paddledet.bj.bcebos.com/models/yolov5_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_m_300e_coco.yml) | +| YOLOv5-l | 640 | 16 | 300e | 7.9 | 48.9 | 67.1 | 46.56 | 109.32 | [model](https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_l_300e_coco.yml) | +| YOLOv5-x | 640 | 16 | 300e | 13.7 | 50.6 | 68.7 | 86.75 | 205.92 | [model](https://paddledet.bj.bcebos.com/models/yolov5_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_x_300e_coco.yml) | +| YOLOv5-s ConvNeXt| 640 | 8 | 36e | - | 42.4 | 65.3 | 34.54 | 17.96 | [model](https://paddledet.bj.bcebos.com/models/yolov5_convnext_s_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_convnext_s_36e_coco.yml) | +| *YOLOv5u-n | 640 | 16 | 300e | 1.61 | 34.5 | 49.7 | 2.65 | 7.79 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_n_300e_coco.yml) | +| *YOLOv5u-s | 640 | 16 | 300e | 2.66 | 43.0 | 59.7 | 9.15 | 24.12 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_s_300e_coco.yml) | +| *YOLOv5u-m | 640 | 16 | 300e | 5.50 | 49.0 | 65.7 | 25.11 | 64.42 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_m_300e_coco.yml) | +| *YOLOv5u-l | 640 | 16 | 300e | 8.73 | 52.2 | 69.0 | 53.23 | 135.34 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_l_300e_coco.yml) | +| *YOLOv5u-x | 640 | 16 | 300e | 15.49 | 53.1 | 69.9 | 97.28 | 246.89 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_x_300e_coco.yml) | +| *YOLOv5p6-n | 1280 | 16 | 300e | - | 35.9 | 54.2 | 3.25 | 9.23 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_n_300e_coco.yml) | +| *YOLOv5p6-s | 1280 | 16 | 300e | - | 44.5 | 63.3 | 12.63 | 33.81 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_s_300e_coco.pdparams) | 
[config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_s_300e_coco.yml) | +| *YOLOv5p6-m | 1280 | 16 | 300e | - | 51.1 | 69.0 | 35.73 | 100.21 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_m_300e_coco.yml) | +| *YOLOv5p6-l | 1280 | 8 | 300e | - | 53.4 | 71.0 | 76.77 | 223.09 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_l_300e_coco.yml) | +| *YOLOv5p6-x | 1280 | 8 | 300e | - | 54.7 | 72.4 | 140.80 | 420.03 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_x_300e_coco.yml) | + +
+ +
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOv5-n | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.onnx) | +| YOLOv5-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.onnx) | +| YOLOv5-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.onnx) | +| YOLOv5-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.onnx) | +| YOLOv5-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.onnx) | + +
+ +### [YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6) + +
+ 基础模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP | AP50 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :---------: | :-----: |:-----: | :-----: |:-----: | :-------------: | :-----: | +| *YOLOv6-n | 640 | 16 | 300e(+300e) | 1.3 | 37.5 | 53.1 | 5.07 | 12.49 |[model](https://paddledet.bj.bcebos.com/models/yolov6_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_n_300e_coco.yml) | +| *YOLOv6-s | 640 | 32 | 300e(+300e) | 2.7 | 44.8 | 61.7 | 20.18 | 49.36 |[model](https://paddledet.bj.bcebos.com/models/yolov6_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_s_300e_coco.yml) | +| *YOLOv6-m | 640 | 32 | 300e(+300e) | 5.3 | 49.5 | 66.9 | 37.74 | 92.47 |[model](https://paddledet.bj.bcebos.com/models/yolov6_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_m_300e_coco.yml) | +| *YOLOv6-l(silu) | 640 | 32 | 300e(+300e) | 9.5 | 52.2 | 70.2 | 59.66 | 149.4 |[model](https://paddledet.bj.bcebos.com/models/yolov6_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_l_300e_coco.yml) | + +
+ +
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| yolov6-n | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_wo_nms.onnx) | +| yolov6-s | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_wo_nms.onnx) | +| yolov6-m | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_wo_nms.onnx) | +| yolov6-l(silu) | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_wo_nms.onnx) | + +
+ +### [YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7) + +
+ 基础模型 + +| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv7-L | 640 | 32 | 300e | 7.4 | 51.0 | 70.2 | 37.62 | 106.08 |[model](https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_l_300e_coco.yml) | +| YOLOv7u-L | 640 | 32 | 300e | 9.0 | 52.1 | 68.8 | 43.59 | 130.10 |[model](https://paddledet.bj.bcebos.com/models/yolov7u_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7u/yolov7u_l_300e_coco.yml) | +| *YOLOv7-X | 640 | 32 | 300e | 12.2 | 53.0 | 70.8 | 71.34 | 190.08 | [model](https://paddledet.bj.bcebos.com/models/yolov7_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_x_300e_coco.yml) | +| *YOLOv7P6-W6 | 1280 | 16 | 300e | 25.5 | 54.4 | 71.8 | 70.43 | 360.26 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_w6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_w6_300e_coco.yml) | +| *YOLOv7P6-E6 | 1280 | 10 | 300e | 31.1 | 55.7 | 73.0 | 97.25 | 515.4 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_e6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_e6_300e_coco.yml) | +| *YOLOv7P6-D6 | 1280 | 8 | 300e | 37.4 | 56.1 | 73.3 | 133.81 | 702.92 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_d6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_d6_300e_coco.yml) | +| *YOLOv7P6-E6E | 1280 | 6 | 300e | 48.7 | 56.5 | 73.7 | 151.76 | 843.52 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_e6e_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_e6e_300e_coco.yml) | +| YOLOv7-tiny | 640 | 32 | 300e | 2.4 | 37.3 | 54.5 | 6.23 | 13.80 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_300e_coco.yml) | +| YOLOv7-tiny | 416 | 32 | 300e | 1.3 | 33.3 | 49.5 | 6.23 | 5.82 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_416_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_416_300e_coco.yml) | +| YOLOv7-tiny | 320 | 32 | 300e | - | 29.1 | 43.8 | 6.23 | 3.46 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_320_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_320_300e_coco.yml) | + +
+ +
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOv7-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_wo_nms.onnx) | +| YOLOv7-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_wo_nms.onnx) | +| YOLOv7P6-W6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-E6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-D6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-E6E | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_wo_nms.zip) | [( w/ 
nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 320 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_wo_nms.onnx) | + +
+ + +### [YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8) + +
+ 基础模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| *YOLOv8-n | 640 | 16 | 500e | 1.8 | 37.3 | 53.0 | 3.16 | 8.7 | [model](https://paddledet.bj.bcebos.com/models/yolov8_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_n_300e_coco.yml) | +| *YOLOv8-s | 640 | 16 | 500e | 3.4 | 44.9 | 61.8 | 11.17 | 28.6 | [model](https://paddledet.bj.bcebos.com/models/yolov8_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_s_300e_coco.yml) | +| *YOLOv8-m | 640 | 16 | 500e | 6.5 | 50.2 | 67.3 | 25.90 | 78.9 | [model](https://paddledet.bj.bcebos.com/models/yolov8_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_m_300e_coco.yml) | +| *YOLOv8-l | 640 | 16 | 500e | 10.0 | 52.8 | 69.6 | 43.69 | 165.2 | [model](https://paddledet.bj.bcebos.com/models/yolov8_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_l_300e_coco.yml) | +| *YOLOv8-x | 640 | 16 | 500e | 15.1 | 53.8 | 70.6 | 68.23 | 257.8 | [model](https://paddledet.bj.bcebos.com/models/yolov8_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_x_300e_coco.yml) | +| *YOLOv8-P6-x | 1280 | 16 | 500e | 55.0 | - | - | 97.42 | 522.93 | [model](https://paddledet.bj.bcebos.com/models/yolov8p6_x_500e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8p6_x_500e_coco.yml) | + +
+ +
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(带nms) | 导出后的权重(exclude_nms)| ONNX(exclude_post_process) | +| :-------- | :----: | :---------------: | :--------------------: | :-------------------------: | +| YOLOv8-n | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco.onnx) | +| YOLOv8-s | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco.onnx) | +| YOLOv8-m | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco.onnx) | +| YOLOv8-l | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco.onnx) | +| YOLOv8-x | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco.onnx) | + +
+ + +### [RTMDet](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet) + +
+ 基础模型 + +| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP | AP50 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :------------- | :------- | :-------: | :------: | :---------: | :-----: |:-----: | :-----: |:-----: | :-------------: | :-----: | +| *RTMDet-t | 640 | 32 | 300e | 2.8 | 40.9 | 57.9 | 4.90 | 16.21 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_t_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_t_300e_coco.yml) | +| *RTMDet-s | 640 | 32 | 300e | 3.3 | 44.5 | 62.0 | 8.89 | 29.71 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_s_300e_coco.yml) | +| *RTMDet-m | 640 | 32 | 300e | 6.4 | 49.1 | 66.8 | 24.71 | 78.47 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_m_300e_coco.yml) | +| *RTMDet-l | 640 | 32 | 300e | 10.2 | 51.2 | 68.8 | 52.31 | 160.32 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_l_300e_coco.yml) | +| *RTMDet-x | 640 | 32 | 300e | 18.0 | 52.6 | 70.4 | 94.86 | 283.12 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_x_300e_coco.yml) | + +
+ +
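+上表中的RTMDet权重也可按下文[一键运行全流程](#一键运行全流程)第4~6步自行导出并部署。以下为示意脚本(以 RTMDet-s 为例,配置与权重链接取自上表,导出目录为默认的output_inference,仅作示例):
+
+```bash
+# 示意:导出 RTMDet-s 静态图模型并用Python部署预测、测速(命令与下文"一键运行全流程"第4~6步一致)
+job_name=rtmdet_s_300e_coco
+config=configs/rtmdet/${job_name}.yml
+weights=https://paddledet.bj.bcebos.com/models/${job_name}.pdparams
+
+# 导出,加 trt=True 表示用于TensorRT加速
+CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} trt=True
+
+# 部署预测
+CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU
+
+# 部署测速,--run_mode=trt_fp16 需配合 trt=True 导出的模型
+CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True --run_mode=trt_fp16
+```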
+ 部署模型 + +| 网络模型 | 输入尺寸 | 导出后的权重(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| RTMDet-t | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_wo_nms.onnx) | +| RTMDet-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_wo_nms.onnx) | +| RTMDet-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_wo_nms.onnx) | +| RTMDet-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_wo_nms.onnx) | +| RTMDet-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_wo_nms.onnx) | + +
+ + +### **注意:** + - 所有模型均使用COCO train2017作为训练集,在COCO val2017上验证精度,模型前带*表示训练更新中。 + - 具体精度和速度细节请查看[PP-YOLOE](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe),[YOLOX](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox),[YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5),[YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6),[YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7),**其中YOLOv5,YOLOv6,YOLOv7评估并未采用`multi_label`形式**。 +- 模型推理耗时(ms)为TensorRT-FP16下测试的耗时,**不包含数据预处理和模型输出后处理(NMS)的耗时**。测试采用**单卡Tesla T4 GPU,batch size=1**,测试环境为**paddlepaddle-2.3.2**, **CUDA 11.2**, **CUDNN 8.2**, **GCC-8.2**, **TensorRT 8.0.3.4**,具体请参考各自模型主页。 +- **统计FLOPs(G)和Params(M)**,首先安装[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), `pip install paddleslim`,然后设置[runtime.yml](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/runtime.yml)里`print_flops: True`和`print_params: True`,并且注意确保是**单尺度**下如640x640,**打印的是MACs,FLOPs=2*MACs**。 + - 基于[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)对YOLO系列模型进行量化训练,可以实现精度基本无损,速度普遍提升30%以上,具体请参照[模型自动化压缩工具ACT](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression)。 + + +### [VOC](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc) + +
+ 基础模型 + +| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP(0.50,11point) | Params(M) | FLOPs(G) | 下载链接 | 配置文件 | +| :-----------: | :-------: | :-------: | :------: | :------------: | :---------------: | :------------------: |:-----------------: | :------: | :------: | +| YOLOv5-s | 640 | 16 | 60e | 3.2 | 80.3 | 7.24 | 16.54 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_s_60e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolov5_s_60e_voc.yml) | +| YOLOv7-tiny | 640 | 32 | 60e | 2.6 | 80.2 | 6.23 | 6.90 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov7_tiny_60e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolov7_tiny_60e_voc.yml) | +| YOLOX-s | 640 | 8 | 40e | 3.0 | 82.9 | 9.0 | 26.8 | [下载链接](https://paddledet.bj.bcebos.com/models/yolox_s_40e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolox_s_40e_voc.yml) | +| PP-YOLOE+_s | 640 | 8 | 30e | 2.9 | 86.7 | 7.93 | 17.36 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_30e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/ppyoloe_plus_crn_s_30e_voc.yml) | + +
+ +**注意:** + - VOC数据集训练的mAP为`mAP(IoU=0.5)`的结果,且评估未使用`multi_label`等trick; + - 所有YOLO VOC模型均加载各自模型的COCO权重作为预训练,各个配置文件的配置均为默认使用8卡GPU,可作为自定义数据集设置参考,具体精度会因数据集而异; + - YOLO检测模型建议**总`batch_size`至少大于`64`**去训练,如果资源不够请**换小模型**或**减小模型的输入尺度**,为了保障较高检测精度,**尽量不要尝试单卡训和总`batch_size`小于`64`训**; + - Params(M)和FLOPs(G)均为训练时所测,YOLOv7没有s模型,故选用tiny模型; + - TRT-FP16-Latency(ms)测速相关请查看各YOLO模型的config的主页; + + +## 使用指南 + +下载MS-COCO数据集,[官网](https://cocodataset.org)下载地址为: [annotations](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), [train2017](http://images.cocodataset.org/zips/train2017.zip), [val2017](http://images.cocodataset.org/zips/val2017.zip), [test2017](http://images.cocodataset.org/zips/test2017.zip)。 +PaddleDetection团队提供的下载链接为:[coco](https://bj.bcebos.com/v1/paddledet/data/coco.tar)(共约22G)和[test2017](https://bj.bcebos.com/v1/paddledet/data/cocotest2017.zip),注意test2017可不下载,评估是使用的val2017。 + + +### **一键运行全流程** + +将以下命令写在一个脚本文件里如```run.sh```,一键运行命令为:```sh run.sh```,也可命令行一句句去运行。 + +```bash +model_name=ppyoloe # 可修改,如 yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # 可修改,如 yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 1.训练(单卡/多卡),加 --eval 表示边训边评估,加 --amp 表示混合精度训练 +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp +python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp + +# 2.评估,加 --classwise 表示输出每一类mAP +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise + +# 3.预测 (单张图/图片文件夹) +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5 +# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5 + +# 4.导出模型,以下3种模式选一种 +## 普通导出,加trt表示用于trt加速,对NMS和silu激活函数提速明显 +CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True + +## exclude_post_process去除后处理导出,返回和YOLOv5导出ONNX时相同格式的concat后的1个Tensor,是未缩放回原图的坐标+分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True + +## exclude_nms去除NMS导出,返回2个Tensor,是缩放回原图后的坐标和分类置信度 +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True + +# 5.部署预测,注意不能使用 去除后处理 或 去除NMS 导出后的模型去预测 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU + +# 6.部署测速,加 “--run_mode=trt_fp16” 表示在TensorRT FP16模式下测速,注意如需用到 trt_fp16 则必须为加 trt=True 导出的模型 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16 + +# 7.onnx导出,一般结合 exclude_post_process去除后处理导出的模型 +paddle2onnx --model_dir output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx + +# 8.onnx trt测速 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32 +``` + +- 如果想切换模型,只要修改开头两行即可,如: + ``` + model_name=yolov7 + 
job_name=yolov7_tiny_300e_coco
+  ```
+- 导出**onnx**,首先安装[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX),`pip install paddle2onnx`;
+- **统计FLOPs(G)和Params(M)**,首先安装[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim),`pip install paddleslim`,然后设置[runtime.yml](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/runtime.yml)里`print_flops: True`和`print_params: True`,并且注意确保是**单尺度**下如640x640,**打印的是MACs,FLOPs=2*MACs**。
+
+
+### 自定义数据集
+
+#### 数据集准备:
+
+1.自定义数据集的标注制作,请参考[DetAnnoTools](../tutorials/data/DetAnnoTools.md);
+
+2.自定义数据集的训练准备,请参考[PrepareDataSet](../tutorials/PrepareDataSet.md)。
+
+
+#### finetune训练:
+
+除了更改数据集的路径外,训练一般推荐加载**对应模型的COCO预训练权重**去finetune,会更快收敛和达到更高精度,如:
+
+```bash
+# 单卡finetune训练:
+# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp -o pretrain_weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams
+
+# 多卡finetune训练:
+python -m paddle.distributed.launch --log_dir=./log_dir --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp -o pretrain_weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams
+```
+
+**注意:**
+- finetune训练一般会提示head分类分支最后一层卷积的通道数没对应上,属于正常情况,是由于自定义数据集一般和COCO数据集种类数不一致;
+- finetune训练一般epoch数可以设置更少,lr设置也更小点如1/10,最高精度可能出现在中间某个epoch;
+
+#### 预测和导出:
+
+使用自定义数据集预测和导出模型时,如果TestDataset数据集路径设置不正确会默认使用COCO 80类。
+除了TestDataset数据集路径设置正确外,也可以自行修改和添加对应的label_list.txt文件(一行记录一个对应种类),TestDataset中的anno_path也可设置为绝对路径,如:
+```
+TestDataset:
+  !ImageFolder
+  anno_path: label_list.txt # 如不使用dataset_dir,则anno_path即为相对于PaddleDetection主目录的相对路径
+  # dataset_dir: dataset/my_coco # 如使用dataset_dir,则dataset_dir/anno_path作为新的anno_path
+```
+label_list.txt里的一行记录一个对应种类,如下所示:
+```
+person
+vehicle
+```
+
+
+## FastDeploy多硬件快速部署
+
+FastDeploy是飞桨推出的统一部署工具,支持云边端部署。目前在YOLO系列支持的部署能力如下所示。具体部署示例,可以前往[FastDeploy仓库](https://github.com/PaddlePaddle/FastDeploy)使用。
+
+|  | [YOLOv5](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv6](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv7](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv8](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [PP-YOLOE+](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | 部署特色 |
+| --- | --- | --- | --- | --- | --- | --- |
+| [Intel CPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具,实现极致性能 |
+| [NVIDIA GPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、CUDA预处理加速,实现极致性能 |
+| 
[飞腾 CPU]() | 支持 | 支持 | 支持 | 支持 | 支持 | X86 CPU与ARM CPU无缝切换 | +| [昆仑芯 R200*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/kunlunxin.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 无缝部署Paddle模型 | +| [昇腾310*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/kunlunxin.md) | 支持 | 即将支持 | 即将支持 | 即将支持 | 支持 | 无缝部署Paddle模型 | +| [算能SC7-FP300*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/sophgo.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 充分发挥硬件工具链特性,实现模型快速部署 | +| [Jetson](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/jetson.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、CUDA预处理加速,实现极致性能 | +| [ARM CPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、预处理加速库FlyCV,实现极致性能 | +| [RK3588*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 充分发挥硬件工具链特性,实现模型快速部署 | +| [RV1126*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rv1126.md) | 支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 联合全量化实现模型端到端的优化 | +| [服务化部署](https://github.com/PaddlePaddle/FastDeploy/tree/develop/serving) | 支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 实现企业级高并发需求 | +| [视频流部署](https://github.com/PaddlePaddle/FastDeploy/tree/develop/streamer) | 暂不支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 调用硬解码核,实现数据零拷贝,充分利用硬件资源 | + +备注: + +*表示:FastDeploy目前在该型号硬件上测试。通常同类型硬件上使用的是相同的软件栈,该部署能力可以延伸到同软件架栈的硬件。譬如RK3588与RK3566、RK3568相同的软件栈。 + +「硬件列-纵轴」链接到部署预编译包安装或部署示例,「横轴」跳转到具体部署示例。 diff --git a/docs/MODEL_ZOO_en.md b/docs/MODEL_ZOO_en.md new file mode 100644 index 0000000000000000000000000000000000000000..4a4303a8fa2047c759ed594d0a8309bc6ff205ac --- /dev/null +++ b/docs/MODEL_ZOO_en.md @@ -0,0 +1,410 @@ +[简体中文](MODEL_ZOO_cn.md) | English + +# [**PaddleYOLO**](https://github.com/PaddlePaddle/PaddleYOLO) + +
+ + +
+ +## Introduction +- [Introduction](#Introduction) +- [ModelZoo](#ModelZoo) + - [PP-YOLOE](#PP-YOLOE) + - [YOLOX](#YOLOX) + - [YOLOv5](#YOLOv5) + - [YOLOv6](#YOLOv6) + - [YOLOv7](#YOLOv7) + - [YOLOv8](#YOLOv8) + - [RTMDet](#RTMDet) + - [VOC](#VOC) +- [UserGuide](#UserGuide) + - [Pipeline](#Pipeline) + - [CustomDataset](#CustomDataset) + +## Introduction + +**PaddleYOLO** is a YOLO Series toolbox based on [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection), **only relevant codes of YOLO series models are included**. It supports `YOLOv3`,`PP-YOLO`,`PP-YOLOv2`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6`,`YOLOv7`,`YOLOv8`,`RTMDet` and so on. Welcome to use and build it together! + +## Updates + +* 【2023/03/13】Support [YOLOv5u](configs/yolov5/yolov5u) and [YOLOv7u](configs/yolov7/yolov7u) inference and deploy; +* 【2023/01/10】Support [YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8) inference and deploy; +* 【2022/09/29】Support [RTMDet](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet) inference and deploy; +* 【2022/09/26】Release [`PaddleYOLO`](https://github.com/PaddlePaddle/PaddleYOLO); +* 【2022/09/19】Support the new version of [`YOLOv6`](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6), including n/t/s/m/l model; +* 【2022/08/23】Release `YOLOSeries` codebase: support `YOLOv3`,`PP-YOLOE`,`PP-YOLOE+`,`YOLOX`,`YOLOv5`,`YOLOv6` and `YOLOv7`; support using `ConvNeXt` backbone to get high-precision version of `PP-YOLOE`,`YOLOX` and `YOLOv5`; support PaddleSlim accelerated quantitative training `PP-YOLOE`,`YOLOv5`,`YOLOv6` and `YOLOv7`. For details, please read this [article](https://mp.weixin.qq.com/s/Hki01Zs2lQgvLSLWS0btrA); + + +**Notes:** + - The Licence of **PaddleYOLO** is **GPL 3.0**, the codes of [YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5),[YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6),[YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7) and [YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8) will not be merged into [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). Except for these three YOLO models, other YOLO models are recommended to use in [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection), **which will be the first to release the latest progress of PP-YOLO series detection model**; + - To use **PaddleYOLO**, **PaddlePaddle-2.3.2 or above is recommended**,please refer to the [official website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) to download the appropriate version. **For Windows platforms, please install the paddle develop version**; + - Training **Custom dataset** please refer to [doc](#CustomDataset) and [issue](https://github.com/PaddlePaddle/PaddleYOLO/issues/43). Please **ensure COCO trained weights are loaded as pre-train** at first. We recommend to use YOLO detection model **with a total `batch_size` at least greater than `64` to train**. If the resources are insufficient, please **use the smaller model** or **reduce the input size of the model**. To ensure high detection accuracy, **you'd better never try to using single GPU or total `batch_size` less than `32` for training**; + +## ModelZoo + +### [PP-YOLOE](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| PP-YOLOE-s | 640 | 32 | 400e | 2.9 | 43.4 | 60.0 | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_400e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_s_400e_coco.yml) | +| PP-YOLOE-s | 640 | 32 | 300e | 2.9 | 43.0 | 59.6 | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml) | +| PP-YOLOE-m | 640 | 28 | 300e | 6.0 | 49.0 | 65.9 | 23.43 | 49.91 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_m_300e_coco.yml) | +| PP-YOLOE-l | 640 | 20 | 300e | 8.7 | 51.4 | 68.6 | 52.20 | 110.07 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_l_300e_coco.yml) | +| PP-YOLOE-x | 640 | 16 | 300e | 14.9 | 52.3 | 69.5 | 98.42 | 206.59 |[model](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_crn_x_300e_coco.yml) | +| PP-YOLOE-tiny ConvNeXt| 640 | 16 | 36e | - | 44.6 | 63.3 | 33.04 | 13.87 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_convnext_tiny_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/convnext/ppyoloe_convnext_tiny_36e_coco.yml) | +| **PP-YOLOE+_s** | 640 | 8 | 80e | 2.9 | **43.7** | **60.6** | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml) | +| **PP-YOLOE+_m** | 640 | 8 | 80e | 6.0 | **49.8** | **67.1** | 23.43 | 49.91 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml) | +| **PP-YOLOE+_l** | 640 | 8 | 80e | 8.7 | **52.9** | **70.1** | 52.20 | 110.07 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_l_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | +| **PP-YOLOE+_x** | 640 | 8 | 80e | 14.9 | **54.7** | **72.0** | 98.42 | 206.59 |[model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_x_80e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe/ppyoloe_plus_crn_x_80e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| PP-YOLOE-s(400epoch) | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_400e_coco_wo_nms.onnx) | +| PP-YOLOE-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_s_300e_coco_wo_nms.onnx) | +| PP-YOLOE-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_m_300e_coco_wo_nms.onnx) | +| PP-YOLOE-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_l_300e_coco_wo_nms.onnx) | +| PP-YOLOE-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_crn_x_300e_coco_wo_nms.onnx) | +| **PP-YOLOE+_s** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_s_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_m** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_m_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_l** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_w_nms.zip) | [( w/o 
nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_l_80e_coco_wo_nms.onnx) | +| **PP-YOLOE+_x** | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/ppyoloe/ppyoloe_plus_crn_x_80e_coco_wo_nms.onnx) | + +
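+The PP-YOLOE ONNX files above come in two flavours: with NMS fused into the graph (`w_nms`) and without (`wo_nms`). The sketch below shows one way to load such an export with `onnxruntime` and discover its inputs and outputs at runtime rather than hard-coding them; the file name is taken from the table above, and the dummy feeds are only placeholders for properly preprocessed data.
+
+```python
+# Minimal sketch: run an exported ONNX model (here ppyoloe_plus_crn_s_80e_coco_w_nms.onnx)
+# with onnxruntime. Input/output names and shapes differ between exports, so they are
+# read from the session instead of being hard-coded. All inputs are assumed float32.
+import numpy as np
+import onnxruntime as ort
+
+sess = ort.InferenceSession("ppyoloe_plus_crn_s_80e_coco_w_nms.onnx",
+                            providers=["CPUExecutionProvider"])
+
+# Inspect what the export expects (an image tensor, and for some exports extra
+# inputs such as a scale factor).
+for inp in sess.get_inputs():
+    print(inp.name, inp.shape, inp.type)
+
+# Build dummy feeds matching each declared input; replace with real preprocessed data.
+feeds = {}
+for inp in sess.get_inputs():
+    shape = [d if isinstance(d, int) else 1 for d in inp.shape]  # resolve dynamic dims to 1
+    feeds[inp.name] = np.zeros(shape, dtype=np.float32)
+
+outputs = sess.run(None, feeds)
+for meta, out in zip(sess.get_outputs(), outputs):
+    print(meta.name, out.shape)
+```
+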
+ +### [YOLOX](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOX-nano | 416 | 8 | 300e | 2.3 | 26.1 | 42.0 | 0.91 | 1.08 | [model](https://paddledet.bj.bcebos.com/models/yolox_nano_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_nano_300e_coco.yml) | +| YOLOX-tiny | 416 | 8 | 300e | 2.8 | 32.9 | 50.4 | 5.06 | 6.45 | [model](https://paddledet.bj.bcebos.com/models/yolox_tiny_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_tiny_300e_coco.yml) | +| YOLOX-s | 640 | 8 | 300e | 3.0 | 40.4 | 59.6 | 9.0 | 26.8 | [model](https://paddledet.bj.bcebos.com/models/yolox_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_s_300e_coco.yml) | +| YOLOX-m | 640 | 8 | 300e | 5.8 | 46.9 | 65.7 | 25.3 | 73.8 | [model](https://paddledet.bj.bcebos.com/models/yolox_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_m_300e_coco.yml) | +| YOLOX-l | 640 | 8 | 300e | 9.3 | 50.1 | 68.8 | 54.2 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/yolox_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_l_300e_coco.yml) | +| YOLOX-x | 640 | 8 | 300e | 16.6 | **51.8** | **70.6** | 99.1 | 281.9 | [model](https://paddledet.bj.bcebos.com/models/yolox_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_x_300e_coco.yml) | + YOLOX-cdn-tiny | 416 | 8 | 300e | 1.9 | 32.4 | 50.2 | 5.03 | 6.33 | [model](https://paddledet.bj.bcebos.com/models/yolox_cdn_tiny_300e_coco.pdparams) | [config](c../../onfigs/yolox/yolox_cdn_tiny_300e_coco.yml) | +| YOLOX-crn-s | 640 | 8 | 300e | 3.0 | 40.4 | 59.6 | 7.7 | 24.69 | [model](https://paddledet.bj.bcebos.com/models/yolox_crn_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox/yolox_crn_s_300e_coco.yml) | +| YOLOX-s ConvNeXt| 640 | 8 | 36e | - | 44.6 | 65.3 | 36.2 | 27.52 | [model](https://paddledet.bj.bcebos.com/models/yolox_convnext_s_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/convnext/yolox_convnext_s_36e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOx-nano | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_nano_300e_coco_wo_nms.onnx) | +| YOLOx-tiny | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_tiny_300e_coco_wo_nms.onnx) | +| YOLOx-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_s_300e_coco_wo_nms.onnx) | +| YOLOx-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_m_300e_coco_wo_nms.onnx) | +| YOLOx-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_l_300e_coco_wo_nms.onnx) | +| YOLOx-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolox/yolox_x_300e_coco_wo_nms.onnx) | + +
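+Each row above lists the Input Size the model was evaluated at (416 or 640 for YOLOX). The snippet below is a generic preprocessing sketch that produces the `1x3xHxW` float32 tensor fed to such a model; the exact recipe (letterbox vs. plain resize, mean/std normalization, padding colour) is model-specific and defined by each config's reader, so treat this as the shape of the step rather than the exact transform. The demo image path is the one used later in the Pipeline section.
+
+```python
+# Generic preprocessing sketch: turn an image into a 1x3xHxW float tensor at the
+# listed Input Size. Plain resize and /255 scaling are placeholders; check the
+# model's reader config for the real transform (letterbox, mean/std, etc.).
+import cv2
+import numpy as np
+
+def preprocess(image_path, size=640):
+    img = cv2.imread(image_path)                      # BGR, HxWx3, uint8
+    img = cv2.resize(img, (size, size))               # some models letterbox instead
+    img = img[:, :, ::-1].astype(np.float32) / 255.0  # BGR -> RGB, scale to [0, 1]
+    img = img.transpose(2, 0, 1)[None]                # HWC -> 1x3xHxW
+    return np.ascontiguousarray(img)
+
+blob = preprocess("demo/000000014439_640x640.jpg", size=640)
+print(blob.shape, blob.dtype)  # (1, 3, 640, 640) float32
+```
+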
+ + +### [YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv5-n | 640 | 16 | 300e | 1.5 | 28.0 | 45.7 | 1.87 | 4.52 | [model](https://paddledet.bj.bcebos.com/models/yolov5_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_n_300e_coco.yml) | +| YOLOv5-s | 640 | 16 | 300e | 2.6 | 37.6 | 56.7 | 7.24 | 16.54 | [model](https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_s_300e_coco.yml) | +| YOLOv5-m | 640 | 16 | 300e | 5.2 | 45.4 | 64.1 | 21.19 | 49.08 | [model](https://paddledet.bj.bcebos.com/models/yolov5_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_m_300e_coco.yml) | +| YOLOv5-l | 640 | 16 | 300e | 7.9 | 48.9 | 67.1 | 46.56 | 109.32 | [model](https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_l_300e_coco.yml) | +| YOLOv5-x | 640 | 16 | 300e | 13.7 | 50.6 | 68.7 | 86.75 | 205.92 | [model](https://paddledet.bj.bcebos.com/models/yolov5_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_x_300e_coco.yml) | +| YOLOv5-s ConvNeXt| 640 | 8 | 36e | - | 42.4 | 65.3 | 34.54 | 17.96 | [model](https://paddledet.bj.bcebos.com/models/yolov5_convnext_s_36e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5_convnext_s_36e_coco.yml) | +| *YOLOv5u-n | 640 | 16 | 300e | 1.61 | 34.5 | 49.7 | 2.65 | 7.79 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_n_300e_coco.yml) | +| *YOLOv5u-s | 640 | 16 | 300e | 2.66 | 43.0 | 59.7 | 9.15 | 24.12 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_s_300e_coco.yml) | +| *YOLOv5u-m | 640 | 16 | 300e | 5.50 | 49.0 | 65.7 | 25.11 | 64.42 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_m_300e_coco.yml) | +| *YOLOv5u-l | 640 | 16 | 300e | 8.73 | 52.2 | 69.0 | 53.23 | 135.34 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_l_300e_coco.yml) | +| *YOLOv5u-x | 640 | 16 | 300e | 15.49 | 53.1 | 69.9 | 97.28 | 246.89 | [model](https://paddledet.bj.bcebos.com/models/yolov5u_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5u/yolov5u_x_300e_coco.yml) | +| *YOLOv5p6-n | 1280 | 16 | 300e | - | 35.9 | 54.2 | 3.25 | 9.23 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_n_300e_coco.yml) | +| *YOLOv5p6-s | 1280 | 16 | 300e | - | 44.5 | 63.3 | 12.63 | 33.81 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_s_300e_coco.pdparams) | 
[config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_s_300e_coco.yml) | +| *YOLOv5p6-m | 1280 | 16 | 300e | - | 51.1 | 69.0 | 35.73 | 100.21 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_m_300e_coco.yml) | +| *YOLOv5p6-l | 1280 | 8 | 300e | - | 53.4 | 71.0 | 76.77 | 223.09 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_l_300e_coco.yml) | +| *YOLOv5p6-x | 1280 | 8 | 300e | - | 54.7 | 72.4 | 140.80 | 420.03 | [model](https://paddledet.bj.bcebos.com/models/yolov5p6_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5/yolov5p6_x_300e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOv5-n | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_n_300e_coco_wo_nms.onnx) | +| YOLOv5-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_s_300e_coco_wo_nms.onnx) | +| YOLOv5-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_m_300e_coco_wo_nms.onnx) | +| YOLOv5-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_l_300e_coco_wo_nms.onnx) | +| YOLOv5-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov5/yolov5_x_300e_coco_wo_nms.onnx) | + +
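+For the `wo_nms` exports above, score thresholding and NMS are left to the caller. The sketch below is a minimal class-agnostic NMS in NumPy, assuming boxes are given as `[x1, y1, x2, y2]` with matching scores; real deployments would normally use the optimized NMS provided by their inference SDK instead.
+
+```python
+# Minimal NumPy NMS sketch for "w/o nms" exports, which return raw boxes/scores.
+# Boxes are assumed to be [x1, y1, x2, y2]; thresholds are illustrative defaults.
+import numpy as np
+
+def nms(boxes, scores, iou_thresh=0.45, score_thresh=0.25):
+    keep_mask = scores >= score_thresh
+    boxes, scores = boxes[keep_mask], scores[keep_mask]
+    order = scores.argsort()[::-1]          # highest score first
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
+        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
+        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
+        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
+        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
+        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
+        area_o = (boxes[order[1:], 2] - boxes[order[1:], 0]) * (boxes[order[1:], 3] - boxes[order[1:], 1])
+        iou = inter / (area_i + area_o - inter + 1e-9)
+        order = order[1:][iou <= iou_thresh]  # drop boxes overlapping the kept one
+    return boxes[keep], scores[keep]
+```
+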
+ +### [YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :---------: | :-----: |:-----: | :-----: |:-----: | :-------------: | :-----: | +| *YOLOv6-n | 640 | 16 | 300e(+300e) | 1.3 | 37.5 | 53.1 | 5.07 | 12.49 |[model](https://paddledet.bj.bcebos.com/models/yolov6_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_n_300e_coco.yml) | +| *YOLOv6-s | 640 | 32 | 300e(+300e) | 2.7 | 44.8 | 61.7 | 20.18 | 49.36 |[model](https://paddledet.bj.bcebos.com/models/yolov6_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_s_300e_coco.yml) | +| *YOLOv6-m | 640 | 32 | 300e(+300e) | 5.3 | 49.5 | 66.9 | 37.74 | 92.47 |[model](https://paddledet.bj.bcebos.com/models/yolov6_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_m_300e_coco.yml) | +| *YOLOv6-l(silu) | 640 | 32 | 300e(+300e) | 9.5 | 52.2 | 70.2 | 59.66 | 149.4 |[model](https://paddledet.bj.bcebos.com/models/yolov6_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6/yolov6_l_300e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| yolov6-n | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_n_300e_coco_wo_nms.onnx) | +| yolov6-s | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_s_300e_coco_wo_nms.onnx) | +| yolov6-m | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_m_300e_coco_wo_nms.onnx) | +| yolov6-l(silu) | 640 | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_w_nms.zip) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_wo_nms.zip) | [(w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_w_nms.onnx) | [(w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov6/yolov6_l_300e_coco_wo_nms.onnx) | + +
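+The TRT-FP16-Latency column above measures the model forward pass only, at batch size 1 on a Tesla T4 with TensorRT. The sketch below follows the same measurement pattern (warm-up, then timed forward passes, no pre/post-processing) against an exported ONNX file with onnxruntime; numbers obtained this way are indicative only and are not comparable to the reported TensorRT-FP16 figures. The file name is taken from the table above.
+
+```python
+# Rough latency sketch: warm up, then time only the forward pass at batch size 1,
+# mirroring how the latency columns are defined. Inputs are fed random float32
+# placeholders; not comparable to the T4/TensorRT numbers in the table.
+import time
+import numpy as np
+import onnxruntime as ort
+
+sess = ort.InferenceSession("yolov6_s_300e_coco_wo_nms.onnx",
+                            providers=["CPUExecutionProvider"])
+feeds = {}
+for inp in sess.get_inputs():
+    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
+    feeds[inp.name] = np.random.rand(*shape).astype(np.float32)
+
+for _ in range(10):          # warm-up
+    sess.run(None, feeds)
+
+runs = 50
+start = time.perf_counter()
+for _ in range(runs):
+    sess.run(None, feeds)
+print("avg latency: %.2f ms" % ((time.perf_counter() - start) / runs * 1000))
+```
+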
+ +### [YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| YOLOv7-L | 640 | 32 | 300e | 7.4 | 51.0 | 70.2 | 37.62 | 106.08 |[model](https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_l_300e_coco.yml) | +| *YOLOv7u-L | 640 | 32 | 300e | 9.0 | 52.1 | 68.8 | 43.59 | 130.10 |[model](https://paddledet.bj.bcebos.com/models/yolov7u_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7u/yolov7u_l_300e_coco.yml) | +| *YOLOv7-X | 640 | 32 | 300e | 12.2 | 53.0 | 70.8 | 71.34 | 190.08 | [model](https://paddledet.bj.bcebos.com/models/yolov7_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_x_300e_coco.yml) | +| *YOLOv7P6-W6 | 1280 | 16 | 300e | 25.5 | 54.4 | 71.8 | 70.43 | 360.26 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_w6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_w6_300e_coco.yml) | +| *YOLOv7P6-E6 | 1280 | 10 | 300e | 31.1 | 55.7 | 73.0 | 97.25 | 515.4 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_e6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_e6_300e_coco.yml) | +| *YOLOv7P6-D6 | 1280 | 8 | 300e | 37.4 | 56.1 | 73.3 | 133.81 | 702.92 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_d6_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_d6_300e_coco.yml) | +| *YOLOv7P6-E6E | 1280 | 6 | 300e | 48.7 | 56.5 | 73.7 | 151.76 | 843.52 | [model](https://paddledet.bj.bcebos.com/models/yolov7p6_e6e_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7p6_e6e_300e_coco.yml) | +| YOLOv7-tiny | 640 | 32 | 300e | 2.4 | 37.3 | 54.5 | 6.23 | 13.80 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_300e_coco.yml) | +| YOLOv7-tiny | 416 | 32 | 300e | 1.3 | 33.3 | 49.5 | 6.23 | 5.82 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_416_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_416_300e_coco.yml) | +| YOLOv7-tiny | 320 | 32 | 300e | - | 29.1 | 43.8 | 6.23 | 3.46 |[model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_320_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7/yolov7_tiny_320_300e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| YOLOv7-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_l_300e_coco_wo_nms.onnx) | +| YOLOv7-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_x_300e_coco_wo_nms.onnx) | +| YOLOv7P6-W6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_w6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-E6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-D6 | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_d6_300e_coco_wo_nms.onnx) | +| YOLOv7P6-E6E | 1280 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7p6_e6e_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 416 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_wo_nms.zip) | [( w/ 
nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_416_300e_coco_wo_nms.onnx) | +| YOLOv7-tiny | 320 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/yolov7/yolov7_tiny_320_300e_coco_wo_nms.onnx) | + +
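+Once an export with fused NMS returns final detections, drawing them is straightforward. The sketch below assumes each detection row has the layout `[class_id, score, x1, y1, x2, y2]` in input-image coordinates; that layout and the `label_list` argument are assumptions for illustration, so verify them against the outputs of the export you actually use.
+
+```python
+# Sketch: draw detections from a "w/ nms" export with OpenCV. Assumes each row is
+# [class_id, score, x1, y1, x2, y2] and label_list is a list of class names;
+# check the real output layout of your export before relying on this.
+import cv2
+
+def draw(image, detections, label_list, score_thresh=0.5):
+    for cls_id, score, x1, y1, x2, y2 in detections:
+        if score < score_thresh:
+            continue
+        p1, p2 = (int(x1), int(y1)), (int(x2), int(y2))
+        cv2.rectangle(image, p1, p2, (0, 255, 0), 2)
+        text = "%s %.2f" % (label_list[int(cls_id)], score)
+        cv2.putText(image, text, (p1[0], max(p1[1] - 4, 0)),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+    return image
+
+# Example usage with the demo image shipped in this repo:
+# img = cv2.imread("demo/000000014439_640x640.jpg")
+# img = draw(img, dets, coco_label_list)
+# cv2.imwrite("vis.jpg", img)
+```
+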
+ + +### [YOLOv8](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| *YOLOv8-n | 640 | 16 | 500e | 1.8 | 37.3 | 53.0 | 3.16 | 8.7 | [model](https://paddledet.bj.bcebos.com/models/yolov8_n_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_n_300e_coco.yml) | +| *YOLOv8-s | 640 | 16 | 500e | 3.4 | 44.9 | 61.8 | 11.17 | 28.6 | [model](https://paddledet.bj.bcebos.com/models/yolov8_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_s_300e_coco.yml) | +| *YOLOv8-m | 640 | 16 | 500e | 6.5 | 50.2 | 67.3 | 25.90 | 78.9 | [model](https://paddledet.bj.bcebos.com/models/yolov8_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_m_300e_coco.yml) | +| *YOLOv8-l | 640 | 16 | 500e | 10.0 | 52.8 | 69.6 | 43.69 | 165.2 | [model](https://paddledet.bj.bcebos.com/models/yolov8_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_l_300e_coco.yml) | +| *YOLOv8-x | 640 | 16 | 500e | 15.1 | 53.8 | 70.6 | 68.23 | 257.8 | [model](https://paddledet.bj.bcebos.com/models/yolov8_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8_x_300e_coco.yml) | +| *YOLOv8-P6-x | 1280 | 16 | 500e | 55.0 | - | - | 97.42 | 522.93 | [model](https://paddledet.bj.bcebos.com/models/yolov8p6_x_500e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov8/yolov8p6_x_500e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(with nms) | Exported weights(exclude_nms)| ONNX(exclude_post_process) | +| :------ | :--------: | :------------------------: | :--------------------------: | :-------------------------: | +| YOLOv8-n | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco.onnx) | +| YOLOv8-s | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco.onnx) | +| YOLOv8-m | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco.onnx) | +| YOLOv8-l | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco.onnx) | +| YOLOv8-x | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco.onnx) | + +
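+Detections from these exports are produced at the network input size (640 here). If the image was plainly resized during preprocessing, boxes can be mapped back to the original image with per-axis scale factors, as in the sketch below; a letterbox pipeline would additionally need to remove the padding offset first, and exports that keep post-processing inside the graph may already return boxes in original-image coordinates, so check before rescaling twice.
+
+```python
+# Sketch: map boxes predicted at the network input size (e.g. 640x640) back to the
+# original image. Assumes plain resizing as in the preprocessing sketch above.
+import numpy as np
+
+def rescale_boxes(boxes, input_size, orig_h, orig_w):
+    boxes = np.asarray(boxes, dtype=np.float32).copy()
+    scale_x = orig_w / float(input_size)
+    scale_y = orig_h / float(input_size)
+    boxes[:, [0, 2]] *= scale_x                       # x1, x2
+    boxes[:, [1, 3]] *= scale_y                       # y1, y2
+    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, orig_w)
+    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, orig_h)
+    return boxes
+```
+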
+ + +### [RTMDet](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAPval
0.5:0.95 | mAPval
0.5 | Params(M) | FLOPs(G) | download | config | +| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: | +| *RTMDet-t | 640 | 32 | 300e | 2.8 | 40.9 | 57.9 | 4.90 | 16.21 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_t_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_t_300e_coco.yml) | +| *RTMDet-s | 640 | 32 | 300e | 3.3 | 44.5 | 62.0 | 8.89 | 29.71 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_s_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_s_300e_coco.yml) | +| *RTMDet-m | 640 | 32 | 300e | 6.4 | 49.1 | 66.8 | 24.71 | 78.47 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_m_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_m_300e_coco.yml) | +| *RTMDet-l | 640 | 32 | 300e | 10.2 | 51.2 | 68.8 | 52.31 | 160.32 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_l_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_l_300e_coco.yml) | +| *RTMDet-x | 640 | 32 | 300e | 18.0 | 52.6 | 70.4 | 94.86 | 283.12 |[model](https://paddledet.bj.bcebos.com/models/rtmdet_x_300e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/rtmdet/rtmdet_x_300e_coco.yml) | + +
+ +
+ Deploy Models + +| Model | Input Size | Exported weights(w/o NMS) | ONNX(w/o NMS) | +| :-------- | :--------: | :---------------------: | :----------------: | +| RTMDet-t | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_t_300e_coco_wo_nms.onnx) | +| RTMDet-s | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_s_300e_coco_wo_nms.onnx) | +| RTMDet-m | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_m_300e_coco_wo_nms.onnx) | +| RTMDet-l | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_l_300e_coco_wo_nms.onnx) | +| RTMDet-x | 640 | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_w_nms.zip) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_wo_nms.zip) | [( w/ nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_w_nms.onnx) | [( w/o nms)](https://paddledet.bj.bcebos.com/deploy/yoloseries/rtmdet/rtmdet_x_300e_coco_wo_nms.onnx) | + +
+ + +### **Notes:** + - All the models are trained on COCO train2017 dataset and evaluated on val2017 dataset. The * in front of the model indicates that the training is being updated. + - Please check the specific accuracy and speed details in [PP-YOLOE](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/ppyoloe),[YOLOX](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolox),[YOLOv5](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov5),[YOLOv6](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov6),[YOLOv7](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/yolov7). **Note that YOLOv5, YOLOv6 and YOLOv7 have not adopted `multi_label` to eval**. +- TRT-FP16-Latency(ms) is the time spent in testing under TensorRT-FP16, **excluding data preprocessing and model output post-processing (NMS)**. The test adopts single card **Tesla T4 GPU, batch size=1**, and the test environment is **paddlepaddle-2.3.2**, **CUDA 11.2**, **CUDNN 8.2**, **GCC-8.2**, **TensorRT 8.0.3.4**. Please refer to the respective model homepage for details. +- For **FLOPs(G) and Params(M)**, you should first install [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), `pip install paddleslim`, then set `print_flops: True` and `print_params: True` in [runtime.yml](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/runtime.yml). Make sure **single scale** like 640x640, **MACs are printed,FLOPs=2*MACs**. + - Based on [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), quantitative training of YOLO series models can achieve basically lossless accuracy and generally improve the speed by more than 30%. For details, please refer to [auto_compression](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/auto_compression). + + +### [VOC](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc) + +
+ Baseline + +| Model | Input Size | images/GPU | Epoch | TRT-FP16-Latency(ms) | mAP(0.50,11point) | Params(M) | FLOPs(G) | download | config | +| :-----------: | :-------: | :-------: | :------: | :------------: | :---------------: | :------------------: |:-----------------: | :------: | :------: | +| YOLOv5-s | 640 | 16 | 60e | 3.2 | 80.3 | 7.24 | 16.54 | [model](https://paddledet.bj.bcebos.com/models/yolov5_s_60e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolov5_s_60e_voc.yml) | +| YOLOv7-tiny | 640 | 32 | 60e | 2.6 | 80.2 | 6.23 | 6.90 | [model](https://paddledet.bj.bcebos.com/models/yolov7_tiny_60e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolov7_tiny_60e_voc.yml) | +| YOLOX-s | 640 | 8 | 40e | 3.0 | 82.9 | 9.0 | 26.8 | [model](https://paddledet.bj.bcebos.com/models/yolox_s_40e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/yolox_s_40e_voc.yml) | +| PP-YOLOE+_s | 640 | 8 | 30e | 2.9 | 86.7 | 7.93 | 17.36 | [model](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_30e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/voc/ppyoloe_plus_crn_s_30e_voc.yml) | + +
+ +**Note:** + - The VOC mAP is `mAP(IoU=0.5)`, and all the models **have not adopted `multi_label` to eval**. + - All YOLO VOC models are loaded with the COCO weights of their respective models as pre-train weights. Each config file uses 8 GPUs by default, which can be used as a reference for setting custom datasets. The specific mAP will vary depending on the datasets; + - We recommend to use YOLO detection model **with a total `batch_size` at least greater than `64` to train**. If the resources are insufficient, please **use the smaller model** or **reduce the input size of the model**. To ensure high detection accuracy, **you'd better not try to using single GPU or total `batch_size` less than `64` for training**; + - Params (M) and FLOPs (G) are measured during training. YOLOv7 has no s model, so tiny model is selected; + - For TRT-FP16 Latency (ms) speed measurement, please refer to the config homepage of each YOLO model; + + +## UserGuide + +Download MS-COCO dataset, [official website](https://cocodataset.org). The download links are: [annotations](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), [train2017](http://images.cocodataset.org/zips/train2017.zip), [val2017](http://images.cocodataset.org/zips/val2017.zip), [test2017](http://images.cocodataset.org/zips/test2017.zip). +The download link provided by PaddleDetection team is: [coco](https://bj.bcebos.com/v1/paddledet/data/coco.tar)(about 22G) and [test2017](https://bj.bcebos.com/v1/paddledet/data/cocotest2017.zip). Note that test2017 is optional, and the evaluation is based on val2017. + + +### **Pipeline** + +Write the following commands in a script file, such as ```run.sh```, and run as:```sh run.sh```. You can also run the command line sentence by sentence. + +```bash +model_name=ppyoloe # yolov7 +job_name=ppyoloe_plus_crn_s_80e_coco # yolov7_tiny_300e_coco + +config=configs/${model_name}/${job_name}.yml +log_dir=log_dir/${job_name} +# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams +weights=output/${job_name}/model_final.pdparams + +# 1.training(single GPU / multi GPU) +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp +python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp + +# 2.eval +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise + +# 3.infer +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5 +# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5 + +# 4.export +CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True + +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True + +# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True + +# 5.deploy infer +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU + +# 6.deploy speed, add '--run_mode=trt_fp16' to test in TensorRT FP16 mode +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16 + +# 7.export onnx +paddle2onnx --model_dir 
output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx + +# 8.onnx speed +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16 +/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32 +``` + +**Note:** +- If you want to switch models, just modify the first two lines, such as: + ``` + model_name=yolov7 + job_name=yolov7_tiny_300e_coco + ``` +- For **exporting onnx**, you should install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) by `pip install paddle2onnx` at first. +- For **FLOPs(G) and Params(M)**, you should install [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) by `pip install paddleslim` at first, then set `print_flops: True` and `print_params: True` in [runtime.yml](https://github.com/PaddlePaddle/PaddleYOLO/tree/release/2.6/configs/runtime.yml). Make sure **single scale** like 640x640, **MACs are printed,FLOPs=2*MACs**. + + +### CustomDataset + +#### preparation: + +1.For the annotation of custom dataset, please refer to[DetAnnoTools](../tutorials/data/DetAnnoTools.md); + +2.For training preparation of custom dataset,please refer to[PrepareDataSet](../tutorials/PrepareDataSet.md). + + +#### fintune: + +In addition to changing the path of the dataset, it is generally recommended to load **the COCO pre training weight of the corresponding model** to fintune, which will converge faster and achieve higher accuracy, such as: + +```base +# fintune with single GPU: +# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp -o pretrain_weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams + +# fintune with multi GPU: +python -m paddle.distributed.launch --log_dir=./log_dir --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp -o pretrain_weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams +``` + +**Note:** +- The fintune training will show that the channels of the last layer of the head classification branch is not matched, which is a normal situation, because the number of custom dataset is generally inconsistent with that of COCO dataset; +- In general, the number of epochs for fintune training can be set less, and the lr setting is also smaller, such as 1/10. The highest accuracy may occur in one of the middle epochs; + +#### Predict and export: + +When using custom dataset to predict and export models, if the path of the TestDataset dataset is set incorrectly, COCO 80 categories will be used by default. + +In addition to the correct path setting of the TestDataset dataset, you can also modify and add the corresponding `label_list`. 
Txt file (one category is recorded in one line), and `anno_path` in TestDataset can also be set as an absolute path, such as: +``` +TestDataset: + !ImageFolder + anno_path: label_list.txt # if not set dataset_dir, the anno_path will be relative path of PaddleDetection root directory + # dataset_dir: dataset/my_coco # if set dataset_dir, the anno_path will be dataset_dir/anno_path +``` +one line in `label_list.txt` records a corresponding category: +``` +person +vehicle +``` diff --git a/docs/advanced_tutorials/MODEL_TECHNICAL.md b/docs/advanced_tutorials/MODEL_TECHNICAL.md new file mode 100644 index 0000000000000000000000000000000000000000..1d3e58d909c9e5ae48028c4bc0beb71ad08bf363 --- /dev/null +++ b/docs/advanced_tutorials/MODEL_TECHNICAL.md @@ -0,0 +1,407 @@ +# 新增模型算法 +为了让用户更好的使用PaddleDetection,本文档中,我们将介绍PaddleDetection的主要模型技术细节及应用 + +## 目录 +- [1.简介](#1.简介) +- [2.新增模型](#2.新增模型) + - [2.1新增网络结构](#2.1新增网络结构) + - [2.1.1新增Backbone](#2.1.1新增Backbone) + - [2.1.2新增Neck](#2.1.2新增Neck) + - [2.1.3新增Head](#2.1.3新增Head) + - [2.1.4新增Loss](#2.1.4新增Loss) + - [2.1.5新增后处理模块](#2.1.5新增后处理模块) + - [2.1.6新增Architecture](#2.1.6新增Architecture) + - [2.2新增配置文件](#2.2新增配置文件) + - [2.2.1网络结构配置文件](#2.2.1网络结构配置文件) + - [2.2.2优化器配置文件](#2.2.2优化器配置文件) + - [2.2.3Reader配置文件](#2.2.3Reader配置文件) + +### 1.简介 +PaddleDetecion中的每一种模型对应一个文件夹,以yolov3为例,yolov3系列的模型对应于`configs/yolov3`文件夹,其中yolov3_darknet的总配置文件`configs/yolov3/yolov3_darknet53_270e_coco.yml`的内容如下: +``` +_BASE_: [ + '../datasets/coco_detection.yml', # 数据集配置文件,所有模型共用 + '../runtime.yml', # 运行时相关配置 + '_base_/optimizer_270e.yml', # 优化器相关配置 + '_base_/yolov3_darknet53.yml', # yolov3网络结构配置文件 + '_base_/yolov3_reader.yml', # yolov3 Reader模块配置 +] + +# 定义在此处的相关配置可以覆盖上述文件中的同名配置 +snapshot_epoch: 5 +weights: output/yolov3_darknet53_270e_coco/model_final +``` +可以看到,配置文件中的模块进行了清晰的划分,除了公共的数据集配置以及运行时配置,其他配置被划分为优化器,网络结构以及Reader模块。PaddleDetection中支持丰富的优化器,学习率调整策略,预处理算子等,因此大多数情况下不需要编写优化器以及Reader相关的代码,而只需要在配置文件中配置即可。因此,新增一个模型的主要在于搭建网络结构。 + +PaddleDetection网络结构的代码在`ppdet/modeling/`中,所有网络结构以组件的形式进行定义与组合,网络结构的主要构成如下所示: +``` + ppdet/modeling/ + ├── architectures + │ ├── faster_rcnn.py # Faster Rcnn模型 + │ ├── ssd.py # SSD模型 + │ ├── yolo.py # YOLOv3模型 + │ │ ... + ├── heads # 检测头模块 + │ ├── xxx_head.py # 定义各类检测头 + │ ├── roi_extractor.py #检测感兴趣区域提取 + ├── backbones # 基干网络模块 + │ ├── resnet.py # ResNet网络 + │ ├── mobilenet.py # MobileNet网络 + │ │ ... 
+ ├── losses # 损失函数模块 + │ ├── xxx_loss.py # 定义注册各类loss函数 + ├── necks # 特征融合模块 + │ ├── xxx_fpn.py # 定义各种FPN模块 + ├── proposal_generator # anchor & proposal生成与匹配模块 + │ ├── anchor_generator.py # anchor生成模块 + │ ├── proposal_generator.py # proposal生成模块 + │ ├── target.py # anchor & proposal的匹配函数 + │ ├── target_layer.py # anchor & proposal的匹配模块 + ├── tests # 单元测试模块 + │ ├── test_xxx.py # 对网络中的算子以及模块结构进行单元测试 + ├── ops.py # 封装各类PaddlePaddle物体检测相关公共检测组件/算子 + ├── layers.py # 封装及注册各类PaddlePaddle物体检测相关公共检测组件/算子 + ├── bbox_utils.py # 封装检测框相关的函数 + ├── post_process.py # 封装及注册后处理相关模块 + ├── shape_spec.py # 定义模块输出shape的类 +``` + +![](../images/model_figure.png) + +### 2.新增模型 +接下来,以单阶段检测器YOLOv3为例,对建立模型过程进行详细描述,按照此思路您可以快速搭建新的模型。 + +#### 2.1新增网络结构 + +##### 2.1.1新增Backbone + +PaddleDetection中现有所有Backbone网络代码都放置在`ppdet/modeling/backbones`目录下,所以我们在其中新建`darknet.py`如下: +```python +import paddle.nn as nn +from ppdet.core.workspace import register, serializable + +@register +@serializable +class DarkNet(nn.Layer): + + __shared__ = ['norm_type'] + + def __init__(self, + depth=53, + return_idx=[2, 3, 4], + norm_type='bn', + norm_decay=0.): + super(DarkNet, self).__init__() + # 省略内容 + + def forward(self, inputs): + # 省略处理逻辑 + pass + + @property + def out_shape(self): + # 省略内容 + pass +``` +然后在`backbones/__init__.py`中加入引用: +```python +from . import darknet +from .darknet import * +``` +**几点说明:** +- 为了在yaml配置文件中灵活配置网络,所有Backbone需要利用`ppdet.core.workspace`里的`register`进行注册,形式请参考如上示例。此外,可以使用`serializable`以使backbone支持序列化; +- 所有的Backbone需继承`paddle.nn.Layer`类,并实现forward函数。此外,还需实现out_shape属性定义输出的feature map的channel信息,具体可参见源码; +- `__shared__`为了实现一些参数的配置全局共享,这些参数可以被backbone, neck,head,loss等所有注册模块共享。 + +##### 2.1.2新增Neck +特征融合模块放置在`ppdet/modeling/necks`目录下,我们在其中新建`yolo_fpn.py`如下: + +``` python +import paddle.nn as nn +from ppdet.core.workspace import register, serializable + +@register +@serializable +class YOLOv3FPN(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + in_channels=[256, 512, 1024], + norm_type='bn'): + super(YOLOv3FPN, self).__init__() + # 省略内容 + + def forward(self, blocks): + # 省略内容 + pass + + @classmethod + def from_config(cls, cfg, input_shape): + # 省略内容 + pass + + @property + def out_shape(self): + # 省略内容 + pass +``` +然后在`necks/__init__.py`中加入引用: +```python +from . import yolo_fpn +from .yolo_fpn import * +``` +**几点说明:** +- neck模块需要使用`register`进行注册,可以使用`serializable`进行序列化; +- neck模块需要继承`paddle.nn.Layer`类,并实现forward函数。除此之外,还需要实现`out_shape`属性,用于定义输出的feature map的channel信息,还需要实现类函数`from_config`用于在配置文件中推理出输入channel,并用于`YOLOv3FPN`的初始化; +- neck模块可以使用`__shared__`实现一些参数的配置全局共享。 + +##### 2.1.3新增Head +Head模块全部存放在`ppdet/modeling/heads`目录下,我们在其中新建`yolo_head.py`如下 +``` python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class YOLOv3Head(nn.Layer): + __shared__ = ['num_classes'] + __inject__ = ['loss'] + + def __init__(self, + anchors=[[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45],[59, 119], + [116, 90], [156, 198], [373, 326]], + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + num_classes=80, + loss='YOLOv3Loss', + iou_aware=False, + iou_aware_factor=0.4): + super(YOLOv3Head, self).__init__() + # 省略内容 + + def forward(self, feats, targets=None): + # 省略内容 + pass +``` +然后在`heads/__init__.py`中加入引用: +```python +from . 
import yolo_head +from .yolo_head import * +``` +**几点说明:** +- Head模块需要使用`register`进行注册; +- Head模块需要继承`paddle.nn.Layer`类,并实现forward函数。 +- `__inject__`表示引入全局字典中已经封装好的模块。如loss等。 + +##### 2.1.4新增Loss +Loss模块全部存放在`ppdet/modeling/losses`目录下,我们在其中新建`yolo_loss.py`下 +```python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class YOLOv3Loss(nn.Layer): + + __inject__ = ['iou_loss', 'iou_aware_loss'] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + ignore_thresh=0.7, + label_smooth=False, + downsample=[32, 16, 8], + scale_x_y=1., + iou_loss=None, + iou_aware_loss=None): + super(YOLOv3Loss, self).__init__() + # 省略内容 + + def forward(self, inputs, targets, anchors): + # 省略内容 + pass +``` +然后在`losses/__init__.py`中加入引用: +```python +from . import yolo_loss +from .yolo_loss import * +``` +**几点说明:** +- loss模块需要使用`register`进行注册; +- loss模块需要继承`paddle.nn.Layer`类,并实现forward函数。 +- 可以使用`__inject__`表示引入全局字典中已经封装好的模块,使用`__shared__`可以实现一些参数的配置全局共享。 + +##### 2.1.5新增后处理模块 +后处理模块定义在`ppdet/modeling/post_process.py`中,其中定义了`BBoxPostProcess`类来进行后处理操作,如下所示: +``` python +from ppdet.core.workspace import register + +@register +class BBoxPostProcess(object): + __shared__ = ['num_classes'] + __inject__ = ['decode', 'nms'] + + def __init__(self, num_classes=80, decode=None, nms=None): + # 省略内容 + pass + + def __call__(self, head_out, rois, im_shape, scale_factor): + # 省略内容 + pass +``` +**几点说明:** +- 后处理模块需要使用`register`进行注册 +- `__inject__`注入了全局字典中封装好的模块,如decode和nms等。decode和nms定义在`ppdet/modeling/layers.py`中。 + +##### 2.1.6新增Architecture + +所有architecture网络代码都放置在`ppdet/modeling/architectures`目录下,`meta_arch.py`中定义了`BaseArch`类,代码如下: +``` python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class BaseArch(nn.Layer): + def __init__(self): + super(BaseArch, self).__init__() + + def forward(self, inputs): + self.inputs = inputs + self.model_arch() + + if self.training: + out = self.get_loss() + else: + out = self.get_pred() + return out + + def model_arch(self, ): + pass + + def get_loss(self, ): + raise NotImplementedError("Should implement get_loss method!") + + def get_pred(self, ): + raise NotImplementedError("Should implement get_pred method!") +``` +所有的architecture需要继承`BaseArch`类,如`yolo.py`中的`YOLOv3`定义如下: +``` python +@register +class YOLOv3(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='DarkNet', + neck='YOLOv3FPN', + yolo_head='YOLOv3Head', + post_process='BBoxPostProcess'): + super(YOLOv3, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # 省略内容 + pass + + def get_loss(self): + # 省略内容 + pass + + def get_pred(self): + # 省略内容 + pass +``` + +**几点说明:** +- 所有的architecture需要使用`register`进行注册 +- 在组建一个完整的网络时必须要设定`__category__ = 'architecture'`来表示一个完整的物体检测模型; +- backbone, neck, yolo_head以及post_process等检测组件传入到architecture中组成最终的网络。像这样将检测模块化,提升了检测模型的复用性,可以通过组合不同的检测组件得到多个模型。 +- from_config类函数实现了模块间组合时channel的自动配置。 + +#### 2.2新增配置文件 + +##### 2.2.1网络结构配置文件 +上面详细地介绍了如何新增一个architecture,接下来演示如何配置一个模型,yolov3关于网络结构的配置在`configs/yolov3/_base_/`文件夹中定义,如`yolov3_darknet53.yml`定义了yolov3_darknet的网络结构,其定义如下: +``` +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/DarkNet53_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: DarkNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +DarkNet: + 
depth: 53 + return_idx: [2, 3, 4] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 + +``` +可以看到在配置文件中,首先需要指定网络的architecture,pretrain_weights指定训练模型的url或者路径,norm_type等可以作为全局参数共享。模型的定义自上而下依次在文件中定义,与上节中的模型组件一一对应。对于一些模型组件,如果采用默认 +的参数,可以不用配置,如上文中的`yolo_fpn`。通过改变相关配置,我们可以轻易地组合出另一个模型,比如`configs/yolov3/_base_/yolov3_mobilenet_v1.yml`将backbone从Darknet切换成MobileNet。 + +##### 2.2.2优化器配置文件 +优化器配置文件定义模型使用的优化器以及学习率的调度策略,目前PaddleDetection中已经集成了多种多样的优化器和学习率策略,具体可参见代码`ppdet/optimizer.py`。比如,yolov3的优化器配置文件定义在`configs/yolov3/_base_/optimizer_270e.yml`,其定义如下: +``` +epoch: 270 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + # epoch数目 + - 216 + - 243 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 +``` +**几点说明:** +- 可以通过OptimizerBuilder.optimizer指定优化器的类型及参数,目前支持的优化器可以参考[PaddlePaddle官方文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html) +- 可以设置LearningRate.schedulers设置不同学习率调整策略的组合,PaddlePaddle目前支持多种学习率调整策略,具体也可参考[PaddlePaddle官方文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html)。需要注意的是,你需要对于PaddlePaddle中的学习率调整策略进行简单的封装,具体可参考源码`ppdet/optimizer.py`。 + +##### 2.2.3Reader配置文件 +关于Reader的配置可以参考[Reader配置文档](./READER.md#5.配置及运行)。 + +> 看过此文档,您应该对PaddleDetection中模型搭建与配置有了一定经验,结合源码会理解的更加透彻。关于模型技术,如您有其他问题或建议,请给我们提issue,我们非常欢迎您的反馈。 diff --git a/docs/advanced_tutorials/MODEL_TECHNICAL_en.md b/docs/advanced_tutorials/MODEL_TECHNICAL_en.md new file mode 100644 index 0000000000000000000000000000000000000000..927a08596cd5086c3c610779f12b4b99421cce87 --- /dev/null +++ b/docs/advanced_tutorials/MODEL_TECHNICAL_en.md @@ -0,0 +1,409 @@ +# How to Create Model Algorithm +In order to make better use of PaddleDetection, we will introduce the main model technical details and application of PaddleDetection in this document + +## Directory +- [How to Create Model Algorithm](#how-to-create-model-algorithm) + - [Directory](#directory) + - [1. Introduction](#1-introduction) + - [2. Create Model](#2-create-model) + - [2.1 Create Model Structure](#21-create-model-structure) + - [2.1.1 Create Backbone](#211-create-backbone) + - [2.1.2 Create Neck](#212-create-neck) + - [2.1.3 Create Head](#213-create-head) + - [2.1.4 Create Loss](#214-create-loss) + - [2.1.5 Create Post-processing Module](#215-create-post-processing-module) + - [2.1.6 Create Architecture](#216-create-architecture) + - [2.2 Create Configuration File](#22-create-configuration-file) + - [2.2.1 Network Structure Configuration File](#221-network-structure-configuration-file) + - [2.2.2 Optimizer configuration file](#222-optimizer-configuration-file) + - [2.2.3 Reader Configuration File](#223-reader-configuration-file) + +### 1. Introduction +Each model in the PaddleDetecion corresponds to a folder. In the case of Yolov3, models in the Yolov3 family correspond to the `configs/yolov3` folder. 
Yolov3 Darknet's general configuration file `configs/yolov3/yolov3_darknet53_270e_coco.yml`. +``` +_BASE_: [ + '../datasets/coco_detection.yml', # Dataset configuration file shared by all models + '../runtime.yml', # Runtime configuration + '_base_/optimizer_270e.yml', # Optimizer related configuration + '_base_/yolov3_darknet53.yml', # yolov3 Network structure configuration file + '_base_/yolov3_reader.yml', # yolov3 Reader module configuration +] + +# The relevant configuration defined here can override the configuration of the same name in the above file +snapshot_epoch: 5 +weights: output/yolov3_darknet53_270e_coco/model_final +``` +As you can see, the modules in the configuration file are clearly divided into optimizer, network structure, and reader modules, with the exception of the common dataset configuration and runtime configuration. Rich optimizers, learning rate adjustment strategies, preprocessing operators, etc., are supported in PaddleDetection, so most of the time you don't need to write the optimizer and reader-related code, just configure it in the configuration file. Therefore, the main purpose of adding a new model is to build the network structure. + +In `ppdet/modeling/`, all of the Paddle Detection network structures are defined and combined in the form of components. The main components of the network structure are as follows: +``` + ppdet/modeling/ + ├── architectures + │ ├── faster_rcnn.py # Faster Rcnn model + │ ├── ssd.py # SSD model + │ ├── yolo.py # YOLOv3 model + │ │ ... + ├── heads # detection head module + │ ├── xxx_head.py # define various detection heads + │ ├── roi_extractor.py # detection of region of interest extraction + ├── backbones # backbone network module + │ ├── resnet.py # ResNet network + │ ├── mobilenet.py # MobileNet network + │ │ ... + ├── losses # loss function module + │ ├── xxx_loss.py # define and register various loss functions + ├── necks # feature fusion module + │ ├── xxx_fpn.py # define various FPN modules + ├── proposal_generator # anchor & proposal generate and match modules + │ ├── anchor_generator.py # anchor generate modules + │ ├── proposal_generator.py # proposal generate modules + │ ├── target.py # anchor & proposal Matching function + │ ├── target_layer.py # anchor & proposal Matching function + ├── tests # unit test module + │ ├── test_xxx.py # the operator and module structure in the network are unit tested + ├── ops.py # encapsulates all kinds of common detection components/operators related to the detection of PaddlePaddle objects + ├── layers.py # encapsulates and register all kinds of PaddlePaddle object detection related public detection components/operators + ├── bbox_utils.py # encapsulates the box-related functions + ├── post_process.py # encapsulate and process related modules after registration + ├── shape_spec.py # defines a class for the module to output shape +``` + +![](../images/model_figure.png) + +### 2. Create Model +Next, the modeling process is described in detail by taking the single-stage detector YOLOv3 as an example, so that you can quickly build a new model according to this idea. 
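+
+Before diving into the individual components, the short sketch below shows how such a configuration file and the registered modules come together at runtime. It is only a minimal illustration: the config path is just an example, and the authoritative assembly logic lives in `tools/train.py` and `ppdet/engine/trainer.py`.
+```python
+from ppdet.core.workspace import load_config, create
+
+# Parse the full YAML configuration (architecture, dataset, reader, optimizer).
+cfg = load_config('configs/yolov3/yolov3_darknet53_270e_coco.yml')
+
+# `create` looks up the registered class named by `cfg.architecture` ('YOLOv3' here)
+# and recursively builds its backbone, neck, head and post_process from the same config.
+model = create(cfg.architecture)
+```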
+ +#### 2.1 Create Model Structure + +##### 2.1.1 Create Backbone + +All existing Backbone network code in PaddleDetection is placed under `ppdet/modeling/backbones` directory, so we created `darknet.py` as follows: +```python +import paddle.nn as nn +from ppdet.core.workspace import register, serializable + +@register +@serializable +class DarkNet(nn.Layer): + + __shared__ = ['norm_type'] + + def __init__(self, + depth=53, + return_idx=[2, 3, 4], + norm_type='bn', + norm_decay=0.): + super(DarkNet, self).__init__() + # Omit the content + + def forward(self, inputs): + # Ellipsis processing logic + pass + + @property + def out_shape(self): + # Omit the content + pass +``` +Then add a reference to `backbones/__init__.py`: +```python +from . import darknet +from .darknet import * +``` +**A few notes:** +- To flexibly configure networks in the YAML configuration file, all backbone nodes need to register in `ppdet.core.workspace` as shown in the preceding example. In addition, `serializable` can be used to enable backbone to support serialization; +- All backbone needs to inherit the `paddle.nn.Layer` class and implement the forward function. In addition, it is necessary to implement the out shape attribute to define the channel information of the output feature map. For details, please refer to the source code. +- `__shared__` To realize global sharing of configuration parameters, these parameters can be shared by all registration modules, such as backbone, neck, head, and loss. + +##### 2.1.2 Create Neck +The feature fusion module is placed under the `ppdet/modeling/necks` directory and we create the following `yolo_fpn.py`: + +``` python +import paddle.nn as nn +from ppdet.core.workspace import register, serializable + +@register +@serializable +class YOLOv3FPN(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + in_channels=[256, 512, 1024], + norm_type='bn'): + super(YOLOv3FPN, self).__init__() + # Omit the content + + def forward(self, blocks): + # Omit the content + pass + + @classmethod + def from_config(cls, cfg, input_shape): + # Omit the content + pass + + @property + def out_shape(self): + # Omit the content + pass +``` +Then add a reference to `necks/__init__.py`: +```python +from . import yolo_fpn +from .yolo_fpn import * +``` +**A few notes:** +- The neck module needs to be registered with `register` and can be serialized with `serializable`. +- The neck module needs to inherit the `paddle.nn.Layer` class and implement the forward function. In addition, the `out_shape` attribute needs to be implemented to define the channel information of the output feature map, and the class function `from_config` needs to be implemented to deduce the input channel in the configuration file and initialize `YOLOv3FPN`. +- The neck module can use `shared` to implement global sharing of configuration parameters. 
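+
+To make the `out_shape`/`from_config` contract more concrete, here is a stripped-down, hypothetical neck. `MyFPN` and its channel arithmetic are illustrative only, and the `ShapeSpec` import path is an assumption based on `ppdet/modeling/shape_spec.py`:
+```python
+import paddle.nn as nn
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.shape_spec import ShapeSpec
+
+@register
+@serializable
+class MyFPN(nn.Layer):
+    def __init__(self, in_channels=[256, 512, 1024], norm_type='bn'):
+        super(MyFPN, self).__init__()
+        self.in_channels = in_channels
+        # Assume each output level halves the input channels (an illustrative choice).
+        self._out_channels = [c // 2 for c in in_channels]
+        # ... build the actual conv blocks here ...
+
+    def forward(self, blocks):
+        # ... fuse the backbone feature maps and return one tensor per level ...
+        pass
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        # Derive in_channels from the backbone's out_shape so the YAML need not repeat it.
+        return {'in_channels': [s.channels for s in input_shape]}
+
+    @property
+    def out_shape(self):
+        # Expose output channels so the head can be configured the same way.
+        return [ShapeSpec(channels=c) for c in self._out_channels]
+```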
+ +##### 2.1.3 Create Head +The head module is all stored in the `ppdet/modeling/heads` directory, where we create `yolo_head.py` as follows +``` python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class YOLOv3Head(nn.Layer): + __shared__ = ['num_classes'] + __inject__ = ['loss'] + + def __init__(self, + anchors=[[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45],[59, 119], + [116, 90], [156, 198], [373, 326]], + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + num_classes=80, + loss='YOLOv3Loss', + iou_aware=False, + iou_aware_factor=0.4): + super(YOLOv3Head, self).__init__() + # Omit the content + + def forward(self, feats, targets=None): + # Omit the content + pass +``` +Then add a reference to `heads/__init__.py`: +```python +from . import yolo_head +from .yolo_head import * +``` +**A few notes:** +- The head module needs to register with `register`. +- The head module needs to inherit the `paddle.nn.Layer` class and implement the forward function. +- `__inject__` indicates that the module encapsulated in the global dictionary is imported. Such as loss, etc. + +##### 2.1.4 Create Loss +The loss modules are all stored under `ppdet/modeling/losses` directory, where we created `yolo_loss.py` +```python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class YOLOv3Loss(nn.Layer): + + __inject__ = ['iou_loss', 'iou_aware_loss'] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + ignore_thresh=0.7, + label_smooth=False, + downsample=[32, 16, 8], + scale_x_y=1., + iou_loss=None, + iou_aware_loss=None): + super(YOLOv3Loss, self).__init__() + # Omit the content + + def forward(self, inputs, targets, anchors): + # Omit the content + pass +``` +Then add a reference to `losses/__init__.py`: +```python +from . import yolo_loss +from .yolo_loss import * +``` +**A few notes:** +- The loss module needs to register with `register`. +- The loss module needs to inherit the `paddle.nn.Layer` class and implement the forward function. +- `__inject__` modules that have been encapsulated in the global dictionary can be used. Some parameters can be globally shared with `__shared__` configuration. + +##### 2.1.5 Create Post-processing Module +The post-processing module is defined in `ppdet/modeling/post_process.py`, where the `BBoxPostProcess` class is defined for post-processing operations, as follows: +``` python +from ppdet.core.workspace import register + +@register +class BBoxPostProcess(object): + __shared__ = ['num_classes'] + __inject__ = ['decode', 'nms'] + + def __init__(self, num_classes=80, decode=None, nms=None): + # Omit the content + pass + + def __call__(self, head_out, rois, im_shape, scale_factor): + # Omit the content + pass +``` +**A few notes:** +- Post-processing modules need to register with `register` +- `__inject__` modules encapsulated in the global dictionary, such as decode and NMS. Decode and NMS are defined in `ppdet/modeling/layers.py`. 
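+
+To show how the injected `decode` and `nms` objects are actually used, the sketch below fills in a simplified body for `__call__`. `MyBBoxPostProcess` is hypothetical, and the signatures and return values are simplified assumptions; the real logic (including export and empty-result handling) lives in `ppdet/modeling/post_process.py`:
+```python
+from ppdet.core.workspace import register
+
+@register
+class MyBBoxPostProcess(object):
+    __shared__ = ['num_classes']
+    __inject__ = ['decode', 'nms']
+
+    def __init__(self, num_classes=80, decode=None, nms=None):
+        # `decode` and `nms` arrive as ready-made objects (e.g. YOLOBox and MultiClassNMS
+        # from ppdet/modeling/layers.py), resolved by name from the global config.
+        self.num_classes = num_classes
+        self.decode = decode
+        self.nms = nms
+
+    def __call__(self, head_out, rois, im_shape, scale_factor):
+        # Decode raw head outputs into boxes and per-class scores at the input image scale,
+        # then let NMS pick the final detections.
+        bboxes, scores = self.decode(head_out, rois, im_shape, scale_factor)
+        bbox_pred, bbox_num, _ = self.nms(bboxes, scores, self.num_classes)
+        return bbox_pred, bbox_num
+```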
+ +##### 2.1.6 Create Architecture + +All architecture network code is placed in `ppdet/modeling/architectures` directory, `meta_arch.py` defines the `BaseArch` class, the code is as follows: +``` python +import paddle.nn as nn +from ppdet.core.workspace import register + +@register +class BaseArch(nn.Layer): + def __init__(self): + super(BaseArch, self).__init__() + + def forward(self, inputs): + self.inputs = inputs + self.model_arch() + + if self.training: + out = self.get_loss() + else: + out = self.get_pred() + return out + + def model_arch(self, ): + pass + + def get_loss(self, ): + raise NotImplementedError("Should implement get_loss method!") + + def get_pred(self, ): + raise NotImplementedError("Should implement get_pred method!") +``` +All architecture needs to inherit from the `BaseArch` class, as defined by `yolo.py` in `YOLOv3` as follows: +``` python +@register +class YOLOv3(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='DarkNet', + neck='YOLOv3FPN', + yolo_head='YOLOv3Head', + post_process='BBoxPostProcess'): + super(YOLOv3, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # Omit the content + pass + + def get_loss(self): + # Omit the content + pass + + def get_pred(self): + # Omit the content + pass +``` + +**A few notes:** +- All architecture needs to be registered using a `register` +- When constructing a complete network, `__category__ = 'architecture'` must be set to represent a complete object detection model; +- Backbone, neck, YOLO head, post-process and other inspection components are passed into the architecture to form the final network. Modularization of detection like this improves the reusability of detection models, and multiple models can be obtained by combining different detection components. +- The from config class function implements the automatic configuration of channels when modules are combined. + +#### 2.2 Create Configuration File + +##### 2.2.1 Network Structure Configuration File +The configuration of the yolov3 network structure is defined in the `configs/yolov3/_base_/` folder. For example, `yolov3_darknet53.yml` defines the network structure of Yolov3 Darknet as follows: +``` +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/DarkNet53_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: DarkNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +DarkNet: + depth: 53 + return_idx: [2, 3, 4] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 + +``` +In the configuration file, you need to specify the network architecture, pretrain weights to specify the URL or path of the training model, and norm type to share as global parameters. The definition of the model is defined in the file from top to bottom, corresponding to the model components in the previous section. 
For some model components, if the default parameters are used, you do not need to configure them, such as `yolo_fpn` above. By changing related configuration, we can easily combine another model, such as `configs/yolov3/_base_/yolov3_mobilenet_v1.yml` to switch backbone from Darknet to MobileNet. + +##### 2.2.2 Optimizer configuration file +The optimizer profile defines the optimizer used by the model and the learning rate scheduling strategy. Currently, a variety of optimizers and learning rate strategies have been integrated in PaddleDetection, as described in the code `ppdet/optimizer.py`. For example, the optimizer configuration file for yolov3 is defined in `configs/yolov3/_base_/optimizer_270e.yml` as follows: +``` +epoch: 270 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + # epoch number + - 216 + - 243 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 +``` +**A few notes:** +- Optimizer builder. Optimizer specifies the type and parameters of the Optimizer. Currently support the optimizer can reference [PaddlePaddle official documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html) +- The `LearningRate.schedulers` sets the combination of different Learning Rate adjustment strategies. Paddle currently supports a variety of Learning Rate adjustment strategies. Specific also can reference [Paddle Paddle official documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html). It is important to note that you need to simply package the learning rate adjustment strategy in Paddle, which can be found in the source code `ppdet/optimizer.py`. + + +##### 2.2.3 Reader Configuration File +For Reader configuration, see [Reader configuration documentation](./READER_en.md#5.Configuration-and-Operation). + +> After reading this document, you should have some experience in model construction and configuration of Paddle Detection, and you will understand it more thoroughly with the source code. If you have other questions or suggestions about model technology, please send us an issue. We welcome your feedback. 
diff --git a/docs/advanced_tutorials/READER.md b/docs/advanced_tutorials/READER.md new file mode 100644 index 0000000000000000000000000000000000000000..60c9fee67f2718a3de088eb52d899828924f9e34 --- /dev/null +++ b/docs/advanced_tutorials/READER.md @@ -0,0 +1,337 @@ +# 数据处理模块 + +## 目录 +- [1.简介](#1.简介) +- [2.数据集](#2.数据集) + - [2.1COCO数据集](#2.1COCO数据集) + - [2.2Pascal VOC数据集](#2.2Pascal-VOC数据集) + - [2.3自定义数据集](#2.3自定义数据集) +- [3.数据预处理](#3.数据预处理) + - [3.1数据增强算子](#3.1数据增强算子) + - [3.2自定义数据增强算子](#3.2自定义数据增强算子) +- [4.Raeder](#4.Reader) +- [5.配置及运行](#5.配置及运行) + - [5.1配置](#5.1配置) + - [5.2运行](#5.2运行) + +### 1.简介 +PaddleDetection的数据处理模块的所有代码逻辑在`ppdet/data/`中,数据处理模块用于加载数据并将其转换成适用于物体检测模型的训练、评估、推理所需要的格式。 +数据处理模块的主要构成如下架构所示: +```bash + ppdet/data/ + ├── reader.py # 基于Dataloader封装的Reader模块 + ├── source # 数据源管理模块 + │ ├── dataset.py # 定义数据源基类,各类数据集继承于此 + │ ├── coco.py # COCO数据集解析与格式化数据 + │ ├── voc.py # Pascal VOC数据集解析与格式化数据 + │ ├── widerface.py # WIDER-FACE数据集解析与格式化数据 + │ ├── category.py # 相关数据集的类别信息 + ├── transform # 数据预处理模块 + │ ├── batch_operators.py # 定义各类基于批量数据的预处理算子 + │ ├── op_helper.py # 预处理算子的辅助函数 + │ ├── operators.py # 定义各类基于单张图片的预处理算子 + │ ├── gridmask_utils.py # GridMask数据增强函数 + │ ├── autoaugment_utils.py # AutoAugment辅助函数 + ├── shm_utils.py # 用于使用共享内存的辅助函数 + ``` + + +### 2.数据集 +数据集定义在`source`目录下,其中`dataset.py`中定义了数据集的基类`DetDataSet`, 所有的数据集均继承于基类,`DetDataset`基类里定义了如下等方法: + +| 方法 | 输入 | 输出 | 备注 | +| :------------------------: | :----: | :------------: | :--------------: | +| \_\_len\_\_ | 无 | int, 数据集中样本的数量 | 过滤掉了无标注的样本 | +| \_\_getitem\_\_ | int, 样本的索引idx | dict, 索引idx对应的样本roidb | 得到transform之后的样本roidb | +| check_or_download_dataset | 无 | 无 | 检查数据集是否存在,如果不存在则下载,目前支持COCO, VOC,widerface等数据集 | +| set_kwargs | 可选参数,以键值对的形式给出 | 无 | 目前用于支持接收mixup, cutmix等参数的设置 | +| set_transform | 一系列的transform函数 | 无 | 设置数据集的transform函数 | +| set_epoch | int, 当前的epoch | 无 | 用于dataset与训练过程的交互 | +| parse_dataset | 无 | 无 | 用于从数据中读取所有的样本 | +| get_anno | 无 | 无 | 用于获取标注文件的路径 | + +当一个数据集类继承自`DetDataSet`,那么它只需要实现parse_dataset函数即可。parse_dataset根据数据集设置的数据集根路径dataset_dir,图片文件夹image_dir, 标注文件路径anno_path取出所有的样本,并将其保存在一个列表roidbs中,每一个列表中的元素为一个样本xxx_rec(比如coco_rec或者voc_rec),用dict表示,dict中包含样本的image, gt_bbox, gt_class等字段。COCO和Pascal-VOC数据集中的xxx_rec的数据结构定义如下: + ```python + xxx_rec = { + 'im_file': im_fname, # 一张图像的完整路径 + 'im_id': np.array([img_id]), # 一张图像的ID序号 + 'h': im_h, # 图像高度 + 'w': im_w, # 图像宽度 + 'is_crowd': is_crowd, # 是否是群落对象, 默认为0 (VOC中无此字段) + 'gt_class': gt_class, # 标注框标签名称的ID序号 + 'gt_bbox': gt_bbox, # 标注框坐标(xmin, ymin, xmax, ymax) + 'gt_poly': gt_poly, # 分割掩码,此字段只在coco_rec中出现,默认为None + 'difficult': difficult # 是否是困难样本,此字段只在voc_rec中出现,默认为0 + } + ``` + +xxx_rec中的内容也可以通过`DetDataSet`的data_fields参数来控制,即可以过滤掉一些不需要的字段,但大多数情况下不需要修改,按照`configs/datasets`中的默认配置即可。 + +此外,在parse_dataset函数中,保存了类别名到id的映射的一个字典`cname2cid`。在coco数据集中,会利用[COCO API](https://github.com/cocodataset/cocoapi)从标注文件中加载数据集的类别名,并设置此字典。在voc数据集中,如果设置`use_default_label=False`,将从`label_list.txt`中读取类别列表,反之将使用voc默认的类别列表。 + +#### 2.1COCO数据集 +COCO数据集目前分为COCO2014和COCO2017,主要由json文件和image文件组成,其组织结构如下所示: + + ``` + dataset/coco/ + ├── annotations + │ ├── instances_train2014.json + │ ├── instances_train2017.json + │ ├── instances_val2014.json + │ ├── instances_val2017.json + │ │ ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + │ │ ... + ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + │ │ ... 
+ ``` + +在`source/coco.py`中定义并注册了`COCODataSet`数据集类,其继承自`DetDataSet`,并实现了parse_dataset方法,调用[COCO API](https://github.com/cocodataset/cocoapi)加载并解析COCO格式数据源`roidbs`和`cname2cid`,具体可参见`source/coco.py`源码。将其他数据集转换成COCO格式可以参考[用户数据转成COCO数据](../tutorials/data/PrepareDetDataSet.md#用户数据转成COCO数据) + +#### 2.2Pascal VOC数据集 +该数据集目前分为VOC2007和VOC2012,主要由xml文件和image文件组成,其组织结构如下所示: +``` + dataset/voc/ + ├── trainval.txt + ├── test.txt + ├── label_list.txt (optional) + ├── VOCdevkit/VOC2007 + │ ├── Annotations + │ ├── 001789.xml + │ │ ... + │ ├── JPEGImages + │ ├── 001789.jpg + │ │ ... + │ ├── ImageSets + │ | ... + ├── VOCdevkit/VOC2012 + │ ├── Annotations + │ ├── 2011_003876.xml + │ │ ... + │ ├── JPEGImages + │ ├── 2011_003876.jpg + │ │ ... + │ ├── ImageSets + │ │ ... + ``` +在`source/voc.py`中定义并注册了`VOCDataSet`数据集,它继承自`DetDataSet`基类,并重写了`parse_dataset`方法,解析VOC数据集中xml格式标注文件,更新`roidbs`和`cname2cid`。将其他数据集转换成VOC格式可以参考[用户数据转成VOC数据](../tutorials/data/PrepareDetDataSet.md#用户数据转成VOC数据) + +#### 2.3自定义数据集 +如果COCODataSet和VOCDataSet不能满足你的需求,可以通过自定义数据集的方式来加载你的数据集。只需要以下两步即可实现自定义数据集 + +1. 新建`source/xxx.py`,定义类`XXXDataSet`继承自`DetDataSet`基类,完成注册与序列化,并重写`parse_dataset`方法对`roidbs`与`cname2cid`更新: + ```python + from ppdet.core.workspace import register, serializable + + #注册并序列化 + @register + @serializable + class XXXDataSet(DetDataSet): + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + ... + ): + self.roidbs = None + self.cname2cid = None + ... + + def parse_dataset(self): + ... + 省略具体解析数据逻辑 + ... + self.roidbs, self.cname2cid = records, cname2cid + ``` + +2. 在`source/__init__.py`中添加引用: + ```python + from . import xxx + from .xxx import * + ``` +完成以上两步就将新的数据源`XXXDataSet`添加好了,你可以参考[配置及运行](#5.配置及运行)实现自定义数据集的使用。 + +### 3.数据预处理 + +#### 3.1数据增强算子 +PaddleDetection中支持了种类丰富的数据增强算子,有单图像数据增强算子与批数据增强算子两种方式,您可选取合适的算子组合使用。单图像数据增强算子定义在`transform/operators.py`中,已支持的单图像数据增强算子详见下表: + +| 名称 | 作用 | +| :---------------------: | :--------------: | +| Decode | 从图像文件或内存buffer中加载图像,格式为RGB格式 | +| Permute | 假如输入是HWC顺序变成CHW | +| RandomErasingImage | 对图像进行随机擦除 | +| NormalizeImage | 对图像像素值进行归一化,如果设置is_scale=True,则先将像素值除以255.0, 再进行归一化。 | +| GridMask | GridMask数据增广 | +| RandomDistort | 随机扰动图片亮度、对比度、饱和度和色相 | +| AutoAugment | AutoAugment数据增广,包含一系列数据增强方法 | +| RandomFlip | 随机水平翻转图像 | +| Resize | 对于图像进行resize,并对标注进行相应的变换 | +| MultiscaleTestResize | 将图像重新缩放为多尺度list的每个尺寸 | +| RandomResize | 对于图像进行随机Resize,可以Resize到不同的尺寸以及使用不同的插值策略 | +| RandomExpand | 将原始图片放入用像素均值填充的扩张图中,对此图进行裁剪、缩放和翻转 | +| CropWithSampling | 根据缩放比例、长宽比例生成若干候选框,再依据这些候选框和标注框的面积交并比(IoU)挑选出符合要求的裁剪结果 | +| CropImageWithDataAchorSampling | 基于CropImage,在人脸检测中,随机将图片尺度变换到一定范围的尺度,大大增强人脸的尺度变化 | +| RandomCrop | 原理同CropImage,以随机比例与IoU阈值进行处理 | +| RandomScaledCrop | 根据长边对图像进行随机裁剪,并对标注做相应的变换 | +| Cutmix | Cutmix数据增强,对两张图片做拼接 | +| Mixup | Mixup数据增强,按比例叠加两张图像 | +| NormalizeBox | 对bounding box进行归一化 | +| PadBox | 如果bounding box的数量少于num_max_boxes,则将零填充到bbox | +| BboxXYXY2XYWH | 将bounding box从(xmin,ymin,xmax,ymin)形式转换为(xmin,ymin,width,height)格式 | +| Pad | 将图片Pad某一个数的整数倍或者指定的size,并支持指定Pad的方式 | +| Poly2Mask | Poly2Mask数据增强 | + +批数据增强算子定义在`transform/batch_operators.py`中, 目前支持的算子列表如下: +| 名称 | 作用 | +| :---------------------: | :--------------: | +| PadBatch | 随机对每个batch的数据图片进行Pad操作,使得batch中的图片具有相同的shape | +| BatchRandomResize | 对一个batch的图片进行resize,使得batch中的图片随机缩放到相同的尺寸 | +| Gt2YoloTarget | 通过gt数据生成YOLO系列模型的目标 | +| Gt2FCOSTarget | 通过gt数据生成FCOS模型的目标 | +| Gt2TTFTarget | 通过gt数据生成TTFNet模型的目标 | +| Gt2Solov2Target | 通过gt数据生成SOLOv2模型的目标 | + +**几点说明:** +- 
数据增强算子的输入为sample或者samples,每一个sample对应上文所说的`DetDataSet`输出的roidbs中的一个样本,如coco_rec或者voc_rec +- 单图像数据增强算子(Mixup, Cutmix等除外)也可用于批数据处理中。但是,单图像处理算子和批图像处理算子仍有一些差异,以RandomResize和BatchRandomResize为例,RandomResize会将一个Batch中的每张图片进行随机缩放,但是每一张图像Resize之后的形状不尽相同,BatchRandomResize则会将一个Batch中的所有图片随机缩放到相同的形状。 +- 除BatchRandomResize外,定义在`transform/batch_operators.py`的批数据增强算子接收的输入图像均为CHW形式,所以使用这些批数据增强算子前请先使用Permute进行处理。如果用到Gt2xxxTarget算子,需要将其放置在靠后的位置。NormalizeBox算子建议放置在Gt2xxxTarget之前。将这些限制条件总结下来,推荐的预处理算子的顺序为 + ``` + - XXX: {} + - ... + - BatchRandomResize: {...} # 如果不需要,可以移除,如果需要,放置在Permute之前 + - Permute: {} # 必须项 + - NormalizeBox: {} # 如果需要,建议放在Gt2XXXTarget之前 + - PadBatch: {...} # 如果不需要可移除,如果需要,建议放置在Permute之后 + - Gt2XXXTarget: {...} # 建议与PadBatch放置在最后的位置 + ``` + +#### 3.2自定义数据增强算子 +如果需要自定义数据增强算子,那么您需要了解下数据增强算子的相关逻辑。数据增强算子基类为定义在`transform/operators.py`中的`BaseOperator`类,单图像数据增强算子与批数据增强算子均继承自这个基类。完整定义参考源码,以下代码显示了`BaseOperator`类的关键函数: apply和__call__方法 + ``` python + class BaseOperator(object): + + ... + + def apply(self, sample, context=None): + return sample + + def __call__(self, sample, context=None): + if isinstance(sample, Sequence): + for i in range(len(sample)): + sample[i] = self.apply(sample[i], context) + else: + sample = self.apply(sample, context) + return sample + ``` +__call__方法为`BaseOperator`的调用入口,接收一个sample(单图像)或者多个sample(多图像)作为输入,并调用apply函数对一个或者多个sample进行处理。大多数情况下,你只需要继承`BaseOperator`重写apply方法或者重写__call__方法即可,如下所示,定义了一个XXXOp继承自BaseOperator,并注册: + ```python + @register_op + class XXXOp(BaseOperator): + def __init__(self,...): + + super(XXXImage, self).__init__() + ... + + # 大多数情况下只需要重写apply方法 + def apply(self, sample, context=None): + ... + 省略对输入的sample具体操作 + ... + return sample + + # 如果有需要,可以重写__call__方法,如Mixup, Gt2XXXTarget等 + # def __call__(self, sample, context=None): + # ... + # 省略对输入的sample具体操作 + # ... 
+ # return sample + ``` +大多数情况下,只需要重写apply方法即可,如`transform/operators.py`中除Mixup和Cutmix外的预处理算子。对于批处理的情况一般需要重写__call__方法,如`transform/batch_operators.py`的预处理算子。 + +### 4.Reader +Reader相关的类定义在`reader.py`, 其中定义了`BaseDataLoader`类。`BaseDataLoader`在`paddle.io.DataLoader`的基础上封装了一层,其具备`paddle.io.DataLoader`的所有功能,并能够实现不同模型对于`DetDataset`的不同需求,如可以通过对Reader进行设置,以控制`DetDataset`支持Mixup, Cutmix等操作。除此之外,数据预处理算子通过`Compose`类和`BatchCompose`类组合起来分别传入`DetDataset`和`paddle.io.DataLoader`中。 +所有的Reader类都继承自`BaseDataLoader`类,具体可参见源码。 + +### 5.配置及运行 + +#### 5.1 配置 +与数据预处理相关的模块的配置文件包含所有模型公用的Dataset的配置文件,以及不同模型专用的Reader的配置文件。 + +##### 5.1.1 Dataset配置 +关于Dataset的配置文件存在于`configs/datasets`文件夹。比如COCO数据集的配置文件如下: +``` +metric: COCO # 目前支持COCO, VOC, OID, WiderFace等评估标准 +num_classes: 80 # num_classes数据集的类别数,不包含背景类 + +TrainDataset: + !COCODataSet + image_dir: train2017 # 训练集的图片所在文件相对于dataset_dir的路径 + anno_path: annotations/instances_train2017.json # 训练集的标注文件相对于dataset_dir的路径 + dataset_dir: dataset/coco #数据集所在路径,相对于PaddleDetection路径 + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] # 控制dataset输出的sample所包含的字段,注意此为TrainDataset独有的且必须配置的字段 + +EvalDataset: + !COCODataSet + image_dir: val2017 # 验证集的图片所在文件夹相对于dataset_dir的路径 + anno_path: annotations/instances_val2017.json # 验证集的标注文件相对于dataset_dir的路径 + dataset_dir: dataset/coco # 数据集所在路径,相对于PaddleDetection路径 + +TestDataset: + !ImageFolder + anno_path: annotations/instances_val2017.json # 标注文件所在路径,仅用于读取数据集的类别信息,支持json和txt格式 + dataset_dir: dataset/coco # 数据集所在路径,若添加了此行,则`anno_path`路径为`dataset_dir/anno_path`,若此行不设置或去掉此行,则`anno_path`路径即为`anno_path` +``` +在PaddleDetection的yml配置文件中,使用`!`直接序列化模块实例(可以是函数,实例等),上述的配置文件均使用Dataset进行了序列化。 + +**注意:** +请运行前自行仔细检查数据集的配置路径,在训练或验证时如果TrainDataset和EvalDataset的路径配置有误,会提示自动下载数据集。若使用自定义数据集,在推理时如果TestDataset路径配置有误,会提示使用默认COCO数据集的类别信息。 + + +##### 5.1.2 Reader配置 +不同模型专用的Reader定义在每一个模型的文件夹下,如yolov3的Reader配置文件定义在`configs/yolov3/_base_/yolov3_reader.yml`。一个Reader的示例配置如下: +``` +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + ... + batch_transforms: + ... + batch_size: 8 + shuffle: true + drop_last: true + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + ... + batch_size: 1 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 608, 608] + sample_transforms: + - Decode: {} + ... 
+  batch_size: 1
+```
+你可以在Reader中定义不同的预处理算子,每张卡的batch_size以及DataLoader的worker_num等。
+
+#### 5.2运行
+在PaddleDetection的训练、评估和测试运行程序中,都需要创建Reader迭代器。Reader在`ppdet/engine/trainer.py`中创建。下面的代码展示了如何创建训练时的Reader
+``` python
+from ppdet.core.workspace import create
+# build data loader
+self.dataset = cfg['TrainDataset']
+self.loader = create('TrainReader')(self.dataset, cfg.worker_num)
+```
+相应的预测以及评估时的Reader与之类似,具体可参考`ppdet/engine/trainer.py`源码。
+
+> 关于数据处理模块,如您有其他问题或建议,请给我们提issue,我们非常欢迎您的反馈。
diff --git a/docs/advanced_tutorials/READER_en.md b/docs/advanced_tutorials/READER_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..07940a965dd4e48499a96def925679f9ff269ad8
--- /dev/null
+++ b/docs/advanced_tutorials/READER_en.md
@@ -0,0 +1,337 @@
+# Data Processing Module
+
+## Directory
+- [Data Processing Module](#data-processing-module)
+  - [Directory](#directory)
+  - [1.Introduction](#1introduction)
+  - [2.Dataset](#2dataset)
+    - [2.1COCO Dataset](#21coco-dataset)
+    - [2.2Pascal VOC dataset](#22pascal-voc-dataset)
+    - [2.3Customize Dataset](#23customize-dataset)
+  - [3.Data preprocessing](#3data-preprocessing)
+    - [3.1Data Enhancement Operator](#31data-enhancement-operator)
+    - [3.2Custom data enhancement operator](#32custom-data-enhancement-operator)
+  - [4.Reader](#4reader)
+  - [5.Configuration and Operation](#5configuration-and-operation)
+    - [5.1 Configuration](#51-configuration)
+    - [5.2 Run](#52-run)
+
+### 1.Introduction
+All the code for PaddleDetection's data processing module lives in `ppdet/data/`. The module loads data and converts it into the format required for training, evaluating, and running inference with object detection models.
+The main components of the data processing module are as follows:
+```bash
+  ppdet/data/
+  ├── reader.py     # Reader module based on Dataloader encapsulation
+  ├── source  # Data source management module
+  │   ├── dataset.py      # Defines the data source base class from which various datasets are inherited
+  │   ├── coco.py         # The COCO dataset parses and formats the data
+  │   ├── voc.py          # Pascal VOC datasets parse and format data
+  │   ├── widerface.py    # The WIDER-FACE dataset parses and formats data
+  │   ├── category.py     # Category information for the relevant dataset
+  ├── transform  # Data preprocessing module
+  │   ├── batch_operators.py  # Define all kinds of preprocessing operators based on batch data
+  │   ├── op_helper.py    # The auxiliary function of the preprocessing operator
+  │   ├── operators.py    # Define all kinds of preprocessing operators based on single image
+  │   ├── gridmask_utils.py    # GridMask data enhancement function
+  │   ├── autoaugment_utils.py  # AutoAugment auxiliary function
+  ├── shm_utils.py     # Auxiliary functions for using shared memory
+  ```
+
+
+### 2.Dataset
+The dataset is defined in the `source` directory, where `dataset.py` defines the base class `DetDataSet` of the dataset.
All datasets inherit from the base class, and the `DetDataset` base class defines the following methods: + +| Method | Input | Output | Note | +| :-----------------------: | :------------------------------------------: | :---------------------------------------: | :-------------------------------------------------------------------------------------------------------------: | +| \_\_len\_\_ | no | int, the number of samples in the dataset | Filter out the unlabeled samples | +| \_\_getitem\_\_ | int, The index of the sample | dict, Index idx to sample ROIDB | Get the sample roidb after transform | +| check_or_download_dataset | no | no | Check whether the dataset exists, if not, download, currently support COCO, VOC, Widerface and other datasets | +| set_kwargs | Optional arguments, given as key-value pairs | no | Currently used to support receiving mixup, cutMix and other parameters | +| set_transform | A series of transform functions | no | Set the transform function of the dataset | +| set_epoch | int, current epoch | no | Interaction between dataset and training process | +| parse_dataset | no | no | Used to read all samples from the data | +| get_anno | no | no | Used to get the path to the annotation file | + +When a dataset class inherits from `DetDataSet`, it simply implements the Parse dataset function. parse_dataset set dataset root path dataset_dir, image folder image dir, annotated file path anno_path retrieve all samples and save them in a list roidbs Each element in the list is a sample XXX rec(such as coco_rec or voc_rec), represented by dict, which contains the sample image, gt_bbox, gt_class and other fields. The data structure of xxx_rec in COCO and Pascal-VOC datasets is defined as follows: + ```python + xxx_rec = { + 'im_file': im_fname, # The full path to an image + 'im_id': np.array([img_id]), # The ID number of an image + 'h': im_h, # Height of the image + 'w': im_w, # The width of the image + 'is_crowd': is_crowd, # Community object, default is 0 (VOC does not have this field) + 'gt_class': gt_class, # ID number of an enclosure label name + 'gt_bbox': gt_bbox, # label box coordinates(xmin, ymin, xmax, ymax) + 'gt_poly': gt_poly, # Segmentation mask. This field only appears in coco_rec and defaults to None + 'difficult': difficult # Is it a difficult sample? This field only appears in voc_rec and defaults to 0 + } + ``` + +The contents of the xxx_rec can also be controlled by the Data fields parameter of `DetDataSet`, that is, some unwanted fields can be filtered out, but in most cases you do not need to change them. The default configuration in `configs/datasets` will do. + +In addition, a dictionary `cname2cid` holds the mapping of category names to IDS in the Parse dataset function. In coco dataset, can use [coco API](https://github.com/cocodataset/cocoapi) from the label category name of the file to load dataset, and set up the dictionary. In the VOC dataset, if `use_default_label=False` is set, the category list will be read from `label_list.txt`, otherwise the VOC default category list will be used. + +#### 2.1COCO Dataset +COCO datasets are currently divided into COCO2014 and COCO2017, which are mainly composed of JSON files and image files, and their organizational structure is shown as follows: + ``` + dataset/coco/ + ├── annotations + │ ├── instances_train2014.json + │ ├── instances_train2017.json + │ ├── instances_val2014.json + │ ├── instances_val2017.json + │ │ ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + │ │ ... 
+ ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + │ │ ... + ``` +class `COCODataSet` is defined and registered on `source/coco.py`. And implements the parse the dataset method, called [COCO API](https://github.com/cocodataset/cocoapi) to load and parse COCO format data source ` roidbs ` and ` cname2cid `, See `source/coco.py` source code for details. Converting other datasets to COCO format can be done by referring to [converting User Data to COCO Data](../tutorials/PrepareDataSet_en.md#convert-user-data-to-coco-data) +And implements the parse the dataset method, called [COCO API](https://github.com/cocodataset/cocoapi) to load and parse COCO format data source `roidbs` and `cname2cid`, See `source/coco.py` source code for details. Converting other datasets to COCO format can be done by referring to [converting User Data to COCO Data](../tutorials/data/PrepareDetDataSet_en.md#convert-user-data-to-coco-data) + + +#### 2.2Pascal VOC dataset +The dataset is currently divided into VOC2007 and VOC2012, mainly composed of XML files and image files, and its organizational structure is shown as follows: +``` + dataset/voc/ + ├── trainval.txt + ├── test.txt + ├── label_list.txt (optional) + ├── VOCdevkit/VOC2007 + │ ├── Annotations + │ ├── 001789.xml + │ │ ... + │ ├── JPEGImages + │ ├── 001789.jpg + │ │ ... + │ ├── ImageSets + │ | ... + ├── VOCdevkit/VOC2012 + │ ├── Annotations + │ ├── 2011_003876.xml + │ │ ... + │ ├── JPEGImages + │ ├── 2011_003876.jpg + │ │ ... + │ ├── ImageSets + │ │ ... + ``` +The `VOCDataSet` dataset is defined and registered in `source/voc.py` . It inherits the `DetDataSet` base class and rewrites the `parse_dataset` method to parse XML annotations in the VOC dataset. Update `roidbs` and `cname2cid`. To convert other datasets to VOC format, refer to [User Data to VOC Data](../tutorials/data/PrepareDetDataSet_en.md#convert-user-data-to-voc-data) + + +#### 2.3Customize Dataset +If the COCO dataset and VOC dataset do not meet your requirements, you can load your dataset by customizing it. There are only two steps to implement a custom dataset + +1. create`source/xxx.py`, define class `XXXDataSet` extends from `DetDataSet` base class, complete registration and serialization, and rewrite `parse_dataset`methods to update `roidbs` and `cname2cid`: + ```python + from ppdet.core.workspace import register, serializable + + #Register and serialize + @register + @serializable + class XXXDataSet(DetDataSet): + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + ... + ): + self.roidbs = None + self.cname2cid = None + ... + + def parse_dataset(self): + ... + Omit concrete parse data logic + ... + self.roidbs, self.cname2cid = records, cname2cid + ``` + +2. Add a reference to `source/__init__.py`: + ```python + from . import xxx + from .xxx import * + ``` +Complete the above two steps to add the new Data source `XXXDataSet`, you can refer to [Configure and Run](#5.Configuration-and-Operation) to implement the use of custom datasets. + +### 3.Data preprocessing + +#### 3.1Data Enhancement Operator +A variety of data enhancement operators are supported in PaddleDetection, including single image data enhancement operator and batch data enhancement operator. You can choose suitable operators to use in combination. Single image data enhancement operators are defined in `transform/operators.py`. 
The supported single image data enhancement operators are shown in the following table: +| Name | Function | +| :----------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Decode | Loads an image from an image file or memory buffer in RGB format | +| Permute | If the input is HWC, the sequence changes to CHW | +| RandomErasingImage | Random erasure of the image | +| NormalizeImage | The pixel value of the image is normalized. If is scale= True is set, the pixel value is divided by 255.0 before normalization. | +| GridMask | GridMask data is augmented | +| RandomDistort | Random disturbance of image brightness, contrast, saturation and hue | +| AutoAugment | Auto Augment data, which contains a series of data augmentation methods | +| RandomFlip | Randomly flip the image horizontally | +| Resize | Resize the image and transform the annotation accordingly | +| MultiscaleTestResize | Rescale the image to each size of the multi-scale list | +| RandomResize | Random Resize of images can be resized to different sizes and different interpolation strategies can be used | +| RandomExpand | Place the original image into an expanded image filled with pixel mean, crop, scale, and flip the image | +| CropWithSampling | Several candidate frames are generated according to the scaling ratio and length-width ratio, and then the prunning results that meet the requirements are selected according to the area intersection ratio (IoU) between these candidate frames and the marking frames | +| CropImageWithDataAchorSampling | Based on Crop Image, in face detection, the Image scale is randomly transformed to a certain range of scale, which greatly enhances the scale change of face | +| RandomCrop | The principle is the same as CropImage, which is processed with random proportion and IoU threshold | +| RandomScaledCrop | According to the long edge, the image is randomly clipped and the corresponding transformation is made to the annotations | +| Cutmix | Cutmix data enhancement, Mosaic of two images | +| Mixup | Mixup data enhancement to scale up two images | +| NormalizeBox | Bounding box is normalized | +| PadBox | If the number of bounding boxes is less than num Max boxes, zero is populated into bboxes | +| BboxXYXY2XYWH | Bounding Box is converted from (xmin,ymin,xmax,ymin) form to (xmin,ymin, Width,height) form | +| Pad | The image Pad is an integer multiple of a certain number or the specified size, and supports the way of specifying Pad | +| Poly2Mask | Poly2Mask data enhancement | | + +Batch data enhancement operators are defined in `transform/batch_operators.py`. 
The list of operators currently supported is as follows: +| Name | Function | +| :---------------: | :------------------------------------------------------------------------------------------------------------------: | +| PadBatch | Pad operation is performed on each batch of data images randomly to make the images in the batch have the same shape | +| BatchRandomResize | Resize a batch of images so that the images in the batch are randomly scaled to the same size | +| Gt2YoloTarget | Generate the objectives of YOLO series models from GT data | +| Gt2FCOSTarget | Generate the target of the FCOS model from GT data | +| Gt2TTFTarget | Generate TTF Net model targets from GT data | +| Gt2Solov2Target | Generate targets for SOL Ov2 models from GT data | + +**A few notes:** +- The input of Data enhancement operator is sample or samples, and each sample corresponds to a sample of RoIDBS output by `DetDataSet` mentioned above, such as coco_rec or voc_rec +- Single image data enhancement operators (except Mixup, Cutmix, etc.) can also be used in batch data processing. However, there are still some differences between single image processing operators and Batch image processing operators. Taking Random Resize and Batch Random Resize as an example, Random Resize will randomly scale each picture in a Batch. However, the shapes of each image after Resize are different. Batch Random Resize means that all images in a Batch will be randomly scaled to the same shape. +- In addition to Batch Random Resize, the Batch data enhancement operators defined in `transform/batch_operators.py` receive input images in the form of CHW, so please use Permute before using these Batch data enhancement operators . If the Gt2xxx Target operator is used, it needs to be placed further back. The Normalize Box operator is recommended to be placed before Gt2xxx Target. After summarizing these constraints, the order of the recommended preprocessing operator is: + ``` + - XXX: {} + - ... + - BatchRandomResize: {...} # Remove it if not needed, and place it in front of Permute if necessary + - Permute: {} # flush privileges + - NormalizeBox: {} # If necessary, it is recommended to precede Gt2XXXTarget + - PadBatch: {...} # If not, you can remove it. If necessary, it is recommended to place it behind Permute + - Gt2XXXTarget: {...} # It is recommended to place with Pad Batch in the last position + ``` + +#### 3.2Custom data enhancement operator +If you need to customize data enhancement operators, you need to understand the logic of data enhancement operators. The Base class of the data enhancement Operator is the `transform/operators.py`class defined in `BaseOperator`, from which both the single image data enhancement Operator and the batch data enhancement Operator inherit. Refer to the source code for the complete definition. The following code shows the key functions of the `BaseOperator` class: the apply and __call__ methods + ``` python + class BaseOperator(object): + + ... + + def apply(self, sample, context=None): + return sample + + def __call__(self, sample, context=None): + if isinstance(sample, Sequence): + for i in range(len(sample)): + sample[i] = self.apply(sample[i], context) + else: + sample = self.apply(sample, context) + return sample + ``` +__call__ method is call entry of `BaseOperator`, Receive one sample(single image) or multiple samples (multiple images) as input, and call the Apply function to process one or more samples. 
In most cases, you simply inherit from `BaseOperator` and override the apply method or override the __call__ method, as shown below. Define a XXXOp that inherits from Base Operator and register it: + ```python + @register_op + class XXXOp(BaseOperator): + def __init__(self,...): + + super(XXXImage, self).__init__() + ... + + # In most cases, you just need to override the Apply method + def apply(self, sample, context=None): + ... + 省略对输入的sample具体操作 + ... + return sample + + # If necessary, override call methods such as Mixup, Gt2XXXTarget, etc + # def __call__(self, sample, context=None): + # ... + # The specific operation on the input sample is omitted + # ... + # return sample + ``` +In most cases, you simply override the Apply method, such as the preprocessor in `transform/operators.py` in addition to Mixup and Cutmix. In the case of batch processing, it is generally necessary to override the call method, such as the preprocessing operator of `transform/batch_operators.py`. + +### 4.Reader +The Reader class is defined in `reader.py`, where the `BaseDataLoader` class is defined. `BaseDataLoader` encapsulates a layer on the basis of `paddle.io.DataLoader`, which has all the functions of `paddle.io.DataLoader` and can realize the different needs of `DetDataset` for different models. For example, you can set Reader to control `DetDataset` to support Mixup, Cutmix and other operations. In addition, the Data preprocessing operators are combined into the `DetDataset` and `paddle.io.DataLoader` by the `Compose` and 'Batch Compose' classes, respectively. All Reader classes inherit from the `BaseDataLoader` class. See source code for details. + +### 5.Configuration and Operation + +#### 5.1 Configuration +The configuration files for modules related to data preprocessing contain the configuration files for Datasets common to all models and the configuration files for readers specific to different models. + +##### 5.1.1 Dataset Configuration +The configuration file for the Dataset exists in the `configs/datasets` folder. For example, the COCO dataset configuration file is as follows: +``` +metric: COCO # Currently supports COCO, VOC, OID, Wider Face and other evaluation standards +num_classes: 80 # num_classes: The number of classes in the dataset, excluding background classes + +TrainDataset: + !COCODataSet + image_dir: train2017 # The path where the training set image resides relative to the dataset_dir + anno_path: annotations/instances_train2017.json # Path to the annotation file of the training set relative to the dataset_dir + dataset_dir: dataset/coco #The path where the dataset is located relative to the PaddleDetection path + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] # Controls the fields contained in the sample output of the dataset, note data_fields are unique to the TrainDataset and must be configured + +EvalDataset: + !COCODataSet + image_dir: val2017 # The path where the images of the validation set reside relative to the dataset_dir + anno_path: annotations/instances_val2017.json # The path to the annotation file of the validation set relative to the dataset_dir + dataset_dir: dataset/coco # The path where the dataset is located relative to the PaddleDetection path +TestDataset: + !ImageFolder + anno_path: dataset/coco/annotations/instances_val2017.json # The path of the annotation file, it is only used to read the category information of the dataset. 
JSON and TXT formats are supported
+    dataset_dir: dataset/coco # The path of the dataset; if this row is set, `anno_path` becomes `dataset_dir/anno_path`, otherwise `anno_path` is used as-is
+```
+In PaddleDetection's YAML configuration files, `!` directly serializes a module instance (a function, an instance, etc.). The dataset configurations above are serialized in this way.
+
+**Note:**
+Please carefully check the configured dataset paths before running. During training or evaluation, if the TrainDataset or EvalDataset path is wrong, a prompt to download the dataset automatically will appear. When using a user-defined dataset, if the TestDataset path is configured incorrectly during inference, the category information of the default COCO dataset will be used.
+
+
+##### 5.1.2 Reader configuration
+The Reader configuration files for yolov3 are defined in `configs/yolov3/_base_/yolov3_reader.yml`. An example Reader configuration is as follows:
+```
+worker_num: 2
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    ...
+  batch_transforms:
+    ...
+  batch_size: 8
+  shuffle: true
+  drop_last: true
+  use_shared_memory: true
+
+EvalReader:
+  sample_transforms:
+    - Decode: {}
+    ...
+  batch_size: 1
+  drop_empty: false
+
+TestReader:
+  inputs_def:
+    image_shape: [3, 608, 608]
+  sample_transforms:
+    - Decode: {}
+    ...
+  batch_size: 1
+```
+In the Reader you can define different preprocessing operators, the per-GPU batch_size, the worker_num of the DataLoader, and so on.
+
+#### 5.2 Run
+PaddleDetection's training, evaluation, and test programs all create Reader iterators. The Reader is created in `ppdet/engine/trainer.py`. The following code shows how to create the Reader used for training:
+``` python
+from ppdet.core.workspace import create
+# build data loader
+self.dataset = cfg['TrainDataset']
+self.loader = create('TrainReader')(self.dataset, cfg.worker_num)
+```
+The Readers used for prediction and evaluation are created in a similar way; see `ppdet/engine/trainer.py` for details.
+
+> About the data processing module, if you have other questions or suggestions, please send us an issue. We welcome your feedback.
diff --git a/docs/advanced_tutorials/customization/detection.md b/docs/advanced_tutorials/customization/detection.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f20cf3c58e8908136bd336abc413536a06a3467
--- /dev/null
+++ b/docs/advanced_tutorials/customization/detection.md
@@ -0,0 +1,84 @@
+简体中文 | [English](./detection_en.md)
+
+# 目标检测任务二次开发
+
+在目标检测算法产业落地过程中,常常会出现需要额外训练以满足实际使用的要求,项目迭代过程中也会出现需要修改类别的情况。本文档详细介绍如何使用PaddleDetection进行目标检测算法二次开发,流程包括:数据准备、模型优化思路和修改类别开发流程。
+
+## 数据准备
+
+二次开发首先需要进行数据集的准备,针对场景特点采集合适的数据从而提升模型效果和泛化性能。然后使用Labelme,LabelImg等标注工具标注目标检测框,并将标注结果转化为COCO或VOC数据格式。详细文档可以参考[数据准备文档](../../tutorials/data/README.md)
+
+## 模型优化
+
+### 1. 
使用自定义数据集训练 + +基于准备的数据在数据配置文件中修改对应路径,例如`configs/dataset/coco_detection.yml`: + +``` +metric: COCO +num_classes: 80 + +TrainDataset: + !COCODataSet + image_dir: train2017 # 训练集的图片所在文件相对于dataset_dir的路径 + anno_path: annotations/instances_train2017.json # 训练集的标注文件相对于dataset_dir的路径 + dataset_dir: dataset/coco # 数据集所在路径,相对于PaddleDetection路径 + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: val2017 # 验证集的图片所在文件相对于dataset_dir的路径 + anno_path: annotations/instances_val2017.json # 验证集的标注文件相对于dataset_dir的路径 + dataset_dir: dataset/coco # 数据集所在路径,相对于PaddleDetection路径 + +TestDataset: + !ImageFolder + anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) # 标注文件所在文件 相对于dataset_dir的路径 + dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' # 数据集所在路径,相对于PaddleDetection路径 +``` + +配置修改完成后,即可以启动训练评估,命令如下 + +``` +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml --eval +``` + +更详细的命令参考[30分钟快速上手PaddleDetection](../../tutorials/GETTING_STARTED_cn.md) + + +### 2. 加载COCO模型作为预训练 + +目前PaddleDetection提供的配置文件加载的预训练模型均为ImageNet数据集的权重,加载到检测算法的骨干网络中,实际使用时,建议加载COCO数据集训练好的权重,通常能够对模型精度有较大提升,使用方法如下: + +#### 1) 设置预训练权重路径 + +COCO数据集训练好的模型权重均在各算法配置文件夹下,例如`configs/ppyoloe`下提供了PP-YOLOE-l COCO数据集权重:[链接](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams) 。配置文件中设置`pretrain_weights: https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams` + +#### 2) 修改超参数 + +加载COCO预训练权重后,需要修改学习率超参数,例如`configs/ppyoloe/_base_/optimizer_300e.yml`中: + +``` +epoch: 120 # 原始配置为300epoch,加载COCO权重后可以适当减少迭代轮数 + +LearningRate: + base_lr: 0.005 # 原始配置为0.025,加载COCO权重后需要降低学习率 + schedulers: + - !CosineDecay + max_epochs: 144 # 依据epoch数进行修改 + - !LinearWarmup + start_factor: 0. + epochs: 5 +``` + +## 修改类别 + +当实际使用场景类别发生变化时,需要修改数据配置文件,例如`configs/datasets/coco_detection.yml`中: + +``` +metric: COCO +num_classes: 10 # 原始类别80 +``` + +配置修改完成后,同样可以加载COCO预训练权重,PaddleDetection支持自动加载shape匹配的权重,对于shape不匹配的权重会自动忽略,因此无需其他修改。 diff --git a/docs/advanced_tutorials/customization/detection_en.md b/docs/advanced_tutorials/customization/detection_en.md new file mode 100644 index 0000000000000000000000000000000000000000..003ea152906b947473643b93cf1585b7f32d2155 --- /dev/null +++ b/docs/advanced_tutorials/customization/detection_en.md @@ -0,0 +1,89 @@ +[简体中文](./detection.md) | English + +# Customize Object Detection task + +In the practical application of object detection algorithms in a specific industry, additional training is often required for practical use. The project iteration will also need to modify categories. This document details how to use PaddleDetection for a customized object detection algorithm. The process includes data preparation, model optimization roadmap, and modifying the category development process. + +## Data Preparation + +Customization starts with the preparation of the dataset. We need to collect suitable data for the scenario features, so as to improve the model effect and generalization performance. Then Labeme, LabelImg and other labeling tools will be used to label the object detection bouding boxes and convert the labeling results into COCO or VOC data format. Details please refer to [Data Preparation](../../tutorials/data/PrepareDetDataSet_en.md) + +## Model Optimization + +### 1. 
Use customized dataset for training
+
+Modify the corresponding paths in the data configuration file based on the prepared data, for example in `configs/dataset/coco_detection.yml`:
+
+```
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  !COCODataSet
+    image_dir: train2017 # Path to the images of the training set relative to the dataset_dir
+    anno_path: annotations/instances_train2017.json # Path to the annotation file of the training set relative to the dataset_dir
+    dataset_dir: dataset/coco # Path to the dataset relative to the PaddleDetection path
+    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  !COCODataSet
+    image_dir: val2017 # Path to the images of the validation set relative to the dataset_dir
+    anno_path: annotations/instances_val2017.json # Path to the annotation file of the validation set relative to the dataset_dir
+    dataset_dir: dataset/coco # Path to the dataset relative to the PaddleDetection path
+
+TestDataset:
+  !ImageFolder
+    anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt); path to the annotation file relative to the dataset_dir
+    dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'; path to the dataset relative to the PaddleDetection path
+```
+
+Once the configuration changes are completed, training and evaluation can be started with the following command:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml --eval
+```
+
+For more details, please refer to [Getting Started for PaddleDetection](../../tutorials/GETTING_STARTED_cn.md)
+
+### 2. Load the COCO model as pre-training
+
+The pre-trained models currently provided in PaddleDetection's configurations are ImageNet weights, loaded into the backbone network of the detection algorithm. For practical use, it is recommended to load weights trained on the COCO dataset instead, which usually brings a large improvement in model accuracy. The method is as follows.
+
+#### 1) Set pre-training weight path
+
+The COCO-trained model weights are listed in the configuration folder of each algorithm. For example, the PP-YOLOE-l COCO weights are provided under `configs/ppyoloe`: [link](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams). In the configuration file, set `pretrain_weights: https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams`
+
+#### 2) Modify hyperparameters
+
+After loading the COCO pre-trained weights, the learning rate hyperparameters need to be modified, for example in `configs/ppyoloe/_base_/optimizer_300e.yml`:
+
+```
+epoch: 120 # The original configuration is 300 epochs; after loading COCO weights, the number of epochs can be reduced appropriately
+
+LearningRate:
+  base_lr: 0.005 # The original configuration is 0.025; after loading COCO weights, the learning rate should be reduced
+  schedulers:
+    - !CosineDecay
+      max_epochs: 144 # Modify according to the number of epochs
+    - !LinearWarmup
+      start_factor: 0.
+      epochs: 5
+```
+
+## Modify categories
+
+When the categories of the actual application scenario change, the data configuration file needs to be modified, for example in `configs/datasets/coco_detection.yml`:
+
+```
+metric: COCO
+num_classes: 10 # the original number of classes is 80
+```
+
+After the configuration changes are completed, the COCO pre-training weights can also be loaded. 
PaddleDetection supports automatic loading of shape-matching weights, and weights that do not match the shape are automatically ignored, so no other modifications are needed. diff --git a/docs/feature_models/SSLD_PRETRAINED_MODEL.md b/docs/feature_models/SSLD_PRETRAINED_MODEL.md new file mode 100644 index 0000000000000000000000000000000000000000..e27b69a664a5d19624e61caad5cc079d9de8f602 --- /dev/null +++ b/docs/feature_models/SSLD_PRETRAINED_MODEL.md @@ -0,0 +1,54 @@ +简体中文 | [English](SSLD_PRETRAINED_MODEL_en.md) + +### Simple semi-supervised label knowledge distillation solution (SSLD) + +### R-CNN on COCO + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | Mask AP | 下载 | 配置文件 | +| :------------------- | :------------| :-----: | :-----: | :------------: | :-----: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 1x | ---- | 41.4 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 2x | ---- | 42.3 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 1x | ---- | 42.0 | 38.2 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 2x | ---- | 42.7 | 38.9 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 1x | ---- | 44.4 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 2x | ---- | 45.0 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 1x | ---- | 44.9 | 39.1 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 2x | ---- | 45.7 | 39.7 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | + + +### YOLOv3 on COCO + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :----------------- | :-------- | :-----------: | :------: | :---------: | :----: | :----------------------------------------------------: | :-----: | +| MobileNet-V1-SSLD | 608 | 8 | 270e | ---- | 31.0 | 
[下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 416 | 8 | 270e | ---- | 30.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | ---- | 28.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | + +### YOLOv3 on Pasacl VOC + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :----------------- | :-------- | :-----------: | :------: | :---------: | :----: | :----------------------------------------------------: | :-----: | +| MobileNet-V1-SSLD | 608 | 8 | 270e | - | 78.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V1-SSLD | 416 | 8 | 270e | - | 79.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | - | 77.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 608 | 8 | 270e | - | 80.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 416 | 8 | 270e | - | 79.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 320 | 8 | 270e | - | 77.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | + +**注意事项:** + +- [SSLD](https://arxiv.org/abs/2103.05959)是一种知识蒸馏方法,我们使用蒸馏后性能更强的backbone预训练模型,进一步提升检测精度,详细方案请参考[知识蒸馏教程](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/en/advanced_tutorials/distillation/distillation_en.md) + +![demo image](../images/ssld_model.png) + +## Citations +``` +@misc{cui2021selfsupervision, + title={Beyond Self-Supervision: A Simple Yet Effective Network Distillation Alternative to Improve Backbones}, + author={Cheng Cui and Ruoyu Guo and Yuning Du and Dongliang He and Fu Li and Zewu Wu and Qiwen Liu and Shilei Wen and Jizhou Huang and Xiaoguang Hu and Dianhai Yu and Errui Ding and Yanjun Ma}, + year={2021}, + eprint={2103.05959}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/feature_models/SSLD_PRETRAINED_MODEL_en.md b/docs/feature_models/SSLD_PRETRAINED_MODEL_en.md new file mode 100644 index 
0000000000000000000000000000000000000000..b97c3b71ee5b8e439181740ea6d7c1a0d7a6d2ba --- /dev/null +++ b/docs/feature_models/SSLD_PRETRAINED_MODEL_en.md @@ -0,0 +1,53 @@ +English | [简体中文](SSLD_PRETRAINED_MODEL.md) + +### Simple semi-supervised label knowledge distillation solution (SSLD) + +### R-CNN on COCO + +| Backbone | Model | Images/GPU | Lr schd | FPS | Box AP | Mask AP | Download | Config | +| :------------------- | :------------| :-----: | :-----: | :------------: | :-----: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 1x | ---- | 41.4 | - | [model](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 2x | ---- | 42.3 | - | [model](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 1x | ---- | 42.0 | 38.2 | [model](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 2x | ---- | 42.7 | 38.9 | [model](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 1x | ---- | 44.4 | - | [model](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 2x | ---- | 45.0 | - | [model](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 1x | ---- | 44.9 | 39.1 | [model](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 2x | ---- | 45.7 | 39.7 | [model](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | + +### YOLOv3 on COCO + +| Backbone | Input shape | Images/GPU | Lr schd | FPS | Box AP | Download | Config | +| :----------------- | :-------- | :-----------: | :------: | :---------: | :----: | :----------------------------------------------------: | :-----: | +| MobileNet-V1-SSLD | 608 | 8 | 270e | ---- | 31.0 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 416 | 8 | 270e | ---- | 30.6 | 
[model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | ---- | 28.4 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | + +### YOLOv3 on Pasacl VOC + +| Backbone | Input shape | Images/GPU | Lr schd | FPS | Box AP | Download | Config | +| :----------------- | :-------- | :-----------: | :------: | :---------: | :----: | :----------------------------------------------------: | :-----: | +| MobileNet-V1-SSLD | 608 | 8 | 270e | - | 78.3 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V1-SSLD | 416 | 8 | 270e | - | 79.6 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | - | 77.3 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 608 | 8 | 270e | - | 80.4 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 416 | 8 | 270e | - | 79.2 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 320 | 8 | 270e | - | 77.3 | [model](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | + +**Notes:** + +- [SSLD](https://arxiv.org/abs/2103.05959) is a knowledge distillation method. We use the stronger backbone pretrained model after distillation to further improve the detection accuracy. Please refer to the [knowledge distillation tutorial](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/en/advanced_tutorials/distillation/distillation_en.md). 
+ +![demo image](../images/ssld_model.png) + +## Citations +``` +@misc{cui2021selfsupervision, + title={Beyond Self-Supervision: A Simple Yet Effective Network Distillation Alternative to Improve Backbones}, + author={Cheng Cui and Ruoyu Guo and Yuning Du and Dongliang He and Fu Li and Zewu Wu and Qiwen Liu and Shilei Wen and Jizhou Huang and Xiaoguang Hu and Dianhai Yu and Errui Ding and Yanjun Ma}, + year={2021}, + eprint={2103.05959}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/images/000000014439.jpg b/docs/images/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..56a4f66768c439adf0fadbde7b150b520c6d09e3 Binary files /dev/null and b/docs/images/000000014439.jpg differ diff --git a/docs/images/12_Group_Group_12_Group_Group_12_935.jpg b/docs/images/12_Group_Group_12_Group_Group_12_935.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2a563361ae03fbe079dba017374eee51ccbd17dd Binary files /dev/null and b/docs/images/12_Group_Group_12_Group_Group_12_935.jpg differ diff --git a/docs/images/PedestrianDetection_001.png b/docs/images/PedestrianDetection_001.png new file mode 100644 index 0000000000000000000000000000000000000000..5194d6ff891b9507fedfc53f36de4f00219c7f30 Binary files /dev/null and b/docs/images/PedestrianDetection_001.png differ diff --git a/docs/images/PedestrianDetection_004.png b/docs/images/PedestrianDetection_004.png new file mode 100644 index 0000000000000000000000000000000000000000..7c62be5051f9a47c5f5e98ccd9f45c3fa5f30257 Binary files /dev/null and b/docs/images/PedestrianDetection_004.png differ diff --git a/docs/images/bus.jpg b/docs/images/bus.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cdbbf8c9ba9990fb228360db590e37f078160767 Binary files /dev/null and b/docs/images/bus.jpg differ diff --git a/docs/images/dog.jpg b/docs/images/dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..237c084d9b0dd5cf32e9ec5463ab027ebd148df8 Binary files /dev/null and b/docs/images/dog.jpg differ diff --git a/docs/images/fps_map.png b/docs/images/fps_map.png new file mode 100644 index 0000000000000000000000000000000000000000..0fbafcb4fb55fb3659a09b9ff20b6f82a9fe2ffc Binary files /dev/null and b/docs/images/fps_map.png differ diff --git a/docs/images/input_shape.png b/docs/images/input_shape.png new file mode 100644 index 0000000000000000000000000000000000000000..1148116f81ec78ae625f342fa51dcf778d1fb4ca Binary files /dev/null and b/docs/images/input_shape.png differ diff --git a/docs/images/instance_seg.png b/docs/images/instance_seg.png new file mode 100644 index 0000000000000000000000000000000000000000..7ba84009457640edc700805be5e48207ffa660ad Binary files /dev/null and b/docs/images/instance_seg.png differ diff --git a/docs/images/layout.jpg b/docs/images/layout.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c3ca618d30c4c04f062a7db382326ebb4d4e599 Binary files /dev/null and b/docs/images/layout.jpg differ diff --git a/docs/images/lite_demo.jpg b/docs/images/lite_demo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0eee6e84c24ee44d422f314a92a3df5d7cf2dc81 Binary files /dev/null and b/docs/images/lite_demo.jpg differ diff --git a/docs/images/mobile_fps_map.png b/docs/images/mobile_fps_map.png new file mode 100644 index 0000000000000000000000000000000000000000..2b31508332710042406ab046529148d82a0581e8 Binary files /dev/null and b/docs/images/mobile_fps_map.png differ diff --git 
a/docs/images/model_figure.png b/docs/images/model_figure.png new file mode 100644 index 0000000000000000000000000000000000000000..72ec8cdad23a49e948f39fe3091c26f7a94d74a4 Binary files /dev/null and b/docs/images/model_figure.png differ diff --git a/docs/images/picedet_demo.jpeg b/docs/images/picedet_demo.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..031f38ab484ac8f375d71029a2588a81ec23aa1e Binary files /dev/null and b/docs/images/picedet_demo.jpeg differ diff --git a/docs/images/ppdet.gif b/docs/images/ppdet.gif new file mode 100644 index 0000000000000000000000000000000000000000..9539637070da504ead2d5870eb512b438bcd5c62 Binary files /dev/null and b/docs/images/ppdet.gif differ diff --git a/docs/images/ppyolo_map_fps.png b/docs/images/ppyolo_map_fps.png new file mode 100644 index 0000000000000000000000000000000000000000..f860d220d1c831e42a23e38fc78732426c23e2cc Binary files /dev/null and b/docs/images/ppyolo_map_fps.png differ diff --git a/docs/images/ppyoloe_plus_map_fps.png b/docs/images/ppyoloe_plus_map_fps.png new file mode 100644 index 0000000000000000000000000000000000000000..dbc0e4cca60775103fd655c36c3c4092f57a24a5 Binary files /dev/null and b/docs/images/ppyoloe_plus_map_fps.png differ diff --git a/docs/images/reader_figure.png b/docs/images/reader_figure.png new file mode 100644 index 0000000000000000000000000000000000000000..68441a20cd5bc14349bfea01a3ffa66a31ac1793 Binary files /dev/null and b/docs/images/reader_figure.png differ diff --git a/docs/images/res.jpg b/docs/images/res.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6f281fa3be0053d5a919da4ee36c6005e0664daa Binary files /dev/null and b/docs/images/res.jpg differ diff --git a/docs/images/road554.png b/docs/images/road554.png new file mode 100644 index 0000000000000000000000000000000000000000..1ecd45d9403897aa048417a9b69ad06e7ce41016 Binary files /dev/null and b/docs/images/road554.png differ diff --git a/docs/images/roadsign_yml.png b/docs/images/roadsign_yml.png new file mode 100644 index 0000000000000000000000000000000000000000..242bab90bd75f7ab08c7477475222b0b37678c43 Binary files /dev/null and b/docs/images/roadsign_yml.png differ diff --git a/docs/images/ssld_model.png b/docs/images/ssld_model.png new file mode 100644 index 0000000000000000000000000000000000000000..23508712be7e6b6787575a66ca4c65037c9015c8 Binary files /dev/null and b/docs/images/ssld_model.png differ diff --git a/docs/images/yaml_show.png b/docs/images/yaml_show.png new file mode 100644 index 0000000000000000000000000000000000000000..b6319752d4f13471f2edc4a357cb9ec51ec90c75 Binary files /dev/null and b/docs/images/yaml_show.png differ diff --git a/docs/tutorials/DistributedTraining_cn.md b/docs/tutorials/DistributedTraining_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..80ff32ecb0f3bf86b30ce59db55e0fc1d4197cbe --- /dev/null +++ b/docs/tutorials/DistributedTraining_cn.md @@ -0,0 +1,50 @@ +[English](DistributedTraining_en.md) | 简体中文 + + +# 分布式训练 + +## 1. 简介 + +* 分布式训练指的是将训练任务按照一定方法拆分到多个计算节点进行计算,再按照一定的方法对拆分后计算得到的梯度等信息进行聚合与更新。飞桨分布式训练技术源自百度的业务实践,在自然语言处理、计算机视觉、搜索和推荐等领域经过超大规模业务检验。分布式训练的高性能,是飞桨的核心优势技术之一,PaddleDetection同时支持单机训练与多机训练。更多关于分布式训练的方法与文档可以参考:[分布式训练快速开始教程](https://fleet-x.readthedocs.io/en/latest/paddle_fleet_rst/parameter_server/ps_quick_start.html)。 + +## 2. 
使用方法 + +### 2.1 单机训练 + +* 以PP-YOLOE-s为例,本地准备好数据之后,使用`paddle.distributed.launch`或者`fleetrun`的接口启动训练任务即可。下面为运行脚本示例。 + +```bash +fleetrun \ +--selected_gpu 0,1,2,3,4,5,6,7 \ +tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \ +--eval &>logs.txt 2>&1 & +``` + +### 2.2 多机训练 + +* 相比单机训练,多机训练时,只需要添加`--ips`的参数,该参数表示需要参与分布式训练的机器的ip列表,不同机器的ip用逗号隔开。下面为运行代码示例。 + +```shell +ip_list="10.127.6.17,10.127.5.142,10.127.45.13,10.127.44.151" +fleetrun \ +--ips=${ip_list} \ +--selected_gpu 0,1,2,3,4,5,6,7 \ +tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \ +--eval &>logs.txt 2>&1 & +``` + +**注:** +* 不同机器的ip信息需要用逗号隔开,可以通过`ifconfig`或者`ipconfig`查看。 +* 不同机器之间需要做免密设置,且可以直接ping通,否则无法完成通信。 +* 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终`ip_list`中的第一台机器的第一块设备是trainer0,以此类推。 +* 不同机器的起始端口可能不同,建议在启动多机任务前,在不同的机器中设置相同的多机运行起始端口,命令为`export FLAGS_START_PORT=17000`,端口值建议在`10000~20000`之间。 + + +## 3. 性能效果测试 + +* 在单机和4机8卡V100的机器上,基于[PP-YOLOE-s](../../configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml)进行模型训练,模型的训练耗时情况如下所示。 + +机器 | 精度 | 耗时 +-|-|- +单机8卡 | 42.7% | 39h +4机8卡 | 42.1% | 13h diff --git a/docs/tutorials/DistributedTraining_en.md b/docs/tutorials/DistributedTraining_en.md new file mode 100644 index 0000000000000000000000000000000000000000..9fea8637340dd76fa9d416d9a40ccd88d17d86db --- /dev/null +++ b/docs/tutorials/DistributedTraining_en.md @@ -0,0 +1,44 @@ +English | [简体中文](DistributedTraining_cn.md) + + +## 1. Usage + +### 1.1 Single-machine + +* Take PP-YOLOE-s as an example, after preparing the data locally, use the interface of `paddle.distributed.launch` or `fleetrun` to start the training task. Below is an example of running the script. + +```bash +fleetrun \ +--selected_gpu 0,1,2,3,4,5,6,7 \ +tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \ +--eval &>logs.txt 2>&1 & +``` + +### 1.2 Multi-machine + +* Compared with single-machine training, when training on multiple machines, you only need to add the `--ips` parameter, which indicates the ip list of machines that need to participate in distributed training. The ips of different machines are separated by commas. Below is an example of running code. + +```shell +ip_list="10.127.6.17,10.127.5.142,10.127.45.13,10.127.44.151" +fleetrun \ +--ips=${ip_list} \ +--selected_gpu 0,1,2,3,4,5,6,7 \ +tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \ +--eval &>logs.txt 2>&1 & +``` + +**Note:** +* The ip information of different machines needs to be separated by commas, which can be viewed through `ifconfig` or `ipconfig`. +* Password-free settings are required between different machines, and they can be pinged directly, otherwise the communication cannot be completed. +* The code, data, and running commands or scripts between different machines need to be consistent, and the set training commands or scripts need to be run on all machines. The first device of the first machine in the final `ip_list` is trainer0, and so on. +* The starting port of different machines may be different. It is recommended to set the same starting port for multi-machine running in different machines before starting the multi-machine task. The command is `export FLAGS_START_PORT=17000`, and the port value is recommended to be `10000~20000`. + + +## 2. Performance + +* On single-machine and 4-machine 8-card V100 machines, model training is performed based on [PP-YOLOE-s](../../configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml). The model training time is as follows. 
+ +Machine | mAP | Time cost +-|-|- +single machine | 42.7% | 39h +4 machines | 42.1% | 13h diff --git "a/docs/tutorials/FAQ/FAQ\347\254\254\344\270\200\346\234\237.md" "b/docs/tutorials/FAQ/FAQ\347\254\254\344\270\200\346\234\237.md" new file mode 100644 index 0000000000000000000000000000000000000000..b7926d86d2135772af3e2f7a8c96f24b82d42e13 --- /dev/null +++ "b/docs/tutorials/FAQ/FAQ\347\254\254\344\270\200\346\234\237.md" @@ -0,0 +1,57 @@ +# FAQ:第一期 + +**Q:**SOLOv2训练mAP值宽幅震荡,无上升趋势,检测效果不好,检测置信度超过了1的原因是? + +**A:** SOLOv2训练不收敛的话,先更新PaddleDetection到release/2.2或者develop分支尝试。 + + + +**Q:** Optimizer中优化器支持哪几种? + +**A:** Paddle中支持的优化器[Optimizer](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/Overview_cn.html )在PaddleDetection中均支持,需要手动修改下配置文件即可。 + + + +**Q:** 在tools/infer.py加入如下函数,得到FLOPs值为-1,请问原因? + +**A:** 更新PaddleDetection到release/2.2或者develop分支,`print_flops`设为True即可打印FLOPs。 + + + +**Q:** 使用官方的ReID模块时遇到了模块未注册的问题 + +**A:** 请尝试`pip uninstall paddledet`并重新安装,或者`python setup.py install`。 + + + +**Q:** 大规模实用目标检测模型有动态图版本吗,或者可以转换为动态图版本吗? + +**A:** 大规模实用模型的动态图版本正在整理,我们正在开发更大规模的通用预训练模型,预计在2.3版本中发布。 + + + +**Q:** Develop分支下FairMot预测视频问题:预测视频时不会完全运行完毕。比如用一个300frame的视频,代码会保存预测结果的每一帧图片,但只保存到299张就没了,并且也没有预测好的视频文件生成,该如何解决? + +**A:** 已经支持自己设置帧率infer视频,请使用develop分支或release/2.2分支,命令如下: + +``` +CUDA_VISIBLE_DEVICES=0 python tools/infer_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams --video_file={your video name}.mp4 --frame_rate=20 --save_videos +``` + + + +**Q:** 使用YOLOv3模型如何通过yml文件修改输入图片尺寸? + +**A:** 模型预测部署需要用到指定的尺寸时,首先在训练前需要修改`configs/_base_/yolov3_reader.yml`中的`TrainReader`的`BatchRandomResize`中`target_size`包含指定的尺寸,训练完成后,在评估或者预测时,需要将`EvalReader`和`TestReader`中的`Resize`的`target_size`修改成对应的尺寸,如果是需要模型导出(export_model),则需要将`TestReader`中的`image_shape`修改为对应的图片输入尺寸 。 + + + +**Q:** 以前的模型都是用静态图训练的,现在想用动态图训练,但想加载原来静态图的模型作为预训练模型,可以直接用加载静态图保存的模型断点吗?如不行,有其它方法吗? + +**A:** 静态图和动态图模型的权重的key做下映射一一对应转过去是可以的,可以参考[这个代码](https://github.com/nemonameless/weights_st2dy )。但是不保证所有静态图的权重的key映射都能对应上,静态图是把背景也训练了,动态图去背景类训的,而且现有动态图模型训出来的一般都比以前静态图更高,资源时间够的情况下建议还是直接训动态图版本。 + + + +**Q:** TTFNet训练过程中hm_loss异常 + +**A:** 如果是单卡的话学习率需要对应降低8倍。另外ttfnet模型因为自身设置的学习率比较大,可能会出现其他数据集训练出现不稳定的情况。建议pretrain_weights加载官方release出的coco数据集上训练好的模型,然后将学习率再调低一些。 diff --git "a/docs/tutorials/FAQ/FAQ\347\254\254\351\233\266\346\234\237.md" "b/docs/tutorials/FAQ/FAQ\347\254\254\351\233\266\346\234\237.md" new file mode 100644 index 0000000000000000000000000000000000000000..4478495bff8e52ed1377ad8e09ee63a49ce606da --- /dev/null +++ "b/docs/tutorials/FAQ/FAQ\347\254\254\351\233\266\346\234\237.md" @@ -0,0 +1,104 @@ +# FAQ:第零期 + +**Q:** 为什么我使用单GPU训练loss会出`NaN`?
+**A:** 配置文件中原始学习率是适配多GPU训练(8x GPU),若使用单GPU训练,须对应调整学习率(例如,除以8)。 + +以[faster_rcnn_r50](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml) 为例,在静态图下计算规则表如下所示,它们是等价的,表中变化节点即为`piecewise decay`里的`boundaries`:
+ + +| GPU数 |batch size/卡| 学习率 | 最大轮数 | 变化节点 | +| :---------: | :------------:|:------------: | :-------: | :--------------: | +| 2 | 1 | 0.0025 | 720000 | [480000, 640000] | +| 4 | 1 | 0.005 | 360000 | [240000, 320000] | +| 8 | 1| 0.01 | 180000 | [120000, 160000] | + +* 上述方式适用于静态图下。在动态图中,由于训练以epoch方式计数,因此调整GPU卡数后只需要修改学习率即可,修改方式和静态图相同. + + +**Q:** 自定义数据集时,配置文件里的`num_classes`应该如何设置?
+**A:** 动态图中,自定义数据集时将`num_classes`统一设置为自定义数据集的类别数即可;静态图中(static目录下),YOLO系列模型和anchor free系列模型将`num_classes`设置为自定义数据集类别数即可,其他模型如RCNN系列、SSD、RetinaNet、SOLOv2等,由于检测原理上分类中需要区分背景框和前景框,设置的`num_classes`须为自定义数据集类别数+1,即增加一类背景类。 + +**Q:** PP-YOLOv2模型训练使用`--eval`做训练中验证,在第一次做eval的时候hang住,该如何处理?
+**A:** PP-YOLO系列模型如果只加载backbone的预训练权重从头开始训练的话收敛会比较慢,当模型还没有较好收敛的时候做预测时,由于输出的预测框比较混乱,在NMS时做排序和滤除会非常耗时,就好像eval时hang住了一样,这种情况一般发生在使用自定义数据集并且自定义数据集样本数较少导致训练到第一次做eval的时候训练轮数较少,模型还没有较好收敛的情况下,可以通过如下三个方面排查解决。 + + + +* PaddleDetection中提供的默认配置一般是采用8卡训练的配置,配置文件中的`batch_size`数为每卡的batch size,若训练的时候不是使用8卡或者对`batch_size`有修改,需要等比例的调小初始`learning_rate`来获得较好的收敛效果 + +* 如果使用自定义数据集并且样本数比较少,建议增大`snapshot_epoch`数来增加第一次进行eval的时候的训练轮数来保证模型已经较好收敛 + +* 若使用自定义数据集训练,可以加载我们发布的COCO或VOC数据集上训练好的权重进行finetune训练来加快收敛速度,可以使用`-o pretrain_weights=xxx`的方式指定预训练权重,xxx可以是Model Zoo里发布的模型权重链接 + + + + +**Q:** 如何更好的理解reader和自定义修改reader文件 +``` +# 每张GPU reader进程个数 +worker_num: 2 +# 训练数据 +TrainReader: + inputs_def: + num_max_boxes: 50 + # 训练数据transforms + sample_transforms: + - Decode: {} # 图片解码,将图片数据从numpy格式转为rgb格式,是必须存在的一个OP + - Mixup: {alpha: 1.5, beta: 1.5} # Mixup数据增强,对两个样本的gt_bbbox/gt_score操作,构建虚拟的训练样本,可选的OP + - RandomDistort: {} # 随机颜色失真,可选的OP + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} # 随机Canvas填充,可选的OP + - RandomCrop: {} # 随机裁剪,可选的OP + - RandomFlip: {} # 随机左右翻转,默认概率0.5,可选的OP + # batch_transforms + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + # 训练时batch_size + batch_size: 24 + # 读取数据是否乱序 + shuffle: true + # 是否丢弃最后不能完整组成batch的数据 + drop_last: true + # mixup_epoch,大于最大epoch,表示训练过程一直使用mixup数据增广。默认值为-1,表示不使用Mixup。如果删去- Mixup: {alpha: 1.5, beta: 1.5}这行代码则必须也将mixup_epoch设置为-1或者删除 + mixup_epoch: 25000 + # 是否通过共享内存进行数据读取加速,需要保证共享内存大小(如/dev/shm)满足大于1G + use_shared_memory: true + + 如果需要单尺度训练,则去掉batch_transforms里的BatchRandomResize这一行,在sample_transforms最后一行添加- Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + + Decode是必须保留的,如果想要去除数据增强,则可以注释或删除Mixup RandomDistort RandomExpand RandomCrop RandomFlip,注意如果注释或删除Mixup则必须也将mixup_epoch这一行注释或删除,或者设置为-1表示不使用Mixup + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + +``` +**Q:** 用户如何控制类别类别输出?即图中有多类目标只输出其中的某几类 + +**A:** 用户可自行在代码中进行修改,增加条件设置。 +``` +# filter by class_id +keep_class_id = [1, 2] +bbox_res = [e for e in bbox_res if int(e[0]) in keep_class_id] +``` +https://github.com/PaddlePaddle/PaddleDetection/blob/b87a1ea86fa18ce69e44a17ad1b49c1326f19ff9/ppdet/engine/trainer.py#L438 + +**Q:** 用户自定义数据集训练,预测结果标签错误 + +**A:** 此类情况往往是用户在设置数据集路径时候,并没有关注TestDataset中anno_path的路径问题。需要用户将anno_path设置成自己的路径。 +``` +TestDataset: + !ImageFolder + anno_path: annotations/instances_val2017.json +``` + +**Q:** 如何打印网络FLOPs? + +**A:** 在`configs/runtime.yml`中设置`print_flops: true`,同时需要安装PaddleSlim(比如:pip install paddleslim),即可打印模型的FLOPs。 + +**Q:** 如何使用无标注框进行训练? 
+ +**A:** 在`configs/dataset/coco.py` 或者`configs/dataset/voc.py`中的TrainDataset下设置`allow_empty: true`, 此时允许数据集加载无标注框进行训练。该功能支持coco,voc数据格式,RCNN系列和YOLO系列模型验证能够正常训练。另外,如果无标注框数据过多,会影响模型收敛,在TrainDataset下可以设置`empty_ratio: 0.1`对无标注框数据进行随机采样,控制无标注框的数据量占总数据量的比例,默认值为1.,即使用全部无标注框 diff --git a/docs/tutorials/FAQ/README.md b/docs/tutorials/FAQ/README.md new file mode 100644 index 0000000000000000000000000000000000000000..67d688600f1e93455f5ac700ff1b51fcc1bbb375 --- /dev/null +++ b/docs/tutorials/FAQ/README.md @@ -0,0 +1,6 @@ +# FAQ/常见问题 + +**PaddleDetection**非常感谢各位开发者提出任何使用问题或需求,我们根据大家的提问,总结**FAQ/常见问题**合集,并在**每周一**进行更新,以下是往期的FAQ,欢迎大家进行查阅。 + +- [FAQ:第零期](./FAQ第零期.md) +- [FAQ:第一期](./FAQ第一期.md) diff --git a/docs/tutorials/GETTING_STARTED.md b/docs/tutorials/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..6ed4043a2f69fcbba19e757c6abdaa6b8507fc7b --- /dev/null +++ b/docs/tutorials/GETTING_STARTED.md @@ -0,0 +1,146 @@ +English | [简体中文](GETTING_STARTED_cn.md) + +# Getting Started + +## Installation + +For setting up the running environment, please refer to [installation +instructions](INSTALL_cn.md). + + + +## Data preparation + +- Please refer to [PrepareDetDataSet](./data/PrepareDetDataSet_en.md) for data preparation +- Please set the data path for data configuration file in ```configs/datasets``` + +## Training & Evaluation & Inference + +PaddleDetection provides scripts for training, evalution and inference with various features according to different configure. And for more distribued training details see [DistributedTraining].(./DistributedTraining_en.md) + +```bash +# training on single-GPU +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml +# training on multi-GPU +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml +# training on multi-machines and multi-GPUs +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +$fleetrun --ips="10.127.6.17,10.127.5.142,10.127.45.13,10.127.44.151" --selected_gpu 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml +# GPU evaluation +export CUDA_VISIBLE_DEVICES=0 +python tools/eval.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams +# Inference +python tools/infer.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --infer_img=demo/000000570688.jpg -o weights=https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams +``` + +### Other argument list + +list below can be viewed by `--help` + +| FLAG | script supported | description | default | remark | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | Select config file | None | **required**, such as `-c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml` | +| -o | ALL | Set parameters in configure file | None | `-o` has higher priority to file configured by `-c`. 
Such as `-o use_gpu=False` | +| --eval | train | Whether to perform evaluation in training | False | set `--eval` if needed | +| -r/--resume_checkpoint | train | Checkpoint path for resuming training | None | such as `-r output/faster_rcnn_r50_1x_coco/10000` | +| --slim_config | ALL | Configure file of slim method | None | such as `--slim_config configs/slim/prune/yolov3_prune_l1_norm.yml` | +| --use_vdl | train/infer | Whether to record the data with [VisualDL](https://github.com/paddlepaddle/visualdl), so as to display in VisualDL | False | VisualDL requires Python>=3.5 | +| --vdl\_log_dir | train/infer | VisualDL logging directory for image | train:`vdl_log_dir/scalar` infer: `vdl_log_dir/image` | VisualDL requires Python>=3.5 | +| --output_eval | eval | Directory for storing the evaluation output | None | such as `--output_eval=eval_output`, default is current directory | +| --json_eval | eval | Whether to evaluate with already existed bbox.json or mask.json | False | set `--json_eval` if needed and json path is set in `--output_eval` | +| --classwise | eval | Whether to eval AP for each class and draw PR curve | False | set `--classwise` if needed | +| --output_dir | infer | Directory for storing the output visualization files | `./output` | such as `--output_dir output` | +| --draw_threshold | infer | Threshold to reserve the result for visualization | 0.5 | such as `--draw_threshold 0.7` | +| --infer_dir | infer | Directory for images to perform inference on | None | One of `infer_dir` and `infer_img` is requied | +| --infer_img | infer | Image path | None | One of `infer_dir` and `infer_img` is requied, `infer_img` has higher priority over `infer_dir` | +| --save_results | infer | Whether to save detection results to file | False | Optional + + + +## Examples + +### Training + +- Perform evaluation in training + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --eval + ``` + + Perform training and evalution alternatively and evaluate at each end of epoch. Meanwhile, the best model with highest MAP is saved at each epoch which has the same path as `model_final`. + + If evaluation dataset is large, we suggest modifing `snapshot_epoch` in `configs/runtime.yml` to decrease evaluation times or evaluating after training. + +- Fine-tune other task + + When using pre-trained model to fine-tune other task, pretrain\_weights can be used directly. The parameters with different shape will be ignored automatically. For example: + + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # If the shape of parameters in program is different from pretrain_weights, + # then PaddleDetection will not use such parameters. + python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + -o pretrain_weights=output/faster_rcnn_r50_1x_coco/model_final \ + ``` + +##### NOTES + +- `CUDA_VISIBLE_DEVICES` can specify different gpu numbers. Such as: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. +- Dataset will be downloaded automatically and cached in `~/.cache/paddle/dataset` if not be found locally. +- Pretrained model is downloaded automatically and cached in `~/.cache/paddle/weights`. +- Checkpoints are saved in `output` by default, and can be revised from `save_dir` in `configs/runtime.yml`. 
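As a reference for the fine-tuning note above ("the parameters with different shape will be ignored automatically"), the snippet below sketches that matching logic with plain Paddle APIs: only tensors whose names and shapes agree with the current model are copied from the checkpoint, while everything else keeps its initialization. This is only a simplified illustration, not PaddleDetection's actual checkpoint loader; the helper name `load_matching_weights` is made up for this example.

```python
import paddle

def load_matching_weights(model: paddle.nn.Layer, pretrain_path: str):
    """Minimal sketch: load only name/shape-matching tensors from a .pdparams state dict."""
    pretrained = paddle.load(pretrain_path)      # state dict saved with paddle.save
    model_state = model.state_dict()
    matched = {
        name: value
        for name, value in pretrained.items()
        if name in model_state and list(value.shape) == list(model_state[name].shape)
    }
    skipped = [name for name in pretrained if name not in matched]
    model.set_state_dict(matched)                # unmatched tensors keep their initial values
    print(f"loaded {len(matched)} tensors, skipped {len(skipped)} (name/shape mismatch)")
```

In practice you only pass `-o pretrain_weights=...` (or set `pretrain_weights` in the config) as shown above, and the framework applies this kind of filtering for you.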
+ + +### Evaluation + +- Evaluate by specified weights path and dataset path + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python -u tools/eval.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams + ``` + + The path of model to be evaluted can be both local path and link in [MODEL_ZOO](../MODEL_ZOO_cn.md). + +- Evaluate with json + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python tools/eval.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + --json_eval \ + -output_eval evaluation/ + ``` + + The json file must be named bbox.json or mask.json, placed in the `evaluation/` directory. + + +### Inference + +- Output specified directory && Set up threshold + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python tools/infer.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml \ + --infer_img=demo/000000570688.jpg \ + --output_dir=infer_output/ \ + --draw_threshold=0.5 \ + -o weights=output/faster_rcnn_r50_fpn_1x_coco/model_final \ + --use_vdl=True + ``` + + `--draw_threshold` is an optional argument. Default is 0.5. + Different thresholds will produce different results depending on the calculation of [NMS](https://ieeexplore.ieee.org/document/1699659). + + +## Deployment + +Please refer to [depolyment](../../deploy/README_en.md) + +## Model Compression + +Please refer to [slim](../../configs/slim/README_en.md) diff --git a/docs/tutorials/GETTING_STARTED_cn.md b/docs/tutorials/GETTING_STARTED_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c0230f514746344b0d2ad1f1c36f8c68c5c4e45d --- /dev/null +++ b/docs/tutorials/GETTING_STARTED_cn.md @@ -0,0 +1,266 @@ +[English](GETTING_STARTED.md) | 简体中文 + + +# 30分钟快速上手PaddleDetection + +PaddleDetection作为成熟的目标检测开发套件,提供了从数据准备、模型训练、模型评估、模型导出到模型部署的全流程。在这个章节里面,我们以路标检测数据集为例,提供快速上手PaddleDetection的流程。 + +## 1 安装 + +关于安装配置运行环境,请参考[安装指南](INSTALL_cn.md) +在本演示案例中,假定用户将PaddleDetection的代码克隆并放置在`/home/paddle`目录中。用户执行的命令操作均在`/home/paddle/PaddleDetection`目录下完成 + +## 2 准备数据 +目前PaddleDetection支持:COCO VOC WiderFace, MOT四种数据格式。 +- 首先按照[准备数据文档](./data/PrepareDetDataSet.md) 准备数据。 +- 然后设置`configs/datasets`中相应的coco或voc等数据配置文件中的数据路径。 +- 在本项目中,我们使用路标识别数据集 + ```bash +python dataset/roadsign_voc/download_roadsign_voc.py +``` +- 下载后的数据格式为 +``` + ├── download_roadsign_voc.py + ├── annotations + │ ├── road0.xml + │ ├── road1.xml + │ | ... + ├── images + │ ├── road0.png + │ ├── road1.png + │ | ... + ├── label_list.txt + ├── train.txt + ├── valid.txt +``` + +## 3 配置文件改动和说明 +我们使用`configs/yolov3/yolov3_mobilenet_v1_roadsign`配置进行训练。 +在静态图版本下,一个模型往往可以通过两个配置文件(一个主配置文件、一个reader的读取配置)实现,在PaddleDetection 2.0后续版本,采用了模块解耦设计,用户可以组合配置模块实现检测器,并可自由修改覆盖各模块配置,如下图所示 + + +
图:配置文件摘要
+ + +从上图看到`yolov3_mobilenet_v1_roadsign.yml`配置需要依赖其他的配置文件。在该例子中需要依赖: + +```bash + roadsign_voc.yml + + runtime.yml + + optimizer_40e.yml + + yolov3_mobilenet_v1.yml + + yolov3_reader.yml +-------------------------------------- + +yolov3_mobilenet_v1_roadsign.yml 主配置文件入口 + +roadsign_voc.yml 主要说明了训练数据和验证数据的路径 + +runtime.yml 主要说明了公共的运行参数,比如说是否使用GPU、每多少个epoch存储checkpoint等 + +optimizer_40e.yml 主要说明了学习率和优化器的配置 + +yolov3_mobilenet_v1.yml 主要说明模型和主干网络的情况 + +yolov3_reader.yml 主要说明数据读取器配置,如batch size,并发加载子进程数等,同时包含读取后预处理操作,如resize、数据增强等等 +``` + 
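为了更直观地确认上述配置组合(合并)后的最终结果,可以用 `ppdet` 提供的 `load_config` 接口加载主配置文件并打印感兴趣的字段。下面是一个简单的示意脚本(假定已按本文档完成安装,并在 PaddleDetection 根目录下运行;注释中的来源仅为说明,具体取值以实际配置文件为准):

```python
# 示意脚本:查看 yolov3_mobilenet_v1_roadsign.yml 及其依赖配置合并后的结果
from ppdet.core.workspace import load_config

cfg = load_config('configs/yolov3/yolov3_mobilenet_v1_roadsign.yml')  # 会自动合并 _BASE_ 中列出的配置文件

print(cfg['architecture'])               # 模型结构,来自 yolov3_mobilenet_v1.yml
print(cfg['epoch'])                      # 训练轮数,来自 optimizer_40e.yml
print(cfg['num_classes'])                # 类别数,来自 roadsign_voc.yml
print(cfg['TrainReader']['batch_size'])  # 数据读取器配置,来自 yolov3_reader.yml
```

这样可以在不启动训练的情况下,确认各个依赖配置是否按预期被加载和覆盖。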
图:配置文件结构说明
+ +### 修改配置文件说明 +* 关于数据的路径修改说明 +在修改配置文件中,用户如何实现自定义数据集是非常关键的一步,如何定义数据集请参考[如何自定义数据集](https://aistudio.baidu.com/aistudio/projectdetail/1917140) +* 默认学习率是适配多GPU训练(8x GPU),若使用单GPU训练,须对应调整学习率(例如,除以8) +* 更多使用问题,请参考[FAQ](FAQ) + +## 4 训练 + +PaddleDetection提供了单卡/多卡训练模式,满足用户多种训练需求 +* GPU单卡训练 +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml +``` + +* GPU多卡训练 +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #windows和Mac下不需要执行该命令 +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml +``` + +* [GPU多机多卡训练](./DistributedTraining_cn.md) +```bash +$fleetrun \ +--ips="10.127.6.17,10.127.5.142,10.127.45.13,10.127.44.151" \ +--selected_gpu 0,1,2,3,4,5,6,7 \ +tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ +``` + +* Fine-tune其他任务 + + 使用预训练模型fine-tune其他任务时,可以直接加载预训练模型,形状不匹配的参数将自动忽略,例如: + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # 如果模型中参数形状与加载权重形状不同,将不会加载这类参数 +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o pretrain_weights=output/model_final +``` + +* 模型恢复训练 + + 在日常训练过程中,有的用户由于一些原因导致训练中断,用户可以使用-r的命令恢复训练 + +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -r output/faster_rcnn_r50_1x_coco/10000 + ``` + +## 5 评估 +* 默认将训练生成的模型保存在当前`output`文件夹下 + ```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/eval.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_roadsign.pdparams +``` +* 边训练,边评估 + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #windows和Mac下不需要执行该命令 +python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --eval +``` + + 在训练中交替执行评估, 评估在每个epoch训练结束后开始。每次评估后还会评出最佳mAP模型保存到`best_model`文件夹下。 + + 如果验证集很大,测试将会比较耗时,建议调整`configs/runtime.yml` 文件中的 `snapshot_epoch`配置以减少评估次数,或训练完成后再进行评估。 + +- 通过json文件评估 + +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/eval.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ + --json_eval \ + -output_eval evaluation/ +``` +* 上述命令中没有加载模型的选项,则使用配置文件中weights的默认配置,`weights`表示训练过程中保存的最后一轮模型文件 + +* json文件必须命名为bbox.json或者mask.json,放在`evaluation`目录下。 + +## 6 预测 + + ```bash + python tools/infer.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --infer_img=demo/000000570688.jpg -o weights=https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_roadsign.pdparams + ``` + * 设置参数预测 + + ```bash + export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 + python tools/infer.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ + --infer_img=demo/road554.png \ + --output_dir=infer_output/ \ + --draw_threshold=0.5 \ + -o weights=output/yolov3_mobilenet_v1_roadsign/model_final \ + --use_vdl=True + ``` + + `--draw_threshold` 是个可选参数. 根据 [NMS](https://ieeexplore.ieee.org/document/1699659) 的计算,不同阈值会产生不同的结果 + `keep_top_k`表示设置输出目标的最大数量,默认值为100,用户可以根据自己的实际情况进行设定。 + +结果如下图: + +![road554 image](../images/road554.png) + +## 7 训练可视化 + +当打开`use_vdl`开关后,为了方便用户实时查看训练过程中状态,PaddleDetection集成了VisualDL可视化工具,当打开`use_vdl`开关后,记录的数据包括: +1. loss变化趋势 +2. 
mAP变化趋势 + +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ + --use_vdl=true \ + --vdl_log_dir=vdl_dir/scalar +``` + +使用如下命令启动VisualDL查看日志: +```shell +# 下述命令会在127.0.0.1上启动一个服务,支持通过前端web页面查看,可以通过--host这个参数指定实际ip地址 +visualdl --logdir vdl_dir/scalar/ +``` + +在浏览器输入提示的网址,效果如下: +
图:VDL效果演示
+ +**参数列表** + +以下列表可以通过`--help`查看 + +| FLAG | 支持脚本 | 用途 | 默认值 | 备注 | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | 指定配置文件 | None | **必选**,例如-c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml | +| -o | ALL | 设置或更改配置文件里的参数内容 | None | 相较于`-c`设置的配置文件有更高优先级,例如:`-o use_gpu=False` | +| --eval | train | 是否边训练边测试 | False | 如需指定,直接`--eval`即可 | +| -r/--resume_checkpoint | train | 恢复训练加载的权重路径 | None | 例如:`-r output/faster_rcnn_r50_1x_coco/10000` | +| --slim_config | ALL | 模型压缩策略配置文件 | None | 例如`--slim_config configs/slim/prune/yolov3_prune_l1_norm.yml` | +| --use_vdl | train/infer | 是否使用[VisualDL](https://github.com/paddlepaddle/visualdl)记录数据,进而在VisualDL面板中显示 | False | VisualDL需Python>=3.5 | +| --vdl\_log_dir | train/infer | 指定 VisualDL 记录数据的存储路径 | train:`vdl_log_dir/scalar` infer: `vdl_log_dir/image` | VisualDL需Python>=3.5 | +| --output_eval | eval | 评估阶段保存json路径 | None | 例如 `--output_eval=eval_output`, 默认为当前路径 | +| --json_eval | eval | 是否通过已存在的bbox.json或者mask.json进行评估 | False | 如需指定,直接`--json_eval`即可, json文件路径在`--output_eval`中设置 | +| --classwise | eval | 是否评估单类AP和绘制单类PR曲线 | False | 如需指定,直接`--classwise`即可 | +| --output_dir | infer/export_model | 预测后结果或导出模型保存路径 | `./output` | 例如`--output_dir=output` | +| --draw_threshold | infer | 可视化时分数阈值 | 0.5 | 例如`--draw_threshold=0.7` | +| --infer_dir | infer | 用于预测的图片文件夹路径 | None | `--infer_img`和`--infer_dir`必须至少设置一个 | +| --infer_img | infer | 用于预测的图片路径 | None | `--infer_img`和`--infer_dir`必须至少设置一个,`infer_img`具有更高优先级 | +| --save_results | infer | 是否在文件夹下将图片的预测结果保存到文件中 | False | 可选 | + + +## 8 模型导出 + +在模型训练过程中保存的模型文件是包含前向预测和反向传播的过程,在实际的工业部署则不需要反向传播,因此需要将模型进行导成部署需要的模型格式。 +在PaddleDetection中提供了 `tools/export_model.py`脚本来导出模型 + +```bash +python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --output_dir=./inference_model \ + -o weights=output/yolov3_mobilenet_v1_roadsign/best_model +``` +预测模型会导出到`inference_model/yolov3_mobilenet_v1_roadsign`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`,`model.pdmodel` 如果不指定文件夹,模型则会导出在`output_inference` + +* 更多关于模型导出的文档,请参考[模型导出文档](../../deploy/EXPORT_MODEL.md) + +## 9 模型压缩 + +为了进一步对模型进行优化,PaddleDetection提供了基于PaddleSlim进行模型压缩的完整教程和benchmark。目前支持的方案: +* 裁剪 +* 量化 +* 蒸馏 +* 联合策略 +* 更多关于模型压缩的文档,请参考[模型压缩文档](../../configs/slim/README.md)。 +## 10 预测部署 +PaddleDetection提供了PaddleInference、PaddleServing、PaddleLite多种部署形式,支持服务端、移动端、嵌入式等多种平台,提供了完善的Python和C++部署方案。 +* 在这里,我们以Python为例,说明如何使用PaddleInference进行模型部署 +```bash +python deploy/python/infer.py --model_dir=./output_inference/yolov3_mobilenet_v1_roadsign --image_file=demo/road554.png --device=GPU +``` +* 同时`infer.py`提供了丰富的接口,用户进行接入视频文件、摄像头进行预测,更多内容请参考[Python端预测部署](../../deploy/python) +### PaddleDetection支持的部署形式说明 +|形式|语言|教程|设备/平台| +|-|-|-|-| +|PaddleInference|Python|已完善|Linux(arm X86)、Windows +|PaddleInference|C++|已完善|Linux(arm X86)、Windows| +|PaddleServing|Python|已完善|Linux(arm X86)、Windows| +|PaddleLite|C++|已完善|Android、IOS、FPGA、RK... + +* 更多关于预测部署的文档,请参考[预测部署文档](../../deploy/README.md)。 diff --git a/docs/tutorials/INSTALL.md b/docs/tutorials/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..db11a2532a96b4efe00de08878dea320983dcdf4 --- /dev/null +++ b/docs/tutorials/INSTALL.md @@ -0,0 +1,128 @@ +English | [简体中文](INSTALL_cn.md) + +# Installation + + +This document covers how to install PaddleYOLO and its dependencies +(including PaddlePaddle), together with COCO and Pascal VOC dataset. 
+ +For general information about PaddleYOLO, please see [README.md](https://github.com/PaddlePaddle/PaddleYOLO/tree/develop). + +## Requirements: + +- PaddlePaddle 2.3.2 +- OS 64 bit +- Python 3(3.5.1+/3.6/3.7/3.8/3.9),64 bit +- pip/pip3(9.0.1+), 64 bit +- CUDA >= 10.2 +- cuDNN >= 7.6 + + +Dependency of PaddleYOLO and PaddlePaddle: + +| PaddleYOLO version | PaddlePaddle version | tips | +| :----------------: | :---------------: | :-------: | +| develop | >= 2.3.2 | Dygraph mode is set as default | +| release/2.6 | >= 2.3.2 | Dygraph mode is set as default | +| release/2.5 | >= 2.2.2 | Dygraph mode is set as default | + +## Instruction + +### 1. Install PaddlePaddle + +``` + +# CUDA10.2 +python -m pip install paddlepaddle-gpu==2.3.2 -i https://mirror.baidu.com/pypi/simple + +# CPU +python -m pip install paddlepaddle==2.3.2 -i https://mirror.baidu.com/pypi/simple +``` + +- For more CUDA version or environment to quick install, please refer to the [PaddlePaddle Quick Installation document](https://www.paddlepaddle.org.cn/install/quick) +- For more installation methods such as conda or compile with source code, please refer to the [installation document](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html) + +Please make sure that your PaddlePaddle is installed successfully and the version is not lower than the required version. Use the following command to verify. + +``` +# check +>>> import paddle +>>> paddle.utils.run_check() + +# confirm the paddle's version +python -c "import paddle; print(paddle.__version__)" +``` + +**Note** + +1. If you want to use PaddleDetection on multi-GPU, please install NCCL at first. + + +### 2. Install PaddleDetection + + + +**Note:** Installing via pip only supports Python3 + +``` + +# Clone PaddleDetection repository +cd +git clone https://github.com/PaddlePaddle/PaddleDetection.git + +# Install other dependencies +cd PaddleDetection +pip install -r requirements.txt + +# Compile and install paddledet +python setup.py install + +``` + +**Note** + +1. If you are working on Windows OS, `pycocotools` installing may failed because of the origin version of cocoapi does not support windows, another version can be used used which only supports Python3: + + ```pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI``` + +2. If you are using Python <= 3.6, `pycocotools` installing may failed with error like `distutils.errors.DistutilsError: Could not find suitable distribution for Requirement.parse('cython>=0.27.3')`, please install `cython` firstly, for example `pip install cython` + +After installation, make sure the tests pass: + +```shell +python ppdet/modeling/tests/test_architectures.py +``` + +If the tests are passed, the following information will be prompted: + +``` +....... +---------------------------------------------------------------------- +Ran 7 tests in 12.816s +OK +``` + +## Use built Docker images + +> If you do not have a Docker environment, please refer to [Docker](https://www.docker.com/). + +We provide docker images containing the latest PaddleDetection code, and all environment and package dependencies are pre-installed. All you have to do is to **pull and run the docker image**. Then you can enjoy PaddleDetection without any extra steps. + +Get these images and guidance in [docker hub](https://hub.docker.com/repository/docker/paddlecloud/paddledetection), including CPU, GPU, ROCm environment versions. 
+ +If you have some customized requirements about automatic building docker images, you can get it in github repo [PaddlePaddle/PaddleCloud](https://github.com/PaddlePaddle/PaddleCloud/tree/main/tekton). + +## Inference demo + +**Congratulation!** Now you have installed PaddleDetection successfully and try our inference demo: + +``` +# Predict an image by GPU +export CUDA_VISIBLE_DEVICES=0 +python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o use_gpu=true weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +An image of the same name with the predicted result will be generated under the `output` folder. +The result is as shown below: + +![](../images/000000014439.jpg) diff --git a/docs/tutorials/INSTALL_cn.md b/docs/tutorials/INSTALL_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..33001aa7108f1fce8ae4484a54b5f4c71fde88e6 --- /dev/null +++ b/docs/tutorials/INSTALL_cn.md @@ -0,0 +1,121 @@ +[English](INSTALL.md) | 简体中文 + + +# 安装文档 + + + +## 环境要求 + +- PaddlePaddle 2.3.2 +- OS 64位操作系统 +- Python 3(3.5.1+/3.6/3.7/3.8/3.9),64位版本 +- pip/pip3(9.0.1+),64位版本 +- CUDA >= 10.2 +- cuDNN >= 7.6 + +PaddleYOLO 依赖 PaddlePaddle 版本关系: + +| PaddleYOLO版本 | PaddlePaddle版本 | 备注 | +| :------------------: | :---------------: | :-------: | +| develop | >= 2.3.2 | 默认使用动态图模式 | +| release/2.6 | >= 2.3.2 | 默认使用动态图模式 | +| release/2.5 | >= 2.2.2 | 默认使用动态图模式 | + +## 安装说明 + +### 1. 安装PaddlePaddle + +``` +# CUDA10.2 +python -m pip install paddlepaddle-gpu==2.3.2 -i https://mirror.baidu.com/pypi/simple + +# CPU +python -m pip install paddlepaddle==2.3.2 -i https://mirror.baidu.com/pypi/simple +``` +- 更多CUDA版本或环境快速安装,请参考[PaddlePaddle快速安装文档](https://www.paddlepaddle.org.cn/install/quick) +- 更多安装方式例如conda或源码编译安装方法,请参考[PaddlePaddle安装文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html) + +请确保您的PaddlePaddle安装成功并且版本不低于需求版本。使用以下命令进行验证。 + +``` +# 在您的Python解释器中确认PaddlePaddle安装成功 +>>> import paddle +>>> paddle.utils.run_check() + +# 确认PaddlePaddle版本 +python -c "import paddle; print(paddle.__version__)" +``` +**注意** +1. 如果您希望在多卡环境下使用PaddleDetection,请首先安装NCCL + +### 2. 安装PaddleDetection + + + + +**注意:** pip安装方式只支持Python3 + + + +``` +# 克隆PaddleDetection仓库 +cd +git clone https://github.com/PaddlePaddle/PaddleDetection.git + +# 安装其他依赖 +cd PaddleDetection +pip install -r requirements.txt + +# 编译安装paddledet +python setup.py install +``` + +**注意** +1. 如果github下载代码较慢,可尝试使用[gitee](https://gitee.com/PaddlePaddle/PaddleDetection.git)或者[代理加速](https://doc.fastgit.org/zh-cn/guide.html)。 + +1. 若您使用的是Windows系统,由于原版cocoapi不支持Windows,`pycocotools`依赖可能安装失败,可采用第三方实现版本,该版本仅支持Python3 + + ```pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI``` + +2. 若您使用的是Python <= 3.6的版本,安装`pycocotools`可能会报错`distutils.errors.DistutilsError: Could not find suitable distribution for Requirement.parse('cython>=0.27.3')`, 您可通过先安装`cython`如`pip install cython`解决该问题 + + +安装后确认测试通过: + +``` +python ppdet/modeling/tests/test_architectures.py +``` + +测试通过后会提示如下信息: + +``` +....... 
+---------------------------------------------------------------------- +Ran 7 tests in 12.816s +OK +``` + +## 使用Docker镜像 +> 如果您没有Docker运行环境,请参考[Docker官网](https://www.docker.com/)进行安装。 + +我们提供了包含最新 PaddleDetection 代码的docker镜像,并预先安装好了所有的环境和库依赖,您只需要**拉取docker镜像**,然后**运行docker镜像**,无需其他任何额外操作,即可开始使用PaddleDetection的所有功能。 + +在[Docker Hub](https://hub.docker.com/repository/docker/paddlecloud/paddledetection)中获取这些镜像及相应的使用指南,包括CPU、GPU、ROCm版本。 +如果您对自动化制作docker镜像感兴趣,或有自定义需求,请访问[PaddlePaddle/PaddleCloud](https://github.com/PaddlePaddle/PaddleCloud/tree/main/tekton)做进一步了解。 + +## 快速体验 + +**恭喜!** 您已经成功安装了PaddleDetection,接下来快速体验目标检测效果 + +``` +# 在GPU上预测一张图片 +export CUDA_VISIBLE_DEVICES=0 +python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o use_gpu=true weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +会在`output`文件夹下生成一个画有预测结果的同名图像。 + +结果如下图: + +![](../images/000000014439.jpg) diff --git a/docs/tutorials/QUICK_STARTED.md b/docs/tutorials/QUICK_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..b4e7ae9491d38ca07154b67e9523b7ae8d77ca7a --- /dev/null +++ b/docs/tutorials/QUICK_STARTED.md @@ -0,0 +1,91 @@ +English | [简体中文](QUICK_STARTED_cn.md) + +# Quick Start +In order to enable users to experience PaddleDetection and produce models in a short time, this tutorial introduces the pipeline to get a decent object detection model by finetuning on a small dataset in 10 minutes only. In practical applications, it is recommended that users select a suitable model configuration file for their specific demand. + +- **Set GPU** + + +```bash +export CUDA_VISIBLE_DEVICES=0 +``` + +## Inference Demo with Pre-trained Models + +``` +# predict an image using PP-YOLO +python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o use_gpu=true weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +the result: + +![](../images/000000014439.jpg) + + +## Data preparation +The Dataset is [Kaggle dataset](https://www.kaggle.com/andrewmvd/road-sign-detection) ,including 877 images and 4 data categories: crosswalk, speedlimit, stop, trafficlight. The dataset is divided into training set (701 images) and test set (176 images),[download link](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar). + +``` +# Note: this command could skip and +# the dataset will be dowloaded automatically at the stage of training. +python dataset/roadsign_voc/download_roadsign_voc.py +``` + +## Training & Evaluation & Inference +### 1、Training +``` +# It will takes about 10 minutes on 1080Ti and 1 hour on CPU +# -c set configuration file +# -o overwrite the settings in the configuration file +# --eval Evaluate while training, and a model named best_model.pdmodel with the most evaluation results will be automatically saved + + +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --eval -o use_gpu=true +``` + +If you want to observe the loss change curve in real time through VisualDL, add --use_vdl=true to the training command, and set the log save path through --vdl_log_dir. 
+ +**Note: VisualDL need Python>=3.5** + +Please install [VisualDL](https://github.com/PaddlePaddle/VisualDL) first + +``` +python -m pip install visualdl -i https://mirror.baidu.com/pypi/simple +``` + +``` +python -u tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ + --use_vdl=true \ + --vdl_log_dir=vdl_dir/scalar \ + --eval +``` +View the change curve in real time through the visualdl command: +``` +visualdl --logdir vdl_dir/scalar/ --host --port +``` + +### 2、Evaluation +``` +# Evaluate best_model by default +# -c set config file +# -o overwrite the settings in the configuration file + +python tools/eval.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o use_gpu=true +``` + +The final mAP should be around 0.85. The dataset is small so the precision may vary a little after each training. + + +### 3、Inference +``` +# -c set config file +# -o overwrite the settings in the configuration file +# --infer_img image path +# After the prediction is over, an image of the same name with the prediction result will be generated in the output folder + +python tools/infer.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o use_gpu=true --infer_img=demo/road554.png +``` + +The result is as shown below: + +![](../images/road554.png) diff --git a/docs/tutorials/QUICK_STARTED_cn.md b/docs/tutorials/QUICK_STARTED_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d201e7fc901c2aef8b1ce95ebc9ba087ed3a1d9e --- /dev/null +++ b/docs/tutorials/QUICK_STARTED_cn.md @@ -0,0 +1,88 @@ +[English](QUICK_STARTED.md) | 简体中文 + +# 快速开始 +为了使得用户能够在很短时间内快速产出模型,掌握PaddleDetection的使用方式,这篇教程通过一个预训练检测模型对小数据集进行finetune。在较短时间内即可产出一个效果不错的模型。实际业务中,建议用户根据需要选择合适模型配置文件进行适配。 + +- **设置显卡** +```bash +export CUDA_VISIBLE_DEVICES=0 +``` + +## 一、快速体验 +``` +# 用PP-YOLO算法在COCO数据集上预训练模型预测一张图片 +python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o use_gpu=true weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +结果如下图: + +![demo image](../images/000000014439.jpg) + + +## 二、准备数据 +数据集参考[Kaggle数据集](https://www.kaggle.com/andrewmvd/road-sign-detection) ,包含877张图像,数据类别4类:crosswalk,speedlimit,stop,trafficlight。 +将数据划分为训练集701张图和测试集176张图,[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar). 
+ +``` +# 注意:可跳过这步下载,后面训练会自动下载 +python dataset/roadsign_voc/download_roadsign_voc.py +``` + + +## 三、训练、评估、预测 +### 1、训练 +``` +# 边训练边测试 CPU需要约1小时(use_gpu=false),1080Ti GPU需要约10分钟 +# -c 参数表示指定使用哪个配置文件 +# -o 参数表示指定配置文件中的全局变量(覆盖配置文件中的设置),这里设置使用gpu +# --eval 参数表示边训练边评估,最后会自动保存一个名为model_final.pdparams的模型 + +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --eval -o use_gpu=true +``` + +如果想通过VisualDL实时观察loss变化曲线,在训练命令中添加--use_vdl=true,以及通过--vdl_log_dir设置日志保存路径。 + +**但注意VisualDL需Python>=3.5** + +首先安装[VisualDL](https://github.com/PaddlePaddle/VisualDL) +``` +python -m pip install visualdl -i https://mirror.baidu.com/pypi/simple +``` + +``` +python -u tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml \ + --use_vdl=true \ + --vdl_log_dir=vdl_dir/scalar \ + --eval +``` +通过visualdl命令实时查看变化曲线: +``` +visualdl --logdir vdl_dir/scalar/ --host --port +``` + + +### 2、评估 +``` +# 评估 默认使用训练过程中保存的model_final.pdparams +# -c 参数表示指定使用哪个配置文件 +# -o 参数表示指定配置文件中的全局变量(覆盖配置文件中的设置) +# 目前只支持单卡评估 + +python tools/eval.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o use_gpu=true +``` +最终模型精度在mAP=0.85左右,由于数据集较小因此每次训练结束后精度会有一定波动 + + +### 3、预测 +``` +# -c 参数表示指定使用哪个配置文件 +# -o 参数表示指定配置文件中的全局变量(覆盖配置文件中的设置) +# --infer_img 参数指定预测图像路径 +# 预测结束后会在output文件夹中生成一张画有预测结果的同名图像 + +python tools/infer.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml -o use_gpu=true --infer_img=demo/road554.png +``` + +结果如下图: + +![road554 image](../images/road554.png) diff --git a/docs/tutorials/config_annotation/multi_scale_test_config.md b/docs/tutorials/config_annotation/multi_scale_test_config.md new file mode 100644 index 0000000000000000000000000000000000000000..1b6b6bb1fd4d08e696ad8d0d729e18207f3220d8 --- /dev/null +++ b/docs/tutorials/config_annotation/multi_scale_test_config.md @@ -0,0 +1,45 @@ +# Multi Scale Test Configuration + +Tags: Configuration + +--- +```yaml + +##################################### Multi scale test configuration ##################################### + +EvalReader: + sample_transforms: + - Decode: {} + - MultiscaleTestResize: {origin_target_size: [800, 1333], target_size: [700 , 900]} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + +TestReader: + sample_transforms: + - Decode: {} + - MultiscaleTestResize: {origin_target_size: [800, 1333], target_size: [700 , 900]} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} +``` + +--- + +Multi Scale Test is a TTA (Test Time Augmentation) method, it can improve object detection performance. + +The input image will be scaled into different scales, then model generated predictions (bboxes) at different scales, finally all the predictions will be combined to generate final prediction. (Here **NMS** is used to aggregate the predictions.) + +## _MultiscaleTestResize_ option + +`MultiscaleTestResize` option is used to enable multi scale test prediction. + +`origin_target_size: [800, 1333]` means the input image will be scaled to 800 (for short edge) and 1333 (max edge length cannot be greater than 1333) at first + +`target_size: [700 , 900]` property is used to specify different scales. + +It can be plugged into evaluation process or test (inference) process, by adding `MultiscaleTestResize` entry to `EvalReader.sample_transforms` or `TestReader.sample_transforms` + +--- + +###Note + +Now only CascadeRCNN, FasterRCNN and MaskRCNN are supported for multi scale testing. And batch size must be 1. 
\ No newline at end of file diff --git a/docs/tutorials/config_annotation/multi_scale_test_config_cn.md b/docs/tutorials/config_annotation/multi_scale_test_config_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..36851def51e7ae3a414b78df656100b5072685c0 --- /dev/null +++ b/docs/tutorials/config_annotation/multi_scale_test_config_cn.md @@ -0,0 +1,45 @@ +# 多尺度测试的配置 + +标签: 配置 + +--- +```yaml + +##################################### 多尺度测试的配置 ##################################### + +EvalReader: + sample_transforms: + - Decode: {} + - MultiscaleTestResize: {origin_target_size: [800, 1333], target_size: [700 , 900]} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + +TestReader: + sample_transforms: + - Decode: {} + - MultiscaleTestResize: {origin_target_size: [800, 1333], target_size: [700 , 900]} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} +``` + +--- + +多尺度测试是一种TTA方法(测试时增强),可以用于提高目标检测的准确率 + +输入图像首先被缩放为不同尺度的图像,然后模型对这些不同尺度的图像进行预测,最后将这些不同尺度上的预测结果整合为最终预测结果。(这里使用了**NMS**来整合不同尺度的预测结果) + +## _MultiscaleTestResize_ 选项 + +`MultiscaleTestResize` 选项用于开启多尺度测试. + +`origin_target_size: [800, 1333]` 项代表输入图像首先缩放为短边为800,最长边不超过1333. + +`target_size: [700 , 900]` 项设置不同的预测尺度。 + +通过在`EvalReader.sample_transforms`或`TestReader.sample_transforms`中设置`MultiscaleTestResize`项,可以在评估过程或预测过程中开启多尺度测试。 + +--- + +###注意 + +目前多尺度测试只支持CascadeRCNN, FasterRCNN and MaskRCNN网络, 并且batch size需要是1. \ No newline at end of file diff --git a/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation.md b/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation.md new file mode 100644 index 0000000000000000000000000000000000000000..2cbc188dc345c84ca619284baaf610d757cc3414 --- /dev/null +++ b/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation.md @@ -0,0 +1,266 @@ +# YOLO系列模型参数配置教程 + +标签: 模型参数配置 + +以`ppyolo_r50vd_dcn_1x_coco.yml`为例,这个模型由五个子配置文件组成: + +- 数据配置文件 `coco_detection.yml` + +```yaml +# 数据评估类型 +metric: COCO +# 数据集的类别数 +num_classes: 80 + +# TrainDataset +TrainDataset: + !COCODataSet + # 图像数据路径,相对 dataset_dir 路径,os.path.join(dataset_dir, image_dir) + image_dir: train2017 + # 标注文件路径,相对 dataset_dir 路径,os.path.join(dataset_dir, anno_path) + anno_path: annotations/instances_train2017.json + # 数据文件夹 + dataset_dir: dataset/coco + # data_fields + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # 图像数据路径,相对 dataset_dir 路径,os.path.join(dataset_dir, image_dir) + image_dir: val2017 + # 标注文件路径,相对 dataset_dir 路径,os.path.join(dataset_dir, anno_path) + anno_path: annotations/instances_val2017.json + # 数据文件夹,os.path.join(dataset_dir, anno_path) + dataset_dir: dataset/coco + +TestDataset: + !ImageFolder + # 标注文件路径,相对 dataset_dir 路径 + anno_path: annotations/instances_val2017.json +``` + +- 优化器配置文件 `optimizer_1x.yml` + +```yaml +# 总训练轮数 +epoch: 405 + +# 学习率设置 +LearningRate: + # 默认为8卡训学习率 + base_lr: 0.01 + # 学习率调整策略 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + # 学习率变化位置(轮数) + milestones: + - 243 + - 324 + # Warmup + - !LinearWarmup + start_factor: 0. 
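+    # warmup步数(按迭代次数计),在该步数内学习率由 base_lr * start_factor 线性增长到 base_lr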
+ steps: 4000 + +# 优化器 +OptimizerBuilder: + # 优化器 + optimizer: + momentum: 0.9 + type: Momentum + # 正则化 + regularizer: + factor: 0.0005 + type: L2 +``` + +- 数据读取配置文件 `ppyolo_reader.yml` + +```yaml +# 每张GPU reader进程个数 +worker_num: 2 +# 训练数据 +TrainReader: + inputs_def: + num_max_boxes: 50 + # 训练数据transforms + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + # batch_transforms + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + # 训练时batch_size + batch_size: 24 + # 读取数据是否乱序 + shuffle: true + # 是否丢弃最后不能完整组成batch的数据 + drop_last: true + # mixup_epoch,大于最大epoch,表示训练过程一直使用mixup数据增广 + mixup_epoch: 25000 + # 是否通过共享内存进行数据读取加速,需要保证共享内存大小(如/dev/shm)满足大于1G + use_shared_memory: true + +# 评估数据 +EvalReader: + # 评估数据transforms + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + # 评估时batch_size + batch_size: 8 + # 是否丢弃没有标注的数据 + drop_empty: false + +# 测试数据 +TestReader: + inputs_def: + image_shape: [3, 608, 608] + # 测试数据transforms + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + # 测试时batch_size + batch_size: 1 +``` + +- 模型配置文件 `ppyolo_r50vd_dcn.yml` + +```yaml +# 模型结构类型 +architecture: YOLOv3 +# 预训练模型地址 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +# norm_type +norm_type: sync_bn +# 是否使用ema +use_ema: true +# ema_decay +ema_decay: 0.9998 + +# YOLOv3 +YOLOv3: + # backbone + backbone: ResNet + # neck + neck: PPYOLOFPN + # yolo_head + yolo_head: YOLOv3Head + # post_process + post_process: BBoxPostProcess + + +# backbone +ResNet: + # depth + depth: 50 + # variant + variant: d + # return_idx, 0 represent res2 + return_idx: [1, 2, 3] + # dcn_v2_stages + dcn_v2_stages: [3] + # freeze_at + freeze_at: -1 + # freeze_norm + freeze_norm: false + # norm_decay + norm_decay: 0. 
+ +# PPYOLOFPN +PPYOLOFPN: + # 是否coord_conv + coord_conv: true + # 是否drop_block + drop_block: true + # block_size + block_size: 3 + # keep_prob + keep_prob: 0.9 + # 是否spp + spp: true + +# YOLOv3Head +YOLOv3Head: + # anchors + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + # anchor_masks + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + # loss + loss: YOLOv3Loss + # 是否使用iou_aware + iou_aware: true + # iou_aware_factor + iou_aware_factor: 0.4 + +# YOLOv3Loss +YOLOv3Loss: + # ignore_thresh + ignore_thresh: 0.7 + # downsample + downsample: [32, 16, 8] + # 是否label_smooth + label_smooth: false + # scale_x_y + scale_x_y: 1.05 + # iou_loss + iou_loss: IouLoss + # iou_aware_loss + iou_aware_loss: IouAwareLoss + +# IouLoss +IouLoss: + loss_weight: 2.5 + loss_square: true + +# IouAwareLoss +IouAwareLoss: + loss_weight: 1.0 + +# BBoxPostProcess +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.01 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + # nms 配置 + nms: + name: MatrixNMS + keep_top_k: 100 + score_threshold: 0.01 + post_threshold: 0.01 + nms_top_k: -1 + background_label: -1 + +``` + +- 运行时置文件 `runtime.yml` + +```yaml +# 是否使用gpu +use_gpu: true +# 日志打印间隔 +log_iter: 20 +# save_dir +save_dir: output +# 模型保存间隔时间 +snapshot_epoch: 1 +``` diff --git a/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation_en.md b/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation_en.md new file mode 100644 index 0000000000000000000000000000000000000000..420ce684dcbbea2d9c8c3accdbf42c2fc24ddb24 --- /dev/null +++ b/docs/tutorials/config_annotation/ppyolo_r50vd_dcn_1x_coco_annotation_en.md @@ -0,0 +1,266 @@ +# YOLO series model parameter configuration tutorial + +Tag: Model parameter configuration + +Take `ppyolo_r50vd_dcn_1x_coco.yml` as an example, The model consists of five sub-profiles: + +- Data profile `coco_detection.yml` + +```yaml +# Data evaluation type +metric: COCO +# The number of categories in the dataset +num_classes: 80 + +# TrainDataset +TrainDataset: + !COCODataSet + # Image data path, Relative path of dataset_dir, os.path.join(dataset_dir, image_dir) + image_dir: train2017 + # Annotation file path, Relative path of dataset_dir, os.path.join(dataset_dir, anno_path) + anno_path: annotations/instances_train2017.json + # data file + dataset_dir: dataset/coco + # data_fields + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # Image data path, Relative path of dataset_dir, os.path.join(dataset_dir, image_dir) + image_dir: val2017 + # Annotation file path, Relative path of dataset_dir, os.path.join(dataset_dir, anno_path) + anno_path: annotations/instances_val2017.json + # data file os.path.join(dataset_dir, anno_path) + dataset_dir: dataset/coco + +TestDataset: + !ImageFolder + # Annotation file path, Relative path of dataset_dir, os.path.join(dataset_dir, anno_path) + anno_path: annotations/instances_val2017.json +``` + +- Optimizer configuration file `optimizer_1x.yml` + +```yaml +# Total training epoches +epoch: 405 + +# learning rate setting +LearningRate: + # Default is 8 Gpus training learning rate + base_lr: 0.01 + # Learning rate adjustment strategy + schedulers: + - !PiecewiseDecay + gamma: 0.1 + # Position of change in learning rate (number of epoches) + milestones: + - 243 + - 324 + # Warmup + - !LinearWarmup + start_factor: 0. 
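+    # Number of warmup steps (counted in iterations); within these steps the learning rate grows linearly from base_lr * start_factor to base_lr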
+ steps: 4000 + +# Optimizer +OptimizerBuilder: + # Optimizer + optimizer: + momentum: 0.9 + type: Momentum + # Regularization + regularizer: + factor: 0.0005 + type: L2 +``` + +- Data reads configuration files `ppyolo_reader.yml` + +```yaml +# Number of PROCESSES per GPU Reader +worker_num: 2 +# training data +TrainReader: + inputs_def: + num_max_boxes: 50 + # Training data transforms + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + # batch_transforms + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + # Batch size during training + batch_size: 24 + # Read data is out of order + shuffle: true + # Whether to discard data that does not complete the batch + drop_last: true + # mixup_epoch,Greater than maximum epoch, Indicates that the training process has been augmented with mixup data + mixup_epoch: 25000 + # Whether to use the shared memory to accelerate data reading, ensure that the shared memory size (such as /dev/shm) is greater than 1 GB + use_shared_memory: true + +# Evaluate data +EvalReader: + # Evaluating data transforms + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + # Batch_size during evaluation + batch_size: 8 + # Whether to discard unlabeled data + drop_empty: false + +# test data +TestReader: + inputs_def: + image_shape: [3, 608, 608] + # test data transforms + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + # batch_size during training + batch_size: 1 +``` + +- Model profile `ppyolo_r50vd_dcn.yml` + +```yaml +# Model structure type +architecture: YOLOv3 +# Pretrain model address +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +# norm_type +norm_type: sync_bn +# Whether to use EMA +use_ema: true +# ema_decay +ema_decay: 0.9998 + +# YOLOv3 +YOLOv3: + # backbone + backbone: ResNet + # neck + neck: PPYOLOFPN + # yolo_head + yolo_head: YOLOv3Head + # post_process + post_process: BBoxPostProcess + + +# backbone +ResNet: + # depth + depth: 50 + # variant + variant: d + # return_idx, 0 represent res2 + return_idx: [1, 2, 3] + # dcn_v2_stages + dcn_v2_stages: [3] + # freeze_at + freeze_at: -1 + # freeze_norm + freeze_norm: false + # norm_decay + norm_decay: 0. 
+ +# PPYOLOFPN +PPYOLOFPN: + # whether coord_conv or not + coord_conv: true + # whether drop_block or not + drop_block: true + # block_size + block_size: 3 + # keep_prob + keep_prob: 0.9 + # whether spp or not + spp: true + +# YOLOv3Head +YOLOv3Head: + # anchors + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + # anchor_masks + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + # loss + loss: YOLOv3Loss + # whether to use iou_aware + iou_aware: true + # iou_aware_factor + iou_aware_factor: 0.4 + +# YOLOv3Loss +YOLOv3Loss: + # ignore_thresh + ignore_thresh: 0.7 + # downsample + downsample: [32, 16, 8] + # whether label_smooth or not + label_smooth: false + # scale_x_y + scale_x_y: 1.05 + # iou_loss + iou_loss: IouLoss + # iou_aware_loss + iou_aware_loss: IouAwareLoss + +# IouLoss +IouLoss: + loss_weight: 2.5 + loss_square: true + +# IouAwareLoss +IouAwareLoss: + loss_weight: 1.0 + +# BBoxPostProcess +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.01 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + # nms setting + nms: + name: MatrixNMS + keep_top_k: 100 + score_threshold: 0.01 + post_threshold: 0.01 + nms_top_k: -1 + background_label: -1 + +``` + +- Runtime file `runtime.yml` + +```yaml +# Whether to use gpu +use_gpu: true +# Log Printing interval +log_iter: 20 +# save_dir +save_dir: output +# Model save interval +snapshot_epoch: 1 +``` diff --git a/docs/tutorials/data/DetAnnoTools.md b/docs/tutorials/data/DetAnnoTools.md new file mode 100644 index 0000000000000000000000000000000000000000..136c0c8c3133004f2da19906f0a603ba390d7169 --- /dev/null +++ b/docs/tutorials/data/DetAnnoTools.md @@ -0,0 +1,278 @@ +简体中文 | [English](DetAnnoTools_en.md) + + + +# 目标检测标注工具 + +## 目录 + +[LabelMe](#LabelMe) + +* [使用说明](#使用说明) + * [安装](#LabelMe安装) + * [图片标注过程](#LabelMe图片标注过程) +* [标注格式](#LabelMe标注格式) + * [导出数据格式](#LabelMe导出数据格式) + * [格式转化总结](#格式转化总结) + * [标注文件(json)-->VOC](#标注文件(json)-->VOC数据集) + * [标注文件(json)-->COCO](#标注文件(json)-->COCO数据集) + +[LabelImg](#LabelImg) + +* [使用说明](#使用说明) + * [LabelImg安装](#LabelImg安装) + * [安装注意事项](#安装注意事项) + * [图片标注过程](#LabelImg图片标注过程) +* [标注格式](#LabelImg标注格式) + * [导出数据格式](#LabelImg导出数据格式) + * [格式转换注意事项](#格式转换注意事项) + + + +## [LabelMe](https://github.com/wkentaro/labelme) + +### 使用说明 + +#### LabelMe安装 + +具体安装操作请参考[LabelMe官方教程](https://github.com/wkentaro/labelme)中的Installation + +
+ Ubuntu + +``` +sudo apt-get install labelme + +# or +sudo pip3 install labelme + +# or install standalone executable from: +# https://github.com/wkentaro/labelme/releases +``` + +
+ +
+ macOS + +``` +brew install pyqt # maybe pyqt5 +pip install labelme + +# or +brew install wkentaro/labelme/labelme # command line interface +# brew install --cask wkentaro/labelme/labelme # app + +# or install standalone executable/app from: +# https://github.com/wkentaro/labelme/releases +``` + +
+ + + +推荐使用Anaconda的安装方式 + +``` +conda create –name=labelme python=3 +conda activate labelme +pip install pyqt5 +pip install labelme +``` + + + + + +#### LabelMe图片标注过程 + +启动labelme后,选择图片文件或者图片所在文件夹 + +左侧编辑栏选择`create polygons` 绘制标注区域如下图所示(右击图像区域可以选择不同的标注形状),绘制好区域后按下回车,弹出新的框填入标注区域对应的标签,如:people + +左侧菜单栏点击保存,生成`json`形式的**标注文件** + +![](https://media3.giphy.com/media/XdnHZgge5eynRK3ATK/giphy.gif?cid=790b7611192e4c0ec2b5e6990b6b0f65623154ffda66b122&rid=giphy.gif&ct=g) + + + +### LabelMe标注格式 + +#### LabelMe导出数据格式 + +``` +#生成标注文件 +png/jpeg/jpg-->labelme标注-->json +``` + + + + + +#### 格式转化总结 + +``` +#标注文件转化为VOC数据集格式 +json-->labelme2voc.py-->VOC数据集 + +#标注文件转化为COCO数据集格式 +json-->labelme2coco.py-->COCO数据集 +``` + + + + + +#### 标注文件(json)-->VOC数据集 + +使用[官方给出的labelme2voc.py](https://github.com/wkentaro/labelme/blob/main/examples/bbox_detection/labelme2voc.py)这份脚本 + +下载该脚本,在命令行中使用 + +```Te +python labelme2voc.py data_annotated(标注文件所在文件夹) data_dataset_voc(输出文件夹) --labels labels.txt +``` + +运行后,在指定的输出文件夹中会如下的目录 + +``` +# It generates: +# - data_dataset_voc/JPEGImages +# - data_dataset_voc/Annotations +# - data_dataset_voc/AnnotationsVisualization + +``` + + + + + +#### 标注文件(json)-->COCO数据集 + +使用[PaddleDetection提供的x2coco.py](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/tools/x2coco.py) 将labelme标注的数据转换为COCO数据集形式 + +```bash +python tools/x2coco.py \ + --dataset_type labelme \ + --json_input_dir ./labelme_annos/ \ + --image_input_dir ./labelme_imgs/ \ + --output_dir ./cocome/ \ + --train_proportion 0.8 \ + --val_proportion 0.2 \ + --test_proportion 0.0 +``` + +用户数据集转成COCO数据后目录结构如下(注意数据集中路径名、文件名尽量不要使用中文,避免中文编码问题导致出错): + +``` +dataset/xxx/ +├── annotations +│ ├── train.json # coco数据的标注文件 +│ ├── valid.json # coco数据的标注文件 +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +... +``` + + + + + +## [LabelImg](https://github.com/tzutalin/labelImg) + +### 使用说明 + +#### LabelImg安装 + +安装操作请参考[LabelImg官方教程](https://github.com/tzutalin/labelImg) + +
+ Ubuntu + +``` +sudo apt-get install pyqt5-dev-tools +sudo pip3 install -r requirements/requirements-linux-python3.txt +make qt5py3 +python3 labelImg.py +python3 labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + +
+ +
+macOS + +``` +brew install qt # Install qt-5.x.x by Homebrew +brew install libxml2 + +or using pip + +pip3 install pyqt5 lxml # Install qt and lxml by pip + +make qt5py3 +python3 labelImg.py +python3 labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + +
+ + + +推荐使用Anaconda的安装方式 + + 首先下载并进入 [labelImg](https://github.com/tzutalin/labelImg#labelimg) 的目录 + +``` +conda install pyqt=5 +conda install -c anaconda lxml +pyrcc5 -o libs/resources.py resources.qrc +python labelImg.py +python labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + + + + + +#### 安装注意事项 + +以Anaconda安装方式为例,比Labelme配置要麻烦一些 + +启动方式是通过python运行脚本`python labelImg.py <图片路径>` + + + +#### LabelImg图片标注过程 + +启动labelImg后,选择图片文件或者图片所在文件夹 + +左侧编辑栏选择`创建区块` 绘制标注区,在弹出新的框选择对应的标签 + +左侧菜单栏点击保存,可以选择VOC/YOLO/CreateML三种类型的标注文件 + + + +![](https://user-images.githubusercontent.com/34162360/177526022-fd9c63d8-e476-4b63-ae02-76d032bb7656.gif) + + + + + +### LabelImg标注格式 + +#### LabelImg导出数据格式 + +``` +#生成标注文件 +png/jpeg/jpg-->labelImg标注-->xml/txt/json +``` + + + +#### 格式转换注意事项 + +**PaddleDetection支持VOC或COCO格式的数据**,经LabelImg标注导出后的标注文件,需要修改为**VOC或COCO格式**,调整说明可以参考[准备训练数据](./PrepareDataSet.md#%E5%87%86%E5%A4%87%E8%AE%AD%E7%BB%83%E6%95%B0%E6%8D%AE) diff --git a/docs/tutorials/data/DetAnnoTools_en.md b/docs/tutorials/data/DetAnnoTools_en.md new file mode 100644 index 0000000000000000000000000000000000000000..6d356a4ad493b191478f0ff5b7a9f4a9b5c9cac2 --- /dev/null +++ b/docs/tutorials/data/DetAnnoTools_en.md @@ -0,0 +1,270 @@ +[简体中文](DetAnnoTools.md) | English + + + +# Object Detection Annotation Tools + +## Concents + +[LabelMe](#LabelMe) + +* [Instruction](#Instruction-of-LabelMe) + * [Installation](#Installation) + * [Annotation of Images](#Annotation-of-images-in-LabelMe) +* [Annotation Format](#Annotation-Format-of-LabelMe) + * [Export Format](#Export-Format-of-LabelMe) + * [Summary of Format Conversion](#Summary-of-Format-Conversion) + * [Annotation file(json)—>VOC Dataset](#annotation-filejsonvoc-dataset) + * [Annotation file(json)—>COCO Dataset](#annotation-filejsoncoco-dataset) + +[LabelImg](#LabelImg) + +* [Instruction](#Instruction-of-LabelImg) + * [Installation](#Installation-of-LabelImg) + * [Installation Notes](#Installation-Notes) + * [Annotation of images](#Annotation-of-images-in-LabelImg) +* [Annotation Format](#Annotation-Format-of-LabelImg) + * [Export Format](#Export-Format-of-LabelImg) + * [Notes of Format Conversion](#Notes-of-Format-Conversion) + + + +## [LabelMe](https://github.com/wkentaro/labelme) + +### Instruction of LabelMe + +#### Installation + +Please refer to [The github of LabelMe](https://github.com/wkentaro/labelme) for installation details. + +
+ Ubuntu + +``` +sudo apt-get install labelme + +# or +sudo pip3 install labelme + +# or install standalone executable from: +# https://github.com/wkentaro/labelme/releases +``` + +
+ +
+ macOS + +``` +brew install pyqt # maybe pyqt5 +pip install labelme + +# or +brew install wkentaro/labelme/labelme # command line interface +# brew install --cask wkentaro/labelme/labelme # app + +# or install standalone executable/app from: +# https://github.com/wkentaro/labelme/releases +``` + +
+ + + +We recommend installing by Anoncanda. + +``` +conda create –name=labelme python=3 +conda activate labelme +pip install pyqt5 +pip install labelme +``` + + + + + +#### Annotation of Images in LabelMe + +After starting labelme, select an image or an folder with images. + +Select `create polygons` in the formula bar. Draw an annotation area as shown in the following GIF. You can right-click on the image to select different shape. When finished, press the Enter/Return key, then fill the corresponding label in the popup box, such as, people. + +Click the save button in the formula bar,it will generate an annotation file in json. + +![](https://media3.giphy.com/media/XdnHZgge5eynRK3ATK/giphy.gif?cid=790b7611192e4c0ec2b5e6990b6b0f65623154ffda66b122&rid=giphy.gif&ct=g) + + + +### Annotation Format of LabelMe + +#### Export Format of LabelMe + +``` +#generate an annotation file +png/jpeg/jpg-->labelme-->json +``` + + + + + +#### Summary of Format Conversion + +``` +#convert annotation file to VOC dataset format +json-->labelme2voc.py-->VOC dataset + +#convert annotation file to COCO dataset format +json-->labelme2coco.py-->COCO dataset +``` + + + + + +#### Annotation file(json)—>VOC Dataset + +Use this script [labelme2voc.py](https://github.com/wkentaro/labelme/blob/main/examples/bbox_detection/labelme2voc.py) in command line. + +```Te +python labelme2voc.py data_annotated(annotation folder) data_dataset_voc(output folder) --labels labels.txt +``` + +Then, it will generate following contents: + +``` +# It generates: +# - data_dataset_voc/JPEGImages +# - data_dataset_voc/Annotations +# - data_dataset_voc/AnnotationsVisualization + +``` + + + + + +#### Annotation file(json)—>COCO Dataset + +Convert the data annotated by LabelMe to COCO dataset by the script [x2coco.py](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/tools/x2coco.py) provided by PaddleDetection. + +```bash +python tools/x2coco.py \ + --dataset_type labelme \ + --json_input_dir ./labelme_annos/ \ + --image_input_dir ./labelme_imgs/ \ + --output_dir ./cocome/ \ + --train_proportion 0.8 \ + --val_proportion 0.2 \ + --test_proportion 0.0 +``` + +After the user dataset is converted to COCO data, the directory structure is as follows (Try to avoid use Chinese for the path name in case of errors caused by Chinese coding problems): + +``` +dataset/xxx/ +├── annotations +│ ├── train.json # Annotation file of coco data +│ ├── valid.json # Annotation file of coco data +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +... +``` + + + + + +## [LabelImg](https://github.com/tzutalin/labelImg) + +### Instruction + +#### Installation of LabelImg + +Please refer to [The github of LabelImg](https://github.com/tzutalin/labelImg) for installation details. + +
+ Ubuntu + +``` +sudo apt-get install pyqt5-dev-tools +sudo pip3 install -r requirements/requirements-linux-python3.txt +make qt5py3 +python3 labelImg.py +python3 labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + +
+ +
+macOS + +``` +brew install qt # Install qt-5.x.x by Homebrew +brew install libxml2 + +or using pip + +pip3 install pyqt5 lxml # Install qt and lxml by pip + +make qt5py3 +python3 labelImg.py +python3 labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + +
+ + + +We recommend installing by Anoncanda. + +Download and go to the folder of [labelImg](https://github.com/tzutalin/labelImg#labelimg) + +``` +conda install pyqt=5 +conda install -c anaconda lxml +pyrcc5 -o libs/resources.py resources.qrc +python labelImg.py +python labelImg.py [IMAGE_PATH] [PRE-DEFINED CLASS FILE] +``` + + + + + +#### Installation Notes + +Use python scripts to startup LabelImg: `python labelImg.py ` + +#### Annotation of images in LabelImg + +After the startup of LabelImg, select an image or a folder with images. + +Select `Create RectBox` in the formula bar. Draw an annotation area as shown in the following GIF. When finished, select corresponding label in the popup box. Then save the annotated file in three forms: VOC/YOLO/CreateML. + + + +![](https://user-images.githubusercontent.com/34162360/177526022-fd9c63d8-e476-4b63-ae02-76d032bb7656.gif) + + + + + +### Annotation Format of LabelImg + +#### Export Format of LabelImg + +``` +#generate annotation files +png/jpeg/jpg-->labelImg-->xml/txt/json +``` + + + +#### Notes of Format Conversion + +**PaddleDetection supports the format of VOC or COCO.** The annotation file generated by LabelImg needs to be converted by VOC or COCO. You can refer to [PrepareDataSet](./PrepareDataSet.md#%E5%87%86%E5%A4%87%E8%AE%AD%E7%BB%83%E6%95%B0%E6%8D%AE). diff --git a/docs/tutorials/data/PrepareDetDataSet.md b/docs/tutorials/data/PrepareDetDataSet.md new file mode 100644 index 0000000000000000000000000000000000000000..f956c170bd4926b18acb58d3f5e88739536e4b78 --- /dev/null +++ b/docs/tutorials/data/PrepareDetDataSet.md @@ -0,0 +1,497 @@ +# 目标检测数据准备 +## 目录 +- [目标检测数据说明](#目标检测数据说明) +- [准备训练数据](#准备训练数据) + - [VOC数据数据](#VOC数据数据) + - [VOC数据集下载](#VOC数据集下载) + - [VOC数据标注文件介绍](#VOC数据标注文件介绍) + - [COCO数据数据](#COCO数据数据) + - [COCO数据集下载](#COCO数据下载) + - [COCO数据标注文件介绍](#COCO数据标注文件介绍) + - [用户数据准备](#用户数据准备) + - [用户数据转成VOC数据](#用户数据转成VOC数据) + - [用户数据转成COCO数据](#用户数据转成COCO数据) + - [用户数据自定义reader](#用户数据自定义reader) + - [用户数据使用示例](#用户数据使用示例) + - [数据格式转换](#数据格式转换) + - [自定义数据训练](#自定义数据训练) +- [(可选)生成Anchor](#(可选)生成Anchor) + +### 目标检测数据说明 + +目标检测的数据比分类复杂,一张图像中,需要标记出各个目标区域的位置和类别。 + +一般的目标区域位置用一个矩形框来表示,一般用以下3种方式表达: + +| 表达方式 | 说明 | +| :----------------: | :--------------------------------: | +| x1,y1,x2,y2 | (x1,y1)为左上角坐标,(x2,y2)为右下角坐标 | +| x1,y1,w,h | (x1,y1)为左上角坐标,w为目标区域宽度,h为目标区域高度 | +| xc,yc,w,h | (xc,yc)为目标区域中心坐标,w为目标区域宽度,h为目标区域高度 | + +常见的目标检测数据集如Pascal VOC采用的`[x1,y1,x2,y2]` 表示物体的bounding box, [COCO](https://cocodataset.org/#format-data)采用的`[x1,y1,w,h]` 表示物体的bounding box. + +### 准备训练数据 + +PaddleDetection默认支持[COCO](http://cocodataset.org)和[Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) 和[WIDER-FACE](http://shuoyang1213.me/WIDERFACE/) 数据源。 +同时还支持自定义数据源,包括: + +(1) 自定义数据数据转换成VOC数据; +(2) 自定义数据数据转换成COCO数据; +(3) 自定义新的数据源,增加自定义的reader。 + + +首先进入到`PaddleDetection`根目录下 +``` +cd PaddleDetection/ +ppdet_root=$(pwd) +``` + +#### VOC数据数据 + +VOC数据是[Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) 比赛使用的数据。Pascal VOC比赛不仅包含图像分类分类任务,还包含图像目标检测、图像分割等任务,其标注文件中包含多个任务的标注内容。 +VOC数据集指的是Pascal VOC比赛使用的数据。用户自定义的VOC数据,xml文件中的非必须字段,请根据实际情况选择是否标注或是否使用默认值。 + +##### VOC数据集下载 + +- 通过代码自动化下载VOC数据集,数据集较大,下载需要较长时间 + + ``` + # 执行代码自动化下载VOC数据集 + python dataset/voc/download_voc.py + ``` + + 代码执行完成后VOC数据集文件组织结构为: + ``` + >>cd dataset/voc/ + >>tree + ├── create_list.py + ├── download_voc.py + ├── generic_det_label_list.txt + ├── generic_det_label_list_zh.txt + ├── label_list.txt + ├── VOCdevkit/VOC2007 + │ ├── annotations + │ ├── 001789.xml + │ | ... + │ ├── JPEGImages + │ ├── 001789.jpg + │ | ... 
+ │ ├── ImageSets + │ | ... + ├── VOCdevkit/VOC2012 + │ ├── Annotations + │ ├── 2011_003876.xml + │ | ... + │ ├── JPEGImages + │ ├── 2011_003876.jpg + │ | ... + │ ├── ImageSets + │ | ... + | ... + ``` + + 各个文件说明 + ``` + # label_list.txt 是类别名称列表,文件名必须是 label_list.txt。若使用VOC数据集,config文件中use_default_label为true时不需要这个文件 + >>cat label_list.txt + aeroplane + bicycle + ... + + # trainval.txt 是训练数据集文件列表 + >>cat trainval.txt + VOCdevkit/VOC2007/JPEGImages/007276.jpg VOCdevkit/VOC2007/Annotations/007276.xml + VOCdevkit/VOC2012/JPEGImages/2011_002612.jpg VOCdevkit/VOC2012/Annotations/2011_002612.xml + ... + + # test.txt 是测试数据集文件列表 + >>cat test.txt + VOCdevkit/VOC2007/JPEGImages/000001.jpg VOCdevkit/VOC2007/Annotations/000001.xml + ... + + # label_list.txt voc 类别名称列表 + >>cat label_list.txt + + aeroplane + bicycle + ... + ``` +- 已下载VOC数据集 + 按照如上数据文件组织结构组织文件即可。 + +##### VOC数据标注文件介绍 + +VOC数据是每个图像文件对应一个同名的xml文件,xml文件中标记物体框的坐标和类别等信息。例如图像`2007_002055.jpg`: +![](../images/2007_002055.jpg) + +图片对应的xml文件内包含对应图片的基本信息,比如文件名、来源、图像尺寸以及图像中包含的物体区域信息和类别信息等。 + +xml文件中包含以下字段: +- filename,表示图像名称。 +- size,表示图像尺寸。包括:图像宽度、图像高度、图像深度。 + ``` + + 500 + 375 + 3 + + ``` +- object字段,表示每个物体。包括: + + | 标签 | 说明 | + | :--------: | :-----------: | + | name | 物体类别名称 | + | pose | 关于目标物体姿态描述(非必须字段) | + | truncated | 如果物体的遮挡超过15-20%并且位于边界框之外,请标记为`truncated`(非必须字段) | + | difficult | 难以识别的物体标记为`difficult`(非必须字段) | + | bndbox子标签 | (xmin,ymin) 左上角坐标,(xmax,ymax) 右下角坐标, | + + +#### COCO数据 +COCO数据是[COCO](http://cocodataset.org) 比赛使用的数据。同样的,COCO比赛数也包含多个比赛任务,其标注文件中包含多个任务的标注内容。 +COCO数据集指的是COCO比赛使用的数据。用户自定义的COCO数据,json文件中的一些字段,请根据实际情况选择是否标注或是否使用默认值。 + + +##### COCO数据下载 +- 通过代码自动化下载COCO数据集,数据集较大,下载需要较长时间 + + ``` + # 执行代码自动化下载COCO数据集 + python dataset/coco/download_coco.py + ``` + + 代码执行完成后COCO数据集文件组织结构为: + ``` + >>cd dataset/coco/ + >>tree + ├── annotations + │ ├── instances_train2017.json + │ ├── instances_val2017.json + │ | ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + │ | ... + ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + │ | ... + | ... 
+ ``` +- 已下载COCO数据集 + 按照如上数据文件组织结构组织文件即可。 + +##### COCO数据标注介绍 +COCO数据标注是将所有训练图像的标注都存放到一个json文件中。数据以字典嵌套的形式存放。 + +json文件中包含以下key: +- info,表示标注文件info。 +- licenses,表示标注文件licenses。 +- images,表示标注文件中图像信息列表,每个元素是一张图像的信息。如下为其中一张图像的信息: + ``` + { + 'license': 3, # license + 'file_name': '000000391895.jpg', # file_name + # coco_url + 'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg', + 'height': 360, # image height + 'width': 640, # image width + 'date_captured': '2013-11-14 11:18:45', # date_captured + # flickr_url + 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', + 'id': 391895 # image id + } + ``` +- annotations,表示标注文件中目标物体的标注信息列表,每个元素是一个目标物体的标注信息。如下为其中一个目标物体的标注信息: + ``` + { + + 'segmentation': # 物体的分割标注 + 'area': 2765.1486500000005, # 物体的区域面积 + 'iscrowd': 0, # iscrowd + 'image_id': 558840, # image id + 'bbox': [199.84, 200.46, 77.71, 70.88], # bbox [x1,y1,w,h] + 'category_id': 58, # category_id + 'id': 156 # image id + } + ``` + + ``` + # 查看COCO标注文件 + import json + coco_anno = json.load(open('./annotations/instances_train2017.json')) + + # coco_anno.keys + print('\nkeys:', coco_anno.keys()) + + # 查看类别信息 + print('\n物体类别:', coco_anno['categories']) + + # 查看一共多少张图 + print('\n图像数量:', len(coco_anno['images'])) + + # 查看一共多少个目标物体 + print('\n标注物体数量:', len(coco_anno['annotations'])) + + # 查看一条目标物体标注信息 + print('\n查看一条目标物体标注信息:', coco_anno['annotations'][0]) + ``` + +#### 用户数据准备 +对于用户数据有3种处理方法: +(1) 将用户数据转成VOC数据(根据需要仅包含物体检测所必须的标签即可) +(2) 将用户数据转成COCO数据(根据需要仅包含物体检测所必须的标签即可) +(3) 自定义一个用户数据的reader(较复杂数据,需要自定义reader) + +##### 用户数据转成VOC数据 +用户数据集转成VOC数据后目录结构如下(注意数据集中路径名、文件名尽量不要使用中文,避免中文编码问题导致出错): + +``` +dataset/xxx/ +├── annotations +│ ├── xxx1.xml +│ ├── xxx2.xml +│ ├── xxx3.xml +│ | ... +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +├── label_list.txt (必须提供,且文件名称必须是label_list.txt ) +├── train.txt (训练数据集文件列表, ./images/xxx1.jpg ./annotations/xxx1.xml) +└── valid.txt (测试数据集文件列表) +``` + +各个文件说明 +``` +# label_list.txt 是类别名称列表,改文件名必须是这个 +>>cat label_list.txt +classname1 +classname2 +... + +# train.txt 是训练数据文件列表 +>>cat train.txt +./images/xxx1.jpg ./annotations/xxx1.xml +./images/xxx2.jpg ./annotations/xxx2.xml +... + +# valid.txt 是验证数据文件列表 +>>cat valid.txt +./images/xxx3.jpg ./annotations/xxx3.xml +... +``` + +##### 用户数据转成COCO数据 +在`./tools/`中提供了`x2coco.py`用于将VOC数据集、labelme标注的数据集或cityscape数据集转换为COCO数据,例如: + +(1)labelme数据转换为COCO数据: +```bash +python tools/x2coco.py \ + --dataset_type labelme \ + --json_input_dir ./labelme_annos/ \ + --image_input_dir ./labelme_imgs/ \ + --output_dir ./cocome/ \ + --train_proportion 0.8 \ + --val_proportion 0.2 \ + --test_proportion 0.0 +``` +(2)voc数据转换为COCO数据: +```bash +python tools/x2coco.py \ + --dataset_type voc \ + --voc_anno_dir path/to/VOCdevkit/VOC2007/Annotations/ \ + --voc_anno_list path/to/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt \ + --voc_label_list dataset/voc/label_list.txt \ + --voc_out_name voc_train.json +``` + +用户数据集转成COCO数据后目录结构如下(注意数据集中路径名、文件名尽量不要使用中文,避免中文编码问题导致出错): +``` +dataset/xxx/ +├── annotations +│ ├── train.json # coco数据的标注文件 +│ ├── valid.json # coco数据的标注文件 +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +... 
+``` + +##### 用户数据自定义reader +如果数据集有新的数据需要添加进PaddleDetection中,您可参考数据处理文档中的[添加新数据源](../advanced_tutorials/READER.md#2.3自定义数据集)文档部分,开发相应代码完成新的数据源支持,同时数据处理具体代码解析等可阅读[数据处理文档](../advanced_tutorials/READER.md)。 + + +#### 用户数据使用示例 + +以[Kaggle数据集](https://www.kaggle.com/andrewmvd/road-sign-detection) 比赛数据为例,说明如何准备自定义数据。 +Kaggle上的 [road-sign-detection](https://www.kaggle.com/andrewmvd/road-sign-detection) 比赛数据包含877张图像,数据类别4类:crosswalk,speedlimit,stop,trafficlight。 +可从Kaggle上下载,也可以从[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar) 下载。 +路标数据集示例图: +![](../images/road554.png) + +``` +# 下载解压数据 +>>cd $(ppdet_root)/dataset +# 下载kaggle数据集并解压,当前文件组织结构如下 + +├── annotations +│ ├── road0.xml +│ ├── road1.xml +│ ├── road10.xml +│ | ... +├── images +│ ├── road0.jpg +│ ├── road1.jpg +│ ├── road2.jpg +│ | ... +``` + +#### 数据格式转换 + +将数据划分为训练集和测试集 +``` +# 生成 label_list.txt 文件 +>>echo -e "speedlimit\ncrosswalk\ntrafficlight\nstop" > label_list.txt + +# 生成 train.txt、valid.txt和test.txt列表文件 +>>ls images/*.png | shuf > all_image_list.txt +>>awk -F"/" '{print $2}' all_image_list.txt | awk -F".png" '{print $1}' | awk -F"\t" '{print "images/"$1".png annotations/"$1".xml"}' > all_list.txt + +# 训练集、验证集、测试集比例分别约80%、10%、10%。 +>>head -n 88 all_list.txt > test.txt +>>head -n 176 all_list.txt | tail -n 88 > valid.txt +>>tail -n 701 all_list.txt > train.txt + +# 删除不用文件 +>>rm -rf all_image_list.txt all_list.txt + +最终数据集文件组织结构为: + +├── annotations +│ ├── road0.xml +│ ├── road1.xml +│ ├── road10.xml +│ | ... +├── images +│ ├── road0.jpg +│ ├── road1.jpg +│ ├── road2.jpg +│ | ... +├── label_list.txt +├── test.txt +├── train.txt +└── valid.txt + +# label_list.txt 是类别名称列表,文件名必须是 label_list.txt +>>cat label_list.txt +crosswalk +speedlimit +stop +trafficlight + +# train.txt 是训练数据集文件列表,每一行是一张图像路径和对应标注文件路径,以空格分开。注意这里的路径是数据集文件夹内的相对路径。 +>>cat train.txt +./images/road839.png ./annotations/road839.xml +./images/road363.png ./annotations/road363.xml +... 
+ +# valid.txt 是验证数据集文件列表,每一行是一张图像路径和对应标注文件路径,以空格分开。注意这里的路径是数据集文件夹内的相对路径。 +>>cat valid.txt +./images/road218.png ./annotations/road218.xml +./images/road681.png ./annotations/road681.xml +``` + +也可以下载准备好的数据[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar) ,解压到`dataset/roadsign_voc/`文件夹下即可。 +准备好数据后,一般的我们要对数据有所了解,比如图像量,图像尺寸,每一类目标区域个数,目标区域大小等。如有必要,还要对数据进行清洗。 +roadsign数据集统计: + +| 数据 | 图片数量 | +| :--------: | :-----------: | +| train | 701 | +| valid | 176 | + +**说明:** +(1)用户数据,建议在训练前仔细检查数据,避免因数据标注格式错误或图像数据不完整造成训练过程中的crash +(2)如果图像尺寸太大的话,在不限制读入数据尺寸情况下,占用内存较多,会造成内存/显存溢出,请合理设置batch_size,可从小到大尝试 + +#### 自定义数据训练 + +数据准备完成后,需要修改PaddleDetection中关于Dataset的配置文件,在`configs/datasets`文件夹下。比如roadsign数据集的配置文件如下: +``` +metric: VOC # 目前支持COCO, VOC, WiderFace等评估标准 +num_classes: 4 # 数据集的类别数,不包含背景类,roadsign数据集为4类,其他数据需要修改为自己的数据类别 + +TrainDataset: + !VOCDataSet + dataset_dir: dataset/roadsign_voc # 训练集的图片所在文件相对于dataset_dir的路径 + anno_path: train.txt # 训练集的标注文件相对于dataset_dir的路径 + label_list: label_list.txt # 数据集所在路径,相对于PaddleDetection路径 + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] # 控制dataset输出的sample所包含的字段,注意此为训练集Reader独有的且必须配置的字段 + +EvalDataset: + !VOCDataSet + dataset_dir: dataset/roadsign_voc # 数据集所在路径,相对于PaddleDetection路径 + anno_path: valid.txt # 验证集的标注文件相对于dataset_dir的路径 + label_list: label_list.txt # 标签文件,相对于dataset_dir的路径 + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + !ImageFolder + anno_path: label_list.txt # 标注文件所在路径,仅用于读取数据集的类别信息,支持json和txt格式 + dataset_dir: dataset/roadsign_voc # 数据集所在路径,若添加了此行,则`anno_path`路径为相对于`dataset_dir`路径,若此行不设置或去掉此行,则为相对于PaddleDetection路径 +``` + +然后在对应模型配置文件中将自定义数据文件路径替换为新路径,以`configs/yolov3/yolov3_mobilenet_v1_roadsign.yml`为例 + +``` +_BASE_: [ + '../datasets/roadsign_voc.yml', # 指定为自定义数据集配置路径 + '../runtime.yml', + '_base_/optimizer_40e.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams +weights: output/yolov3_mobilenet_v1_roadsign/model_final + +YOLOv3Loss: + ignore_thresh: 0.7 + label_smooth: true +``` + + +在PaddleDetection的yml配置文件中,使用`!`直接序列化模块实例(可以是函数,实例等),上述的配置文件均使用Dataset进行了序列化。 + +配置修改完成后,即可以启动训练评估,命令如下 + +``` +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml --eval +``` + +更详细的命令参考[30分钟快速上手PaddleDetection](../GETTING_STARTED_cn.md) + +**注意:** +请运行前自行仔细检查数据集的配置路径,在训练或验证时如果TrainDataset和EvalDataset的路径配置有误,会提示自动下载数据集。若使用自定义数据集,在推理时如果TestDataset路径配置有误,会提示使用默认COCO数据集的类别信息。 + + + +### (可选)生成Anchor +在yolo系列模型中,大多数情况下使用默认的anchor设置即可, 你也可以运行`tools/anchor_cluster.py`来得到适用于你的数据集Anchor,使用方法如下: +``` bash +python tools/anchor_cluster.py -c configs/ppyolo/ppyolo.yml -n 9 -s 608 -m v2 -i 1000 +``` +目前`tools/anchor_cluster.py`支持的主要参数配置如下表所示: + +| 参数 | 用途 | 默认值 | 备注 | +|:------:|:------:|:------:|:------:| +| -c/--config | 模型的配置文件 | 无默认值 | 必须指定 | +| -n/--n | 聚类的簇数 | 9 | Anchor的数目 | +| -s/--size | 图片的输入尺寸 | None | 若指定,则使用指定的尺寸,如果不指定, 则尝试从配置文件中读取图片尺寸 | +| -m/--method | 使用的Anchor聚类方法 | v2 | 目前只支持yolov2的聚类算法 | +| -i/--iters | kmeans聚类算法的迭代次数 | 1000 | kmeans算法收敛或者达到迭代次数后终止 | diff --git a/docs/tutorials/data/PrepareDetDataSet_en.md b/docs/tutorials/data/PrepareDetDataSet_en.md new file mode 100644 index 0000000000000000000000000000000000000000..dbbe90d049c4239e5fa5b075df84685d98cc91ab --- /dev/null +++ b/docs/tutorials/data/PrepareDetDataSet_en.md @@ -0,0 +1,450 @@ +# How to Prepare Training Data +## Directory +- [How to Prepare Training 
Data](#how-to-prepare-training-data) + - [Directory](#directory) + - [Description of Object Detection Data](#description-of-object-detection-data) + - [Prepare Training Data](#prepare-training-data) + - [VOC Data](#voc-data) + - [VOC Dataset Download](#voc-dataset-download) + - [Introduction to VOC Data Annotation File](#introduction-to-voc-data-annotation-file) + - [COCO Data](#coco-data) + - [COCO Data Download](#coco-data-download) + - [Description of COCO Data Annotation](#description-of-coco-data-annotation) + - [User Data](#user-data) + - [Convert User Data to VOC Data](#convert-user-data-to-voc-data) + - [Convert User Data to COCO Data](#convert-user-data-to-coco-data) + - [Reader of User Define Data](#reader-of-user-define-data) + - [Example of User Data Conversion](#example-of-user-data-conversion) + +### Description of Object Detection Data +The data of object detection is more complex than classification. In an image, it is necessary to mark the position and category of each object. + +The general object position is represented by a rectangular box, which is generally expressed in the following three ways + +| Expression | Explanation | +| :---------: | :----------------------------------------------------------------------------: | +| x1,y1,x2,y2 | (x1,y1)is the top left coordinate, (x2,y2)is the bottom right coordonate | +| x1,y1,w,h | (x1,y1)is the top left coordinate, w is width of object, h is height of object | +| xc,yc,w,h | (xc,yc)is center of object, w is width of object, h is height of object | + +Common object detection datasets such as Pascal VOC, adopting `[x1,y1,x2,y2]` to express the bounding box of object. COCO uses `[x1,y1,w,h]`, [format](https://cocodataset.org/#format-data). + +### Prepare Training Data +PaddleDetection is supported [COCO](http://cocodataset.org) and [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [WIDER-FACE](http://shuoyang1213.me/WIDERFACE/) datasets by default. + +It also supports custom data sources including: + +(1) Convert custom data to VOC format; +(2) Convert custom data to COOC format; +(3) Customize a new data source, and add custom reader; + +firstly, enter `PaddleDetection` root directory + +``` +cd PaddleDetection/ +ppdet_root=$(pwd) +``` + +#### VOC Data + +VOC data is used in [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) competition. Pascal VOC competition not only contains image classification task, but also contains object detection and object segmentation et al., the annotation file contains the ground truth of multiple tasks. +VOC dataset denotes the data of PAscal VOC competition. when customizeing VOC data, For non mandatory fields in the XML file, please select whether to label or use the default value according to the actual situation. + +##### VOC Dataset Download + +- Download VOC datasets through code automation. The datasets are large and take a long time to download + + ``` + # Execute code to automatically download VOC dataset + python dataset/voc/download_voc.py + ``` + + After code execution, the VOC dataset file organization structure is: + ``` + >>cd dataset/voc/ + >>tree + ├── create_list.py + ├── download_voc.py + ├── generic_det_label_list.txt + ├── generic_det_label_list_zh.txt + ├── label_list.txt + ├── VOCdevkit/VOC2007 + │ ├── annotations + │ ├── 001789.xml + │ | ... + │ ├── JPEGImages + │ ├── 001789.jpg + │ | ... + │ ├── ImageSets + │ | ... + ├── VOCdevkit/VOC2012 + │ ├── Annotations + │ ├── 2011_003876.xml + │ | ... + │ ├── JPEGImages + │ ├── 2011_003876.jpg + │ | ... 
+ │ ├── ImageSets + │ | ... + | ... + ``` + + Description of each document + ``` + # label_list.txt is list of classes name,filename must be label_list.txt. If using VOC dataset, when `use_default_label=true` in config file, this file is not required. + + >>cat label_list.txt + aeroplane + bicycle + ... + + # trainval.txt is file list of trainset + >>cat trainval.txt + VOCdevkit/VOC2007/JPEGImages/007276.jpg VOCdevkit/VOC2007/Annotations/007276.xml + VOCdevkit/VOC2012/JPEGImages/2011_002612.jpg VOCdevkit/VOC2012/Annotations/2011_002612.xml + ... + + # test.txt is file list of testset + >>cat test.txt + VOCdevkit/VOC2007/JPEGImages/000001.jpg VOCdevkit/VOC2007/Annotations/000001.xml + ... + + # label_list.txt voc list of classes name + >>cat label_list.txt + + aeroplane + bicycle + ... + ``` +- If the VOC dataset has been downloaded + You can organize files according to the above data file organization structure. + +##### Introduction to VOC Data Annotation File + +In VOC dataset, Each image file corresponds to an XML file with the same name, the coordinates and categories of the marked object frame in the XML file, such as `2007_002055.jpg`: +![](../images/2007_002055.jpg) + +The XML file corresponding to the image contains the basic information of the corresponding image, such as file name, source, image size, object area information and category information contained in the image. + +The XML file contains the following fields: +- filename, indicating the image name. +- size, indicating the image size, including: image width, image height and image depth + ``` + + 500 + 375 + 3 + + ``` +- object field, indict each object, including: + + | Label | Explanation | + | :--------------: | :------------------------------------------------------------------------------------------------------------------------: | + | name | name of object class | + | pose | attitude description of the target object (non required field) | + | truncated | If the occlusion of the object exceeds 15-20% and is outside the bounding box,mark it as `truncated` (non required field) | + | difficult | objects that are difficult to recognize are marked as`difficult` (non required field) | + | bndbox son laebl | (xmin,ymin) top left coordinate, (xmax,ymax) bottom right coordinate | + + +#### COCO Data +COOC data is used in [COCO](http://cocodataset.org) competition. alike, Coco competition also contains multiple competition tasks, and its annotation file contains the annotation contents of multiple tasks. +The coco dataset refers to the data used in the coco competition. Customizing coco data, some fields in JSON file, please select whether to label or use the default value according to the actual situation. + + +##### COCO Data Download +- The coco dataset is downloaded automatically through the code. The dataset is large and takes a long time to download + + ``` + # automatically download coco datasets by executing code + python dataset/coco/download_coco.py + ``` + + after code execution, the organization structure of coco dataset file is: + ``` + >>cd dataset/coco/ + >>tree + ├── annotations + │ ├── instances_train2017.json + │ ├── instances_val2017.json + │ | ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + │ | ... + ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + │ | ... + | ... + ``` +- If the coco dataset has been downloaded + The files can be organized according to the above data file organization structure. 
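+
+A quick way to confirm the layout before training is a small check such as the sketch below (a hypothetical helper, not part of PaddleDetection; it assumes the default `dataset/coco` layout shown above):
+
+```python
+# Minimal sketch: verify that the expected COCO folders and annotation files exist.
+# Assumes the default dataset/coco layout described above.
+import os
+
+dataset_dir = "dataset/coco"
+expected = [
+    "annotations/instances_train2017.json",
+    "annotations/instances_val2017.json",
+    "train2017",
+    "val2017",
+]
+for rel_path in expected:
+    path = os.path.join(dataset_dir, rel_path)
+    print(f"{path}: {'found' if os.path.exists(path) else 'MISSING'}")
+```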
+ +##### Description of COCO Data Annotation +Coco data annotation is to store the annotations of all training images in a JSON file. Data is stored in the form of nested dictionaries. + +The JSON file contains the following keys: +- info,indicating the annotation file info。 +- licenses, indicating the label file licenses。 +- images, indicating the list of image information in the annotation file, and each element is the information of an image. The following is the information of one of the images: + ``` + { + 'license': 3, # license + 'file_name': '000000391895.jpg', # file_name + # coco_url + 'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg', + 'height': 360, # image height + 'width': 640, # image width + 'date_captured': '2013-11-14 11:18:45', # date_captured + # flickr_url + 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', + 'id': 391895 # image id + } + ``` +- annotations: indicating the annotation information list of the target object in the annotation file. Each element is the annotation information of a target object. The following is the annotation information of one of the target objects: + ``` + { + + 'segmentation': # object segmentation annotation + 'area': 2765.1486500000005, # object area + 'iscrowd': 0, # iscrowd + 'image_id': 558840, # image id + 'bbox': [199.84, 200.46, 77.71, 70.88], # bbox [x1,y1,w,h] + 'category_id': 58, # category_id + 'id': 156 # image id + } + ``` + + ``` + # Viewing coco annotation files + import json + coco_anno = json.load(open('./annotations/instances_train2017.json')) + + # coco_anno.keys + print('\nkeys:', coco_anno.keys()) + + # Viewing categories information + print('\ncategories:', coco_anno['categories']) + + # Viewing the number of images + print('\nthe number of images:', len(coco_anno['images'])) + + # Viewing the number of obejcts + print('\nthe number of annotation:', len(coco_anno['annotations'])) + + # View object annotation information + print('\nobject annotation information: ', coco_anno['annotations'][0]) + ``` + + Coco data is prepared as follows. + `dataset/coco/`Initial document organization + ``` + >>cd dataset/coco/ + >>tree + ├── download_coco.py + ``` + +#### User Data +There are three processing methods for user data: + (1) Convert user data into VOC data (only include labels necessary for object detection as required) + (2) Convert user data into coco data (only include labels necessary for object detection as required) + (3) Customize a reader for user data (for complex data, you need to customize the reader) + +##### Convert User Data to VOC Data +After the user dataset is converted to VOC data, the directory structure is as follows (note that the path name and file name in the dataset should not use Chinese as far as possible to avoid errors caused by Chinese coding problems): + +``` +dataset/xxx/ +├── annotations +│ ├── xxx1.xml +│ ├── xxx2.xml +│ ├── xxx3.xml +│ | ... +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +├── label_list.txt (Must be provided and the file name must be label_list.txt ) +├── train.txt (list of trainset ./images/xxx1.jpg ./annotations/xxx1.xml) +└── valid.txt (list of valid file) +``` + +Description of each document +``` +# label_list.txt is a list of category names. The file name must be this +>>cat label_list.txt +classname1 +classname2 +... + +# train.txt is list of trainset +>>cat train.txt +./images/xxx1.jpg ./annotations/xxx1.xml +./images/xxx2.jpg ./annotations/xxx2.xml +... 
+ +# valid.txt is list of validset +>>cat valid.txt +./images/xxx3.jpg ./annotations/xxx3.xml +... +``` + +##### Convert User Data to COCO Data +`x2coco.py` is provided in `./tools/` to convert VOC dataset, labelme labeled dataset or cityscape dataset into coco data, for example: + +(1)Conversion of labelme data to coco data: +```bash +python tools/x2coco.py \ + --dataset_type labelme \ + --json_input_dir ./labelme_annos/ \ + --image_input_dir ./labelme_imgs/ \ + --output_dir ./cocome/ \ + --train_proportion 0.8 \ + --val_proportion 0.2 \ + --test_proportion 0.0 +``` +(2)Convert VOC data to coco data: +```bash +python tools/x2coco.py \ + --dataset_type voc \ + --voc_anno_dir path/to/VOCdevkit/VOC2007/Annotations/ \ + --voc_anno_list path/to/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt \ + --voc_label_list dataset/voc/label_list.txt \ + --voc_out_name voc_train.json +``` + +After the user dataset is converted to coco data, the directory structure is as follows (note that the path name and file name in the dataset should not use Chinese as far as possible to avoid errors caused by Chinese coding problems): +``` +dataset/xxx/ +├── annotations +│ ├── train.json # Annotation file of coco data +│ ├── valid.json # Annotation file of coco data +├── images +│ ├── xxx1.jpg +│ ├── xxx2.jpg +│ ├── xxx3.jpg +│ | ... +... +``` + +##### Reader of User Define Data + If new data in the dataset needs to be added to paddedetection, you can refer to the [add new data source] (../advanced_tutorials/READER.md#2.3_Customizing_Dataset) document section in the data processing document to develop corresponding code to complete the new data source support. At the same time, you can read the [data processing document] (../advanced_tutorials/READER.md) for specific code analysis of data processing + +The configuration file for the Dataset exists in the `configs/datasets` folder. For example, the COCO dataset configuration file is as follows: +``` +metric: COCO # Currently supports COCO, VOC, OID, Wider Face and other evaluation standards +num_classes: 80 # num_classes: The number of classes in the dataset, excluding background classes + +TrainDataset: + !COCODataSet + image_dir: train2017 # The path where the training set image resides relative to the dataset_dir + anno_path: annotations/instances_train2017.json # Path to the annotation file of the training set relative to the dataset_dir + dataset_dir: dataset/coco #The path where the dataset is located relative to the PaddleDetection path + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] # Controls the fields contained in the sample output of the dataset, note data_fields are unique to the trainreader and must be configured + +EvalDataset: + !COCODataSet + image_dir: val2017 # The path where the images of the validation set reside relative to the dataset_dir + anno_path: annotations/instances_val2017.json # The path to the annotation file of the validation set relative to the dataset_dir + dataset_dir: dataset/coco # The path where the dataset is located relative to the PaddleDetection path +TestDataset: + !ImageFolder + anno_path: dataset/coco/annotations/instances_val2017.json # The path of the annotation file, it is only used to read the category information of the dataset. 
#### Example of User Data Conversion
Take the Kaggle [road-sign-detection](https://www.kaggle.com/andrewmvd/road-sign-detection) competition data as an example to illustrate how to prepare custom data. The dataset contains 877 images and four categories: crosswalk, speedlimit, stop, trafficlight. It can be downloaded from Kaggle, or from this [link](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar).
Example image from the road sign dataset:
![](../images/road554.png)

```
# Download and unzip the data
>>cd $(ppdet_root)/dataset
# Download and unzip the kaggle dataset. The current file organization is as follows

├── annotations
│   ├── road0.xml
│   ├── road1.xml
│   ├── road10.xml
│   |   ...
├── images
│   ├── road0.png
│   ├── road1.png
│   ├── road2.png
│   |   ...
```

Split the data into training, validation and test sets:
```
# Generating label_list.txt
>>echo -e "speedlimit\ncrosswalk\ntrafficlight\nstop" > label_list.txt

# Generating train.txt, valid.txt and test.txt
>>ls images/*.png | shuf > all_image_list.txt
>>awk -F"/" '{print $2}' all_image_list.txt | awk -F".png" '{print $1}' | awk -F"\t" '{print "images/"$1".png annotations/"$1".xml"}' > all_list.txt

# The proportions of the training, validation and test sets are about 80%, 10% and 10% respectively.
>>head -n 88 all_list.txt > test.txt
>>head -n 176 all_list.txt | tail -n 88 > valid.txt
>>tail -n 701 all_list.txt > train.txt

# Deleting unused files
>>rm -rf all_image_list.txt all_list.txt

# The organization of the final dataset files is:

├── annotations
│   ├── road0.xml
│   ├── road1.xml
│   ├── road10.xml
│   |   ...
├── images
│   ├── road0.png
│   ├── road1.png
│   ├── road2.png
│   |   ...
├── label_list.txt
├── test.txt
├── train.txt
└── valid.txt

# label_list.txt is the list of class names; the file name must be label_list.txt
>>cat label_list.txt
crosswalk
speedlimit
stop
trafficlight

# train.txt is the list of training set files. Each line is an image path and the corresponding annotation file path, separated by a space. Note that the paths are relative paths inside the dataset folder.
>>cat train.txt
./images/road839.png ./annotations/road839.xml
./images/road363.png ./annotations/road363.xml
...

# valid.txt is the list of validation set files. Each line is an image path and the corresponding annotation file path, separated by a space. Note that the paths are relative paths inside the dataset folder.
>>cat valid.txt
./images/road218.png ./annotations/road218.xml
./images/road681.png ./annotations/road681.xml
```

You can also download [the prepared data](https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar) and unzip it to `dataset/roadsign_voc/`.
After preparing the data, it is a good idea to get an overview of it, such as the number of images, the image sizes, the number of target boxes per class and the target box sizes, and to clean the data if necessary.

Roadsign dataset statistics:

| data | number of images |
| :---: | :--------------: |
| train | 701 |
| valid | 176 |
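Such statistics, including per-class box counts that help spot class imbalance, can be gathered with a short script. The following is only an illustrative sketch; it assumes the dataset has been unpacked to `dataset/roadsign_voc/` with the `train.txt`/`valid.txt` lists shown above:

```python
# Sketch: simple statistics for a VOC-style dataset such as roadsign_voc.
# The dataset location is an assumption; adjust it to your own setup.
import os
import xml.etree.ElementTree as ET
from collections import Counter

dataset_root = 'dataset/roadsign_voc'   # assumed location after unpacking

for split in ('train.txt', 'valid.txt'):
    with open(os.path.join(dataset_root, split)) as f:
        lines = [l.split() for l in f.read().splitlines() if l.strip()]
    class_counter = Counter()
    for _, xml_rel in lines:
        root = ET.parse(os.path.join(dataset_root, xml_rel)).getroot()
        # each <object> element holds one annotated box with its class <name>
        class_counter.update(obj.findtext('name') for obj in root.findall('object'))
    print(split, 'images:', len(lines), 'boxes per class:', dict(class_counter))
```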
**Explanation:**
 (1) For user data, it is recommended to check the data carefully before training, to avoid crashes during training caused by a wrong annotation format or corrupt/incomplete image files
 (2) If the images are very large and the read size is not limited, they will occupy a lot of memory and may cause memory / GPU memory overflow. Please set batch_size reasonably; you can try increasing it from a small value
diff --git a/docs/tutorials/data/README.md b/docs/tutorials/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..947b650e18cbc9cf9bb57c8b6600588ed0a6501f
--- /dev/null
+++ b/docs/tutorials/data/README.md
@@ -0,0 +1,27 @@
+# Data Preparation
+
+Data plays a crucial role in deep learning development, and the quality of data collection and annotation is an important factor in improving the performance of a production model. This document describes how to prepare data in PaddleDetection: how to collect high-quality data that covers multiple scene types and improves model generalization, and the annotation tools and methods for each task type and how to use the resulting data in PaddleDetection.
+
+## Data Collection
+When deep learning tasks are deployed in practice, data collection often determines the final model performance. A few suggestions on data collection:
+
+### Define the Direction
+The task type, the data categories and the target scenes determine what data to collect; first determine the overall direction of the data collection work based on these factors.
+
+### Open-Source Datasets
+In real scenarios data collection is very expensive, and collecting everything yourself costs a lot of time and money. Open-source datasets are an important way to increase the amount of training data, so open-source data from similar tasks is often added. Please comply with the conditions of use specified by the license of each open-source dataset.
+
+### Add Scene Data
+Open-source data generally does not cover the actual target scene. Users need to assess the difference between the scenes contained in the open-source datasets and the target scene, and supplement target-scene data accordingly, so that the training data matches the deployment scene as closely as possible.
+
+### Class Balance
+During collection, also try to keep the classes balanced, which helps the model learn the target features correctly.
+
+
+## Data Annotation and Format Description
+
+| Task Type | Data Annotation | Data Format Description |
+|:--------:| :--------:|:--------:|
+| Object Detection | [doc](DetAnnoTools.md) | [doc](PrepareDetDataSet.md) |
+| Keypoint Detection | [doc](KeyPointAnnoTools.md) | [doc](PrepareKeypointDataSet.md) |
+| Multi-Object Tracking | [doc](MOTAnnoTools.md) | [doc](PrepareMOTDataSet.md) |
diff --git a/docs/tutorials/logging_en.md b/docs/tutorials/logging_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b45ceba69d39098f70d0b8825d372529ce40cd0b
--- /dev/null
+++ b/docs/tutorials/logging_en.md
@@ -0,0 +1,46 @@
+# Logging
+
+This document describes how to track metrics and visualize model performance during training. The library currently supports [VisualDL](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/03_VisualDL/visualdl_usage_en.html) and [Weights & Biases](https://docs.wandb.ai).
+
+## VisualDL
+Logging to VisualDL is supported only in Python >= 3.5. To install VisualDL:
+
+```
+pip install visualdl
+```
+
+PaddleDetection uses a callback to log the training metrics at the end of every step and metrics from the validation step at the end of every epoch. To use VisualDL for visualization, add the `--use_vdl` flag to the training command and `--vdl_log_dir` to set the directory which stores the records.
+
+For example:
+
+```
+python tools/train.py -c config.yml --use_vdl --vdl_log_dir ./logs
+```
+
+Another possible way to do this is to add the aforementioned flags to the `config.yml` file.
+
+## Weights & Biases
+W&B is an MLOps tool that can be used for experiment tracking, dataset/model versioning, visualizing results and collaborating with colleagues.
A W&B logger is integrated directly into PaddleDetection and to use it, first you need to install the wandb sdk and login to your wandb account. + +``` +pip install wandb +wandb login +``` + +To use wandb to log metrics while training add the `--use_wandb` flag to the training command and any other arguments for the W&B logger can be provided like this - + +``` +python tools/train -c config.yml --use_wandb -o wandb-project=MyDetector wandb-entity=MyTeam wandb-save_dir=./logs +``` + +The arguments to the W&B logger must be proceeded by `-o` and each invidiual argument must contain the prefix "wandb-". + +If this is too tedious, an alternative way is to add the arguments to the `config.yml` file under the `wandb` header. For example + +``` +use_wandb: True +wandb: + project: MyProject + entity: MyTeam + save_dir: ./logs +``` diff --git a/ppdet/__init__.py b/ppdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6fcc982fb60c796e6b9b6e23026d50ef0e9611ae --- /dev/null +++ b/ppdet/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import (core, data, engine, modeling, model_zoo, optimizer, metrics, + utils, slim) + + +try: + from .version import full_version as __version__ + from .version import commit as __git_commit__ +except ImportError: + import sys + sys.stderr.write("Warning: import ppdet from source directory " \ + "without installing, run 'python setup.py install' to " \ + "install ppdet firstly\n") diff --git a/ppdet/core/__init__.py b/ppdet/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0427717715c9af31dfa57f1b69f8369fc9178a2 --- /dev/null +++ b/ppdet/core/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import config diff --git a/ppdet/core/config/__init__.py b/ppdet/core/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/ppdet/core/config/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/core/config/schema.py b/ppdet/core/config/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..2e41b5c34693a709fa61d47489f6934ead0c17e0 --- /dev/null +++ b/ppdet/core/config/schema.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import inspect +import importlib +import re + +try: + from docstring_parser import parse as doc_parse +except Exception: + + def doc_parse(*args): + pass + + +try: + from typeguard import check_type +except Exception: + + def check_type(*args): + pass + + +__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] + + +class SchemaValue(object): + def __init__(self, name, doc='', type=None): + super(SchemaValue, self).__init__() + self.name = name + self.doc = doc + self.type = type + + def set_default(self, value): + self.default = value + + def has_default(self): + return hasattr(self, 'default') + + +class SchemaDict(dict): + def __init__(self, **kwargs): + super(SchemaDict, self).__init__() + self.schema = {} + self.strict = False + self.doc = "" + self.update(kwargs) + + def __setitem__(self, key, value): + # XXX also update regular dict to SchemaDict?? 
+ if isinstance(value, dict) and key in self and isinstance(self[key], + SchemaDict): + self[key].update(value) + else: + super(SchemaDict, self).__setitem__(key, value) + + def __missing__(self, key): + if self.has_default(key): + return self.schema[key].default + elif key in self.schema: + return self.schema[key] + else: + raise KeyError(key) + + def copy(self): + newone = SchemaDict() + newone.__dict__.update(self.__dict__) + newone.update(self) + return newone + + def set_schema(self, key, value): + assert isinstance(value, SchemaValue) + self.schema[key] = value + + def set_strict(self, strict): + self.strict = strict + + def has_default(self, key): + return key in self.schema and self.schema[key].has_default() + + def is_default(self, key): + if not self.has_default(key): + return False + if hasattr(self[key], '__dict__'): + return True + else: + return key not in self or self[key] == self.schema[key].default + + def find_default_keys(self): + return [ + k for k in list(self.keys()) + list(self.schema.keys()) + if self.is_default(k) + ] + + def mandatory(self): + return any([k for k in self.schema.keys() if not self.has_default(k)]) + + def find_missing_keys(self): + missing = [ + k for k in self.schema.keys() + if k not in self and not self.has_default(k) + ] + placeholders = [k for k in self if self[k] in ('', '')] + return missing + placeholders + + def find_extra_keys(self): + return list(set(self.keys()) - set(self.schema.keys())) + + def find_mismatch_keys(self): + mismatch_keys = [] + for arg in self.schema.values(): + if arg.type is not None: + try: + check_type("{}.{}".format(self.name, arg.name), + self[arg.name], arg.type) + except Exception: + mismatch_keys.append(arg.name) + return mismatch_keys + + def validate(self): + missing_keys = self.find_missing_keys() + if missing_keys: + raise ValueError("Missing param for class<{}>: {}".format( + self.name, ", ".join(missing_keys))) + extra_keys = self.find_extra_keys() + if extra_keys and self.strict: + raise ValueError("Extraneous param for class<{}>: {}".format( + self.name, ", ".join(extra_keys))) + mismatch_keys = self.find_mismatch_keys() + if mismatch_keys: + raise TypeError("Wrong param type for class<{}>: {}".format( + self.name, ", ".join(mismatch_keys))) + + +class SharedConfig(object): + """ + Representation class for `__shared__` annotations, which work as follows: + + - if `key` is set for the module in config file, its value will take + precedence + - if `key` is not set for the module but present in the config file, its + value will be used + - otherwise, use the provided `default_value` as fallback + + Args: + key: config[key] will be injected + default_value: fallback value + """ + + def __init__(self, key, default_value=None): + super(SharedConfig, self).__init__() + self.key = key + self.default_value = default_value + + +def extract_schema(cls): + """ + Extract schema from a given class + + Args: + cls (type): Class from which to extract. + + Returns: + schema (SchemaDict): Extracted schema. 
+ """ + ctor = cls.__init__ + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(ctor) + annotations = argspec.annotations + has_kwargs = argspec.varkw is not None + else: + argspec = inspect.getfullargspec(ctor) + # python 2 type hinting workaround, see pep-3107 + # however, since `typeguard` does not support python 2, type checking + # is still python 3 only for now + annotations = getattr(ctor, '__annotations__', {}) + has_kwargs = argspec.varkw is not None + + names = [arg for arg in argspec.args if arg != 'self'] + defaults = argspec.defaults + num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 + num_required = len(names) - num_defaults + + docs = cls.__doc__ + if docs is None and getattr(cls, '__category__', None) == 'op': + docs = cls.__call__.__doc__ + try: + docstring = doc_parse(docs) + except Exception: + docstring = None + + if docstring is None: + comments = {} + else: + comments = {} + for p in docstring.params: + match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) + if match_obj is not None: + comments[match_obj.group(1)] = p.description + + schema = SchemaDict() + schema.name = cls.__name__ + schema.doc = "" + if docs is not None: + start_pos = docs[0] == '\n' and 1 or 0 + schema.doc = docs[start_pos:].split("\n")[0].strip() + # XXX handle paddle's weird doc convention + if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: + schema.doc = schema.doc[2:-2].strip() + schema.category = hasattr(cls, '__category__') and getattr( + cls, '__category__') or 'module' + schema.strict = not has_kwargs + schema.pymodule = importlib.import_module(cls.__module__) + schema.inject = getattr(cls, '__inject__', []) + schema.shared = getattr(cls, '__shared__', []) + for idx, name in enumerate(names): + comment = name in comments and comments[name] or name + if name in schema.inject: + type_ = None + else: + type_ = name in annotations and annotations[name] or None + value_schema = SchemaValue(name, comment, type_) + if name in schema.shared: + assert idx >= num_required, "shared config must have default value" + default = defaults[idx - num_required] + value_schema.set_default(SharedConfig(name, default)) + elif idx >= num_required: + default = defaults[idx - num_required] + value_schema.set_default(default) + schema.set_schema(name, value_schema) + + return schema diff --git a/ppdet/core/config/yaml_helpers.py b/ppdet/core/config/yaml_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..181cfe6fcd7368c6cadb32d1021a8c55a1d98aa5 --- /dev/null +++ b/ppdet/core/config/yaml_helpers.py @@ -0,0 +1,118 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import inspect + +import yaml +from .schema import SharedConfig + +__all__ = ['serializable', 'Callable'] + + +def represent_dictionary_order(self, dict_data): + return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) + + +def setup_orderdict(): + from collections import OrderedDict + yaml.add_representer(OrderedDict, represent_dictionary_order) + + +def _make_python_constructor(cls): + def python_constructor(loader, node): + if isinstance(node, yaml.SequenceNode): + args = loader.construct_sequence(node, deep=True) + return cls(*args) + else: + kwargs = loader.construct_mapping(node, deep=True) + try: + return cls(**kwargs) + except Exception as ex: + print("Error when construct {} instance from yaml config". + format(cls.__name__)) + raise ex + + return python_constructor + + +def _make_python_representer(cls): + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(cls) + else: + argspec = inspect.getfullargspec(cls.__init__) + argnames = [arg for arg in argspec.args if arg != 'self'] + + def python_representer(dumper, obj): + if argnames: + data = {name: getattr(obj, name) for name in argnames} + else: + data = obj.__dict__ + if '_id' in data: + del data['_id'] + return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) + + return python_representer + + +def serializable(cls): + """ + Add loader and dumper for given class, which must be + "trivially serializable" + + Args: + cls: class to be serialized + + Returns: cls + """ + yaml.add_constructor(u'!{}'.format(cls.__name__), + _make_python_constructor(cls)) + yaml.add_representer(cls, _make_python_representer(cls)) + return cls + + +yaml.add_representer(SharedConfig, + lambda d, o: d.represent_data(o.default_value)) + + +@serializable +class Callable(object): + """ + Helper to be used in Yaml for creating arbitrary class objects + + Args: + full_type (str): the full module path to target function + """ + + def __init__(self, full_type, args=[], kwargs={}): + super(Callable, self).__init__() + self.full_type = full_type + self.args = args + self.kwargs = kwargs + + def __call__(self): + if '.' in self.full_type: + idx = self.full_type.rfind('.') + module = importlib.import_module(self.full_type[:idx]) + func_name = self.full_type[idx + 1:] + else: + try: + module = importlib.import_module('builtins') + except Exception: + module = importlib.import_module('__builtin__') + func_name = self.full_type + + func = getattr(module, func_name) + return func(*self.args, **self.kwargs) diff --git a/ppdet/core/workspace.py b/ppdet/core/workspace.py new file mode 100644 index 0000000000000000000000000000000000000000..6735bcfc26d426565bf0c4cef50dd100f4c5fd30 --- /dev/null +++ b/ppdet/core/workspace.py @@ -0,0 +1,292 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import importlib +import os +import sys + +import yaml +import collections + +try: + collectionsAbc = collections.abc +except AttributeError: + collectionsAbc = collections + +from .config.schema import SchemaDict, SharedConfig, extract_schema +from .config.yaml_helpers import serializable + +__all__ = [ + 'global_config', + 'load_config', + 'merge_config', + 'get_registered_modules', + 'create', + 'register', + 'serializable', + 'dump_value', +] + + +def dump_value(value): + # XXX this is hackish, but collections.abc is not available in python 2 + if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): + value = yaml.dump(value, default_flow_style=True) + value = value.replace('\n', '') + value = value.replace('...', '') + return "'{}'".format(value) + else: + # primitive types + return str(value) + + +class AttrDict(dict): + """Single level attribute dict, NOT recursive""" + + def __init__(self, **kwargs): + super(AttrDict, self).__init__() + super(AttrDict, self).update(kwargs) + + def __getattr__(self, key): + if key in self: + return self[key] + raise AttributeError("object has no attribute '{}'".format(key)) + + def __setattr__(self, key, value): + self[key] = value + + def copy(self): + new_dict = AttrDict() + for k, v in self.items(): + new_dict.update({k: v}) + return new_dict + + +global_config = AttrDict() + +BASE_KEY = '_BASE_' + + +# parse and load _BASE_ recursively +def _load_config_with_base(file_path): + with open(file_path) as f: + file_cfg = yaml.load(f, Loader=yaml.Loader) + + # NOTE: cfgs outside have higher priority than cfgs in _BASE_ + if BASE_KEY in file_cfg: + all_base_cfg = AttrDict() + base_ymls = list(file_cfg[BASE_KEY]) + for base_yml in base_ymls: + if base_yml.startswith("~"): + base_yml = os.path.expanduser(base_yml) + if not base_yml.startswith('/'): + base_yml = os.path.join(os.path.dirname(file_path), base_yml) + + with open(base_yml) as f: + base_cfg = _load_config_with_base(base_yml) + all_base_cfg = merge_config(base_cfg, all_base_cfg) + + del file_cfg[BASE_KEY] + return merge_config(file_cfg, all_base_cfg) + + return file_cfg + + +def load_config(file_path): + """ + Load config from file. + + Args: + file_path (str): Path of the config file to be loaded. + + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + + # load config from file and merge into global config + cfg = _load_config_with_base(file_path) + cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] + merge_config(cfg) + + return global_config + + +def dict_merge(dct, merge_dct): + """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of + updating only top-level keys, dict_merge recurses down into dicts nested + to an arbitrary depth, updating keys. The ``merge_dct`` is merged into + ``dct``. + + Args: + dct: dict onto which the merge is executed + merge_dct: dct merged into dct + + Returns: dct + """ + for k, v in merge_dct.items(): + if (k in dct and isinstance(dct[k], dict) and + isinstance(merge_dct[k], collectionsAbc.Mapping)): + dict_merge(dct[k], merge_dct[k]) + else: + dct[k] = merge_dct[k] + return dct + + +def merge_config(config, another_cfg=None): + """ + Merge config into global config or another_cfg. + + Args: + config (dict): Config to be merged. 
+ + Returns: global config + """ + global global_config + dct = another_cfg or global_config + return dict_merge(dct, config) + + +def get_registered_modules(): + return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} + + +def make_partial(cls): + op_module = importlib.import_module(cls.__op__.__module__) + op = getattr(op_module, cls.__op__.__name__) + cls.__category__ = getattr(cls, '__category__', None) or 'op' + + def partial_apply(self, *args, **kwargs): + kwargs_ = self.__dict__.copy() + kwargs_.update(kwargs) + return op(*args, **kwargs_) + + if getattr(cls, '__append_doc__', True): # XXX should default to True? + if sys.version_info[0] > 2: + cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) + cls.__init__.__doc__ = op.__doc__ + cls.__call__ = partial_apply + cls.__call__.__doc__ = op.__doc__ + else: + # XXX work around for python 2 + partial_apply.__doc__ = op.__doc__ + cls.__call__ = partial_apply + return cls + + +def register(cls): + """ + Register a given module class. + + Args: + cls (type): Module class to be registered. + + Returns: cls + """ + if cls.__name__ in global_config: + raise ValueError("Module class already registered: {}".format( + cls.__name__)) + if hasattr(cls, '__op__'): + cls = make_partial(cls) + global_config[cls.__name__] = extract_schema(cls) + return cls + + +def create(cls_or_name, **kwargs): + """ + Create an instance of given module class. + + Args: + cls_or_name (type or str): Class of which to create instance. + + Returns: instance of type `cls_or_name` + """ + assert type(cls_or_name) in [type, str + ], "should be a class or name of a class" + name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ + if name in global_config: + if isinstance(global_config[name], SchemaDict): + pass + elif hasattr(global_config[name], "__dict__"): + # support instance return directly + return global_config[name] + else: + raise ValueError("The module {} is not registered".format(name)) + else: + raise ValueError("The module {} is not registered".format(name)) + + config = global_config[name] + cls = getattr(config.pymodule, name) + cls_kwargs = {} + cls_kwargs.update(global_config[name]) + + # parse `shared` annoation of registered modules + if getattr(config, 'shared', None): + for k in config.shared: + target_key = config[k] + shared_conf = config.schema[k].default + assert isinstance(shared_conf, SharedConfig) + if target_key is not None and not isinstance(target_key, + SharedConfig): + continue # value is given for the module + elif shared_conf.key in global_config: + # `key` is present in config + cls_kwargs[k] = global_config[shared_conf.key] + else: + cls_kwargs[k] = shared_conf.default_value + + # parse `inject` annoation of registered modules + if getattr(cls, 'from_config', None): + cls_kwargs.update(cls.from_config(config, **kwargs)) + + if getattr(config, 'inject', None): + for k in config.inject: + target_key = config[k] + # optional dependency + if target_key is None: + continue + + if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): + if 'name' not in target_key.keys(): + continue + inject_name = str(target_key['name']) + if inject_name not in global_config: + raise ValueError( + "Missing injection name {} and check it's name in cfg file". 
+ format(k)) + target = global_config[inject_name] + for i, v in target_key.items(): + if i == 'name': + continue + target[i] = v + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(inject_name) + elif isinstance(target_key, str): + if target_key not in global_config: + raise ValueError("Missing injection config:", target_key) + target = global_config[target_key] + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(target_key) + elif hasattr(target, '__dict__'): # serialized object + cls_kwargs[k] = target + else: + raise ValueError("Unsupported injection type:", target_key) + # prevent modification of global config values of reference types + # (e.g., list, dict) from within the created module instances + #kwargs = copy.deepcopy(kwargs) + return cls(**cls_kwargs) diff --git a/ppdet/data/__init__.py b/ppdet/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a12aa323e7350d13e9b02ff7816ae6d69ab9044e --- /dev/null +++ b/ppdet/data/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import source +from . import transform +from . import reader + +from .source import * +from .transform import * +from .reader import * diff --git a/ppdet/data/reader.py b/ppdet/data/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..227fabca6dce8ef76dabe88864a69000d38468dd --- /dev/null +++ b/ppdet/data/reader.py @@ -0,0 +1,611 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import traceback +import six +import sys +if sys.version_info >= (3, 0): + pass +else: + pass +import numpy as np +import paddle +import paddle.nn.functional as F + +from copy import deepcopy + +from paddle.io import DataLoader, DistributedBatchSampler +from .utils import default_collate_fn + +from ppdet.core.workspace import register +from . 
import transform +from .shm_utils import _get_shared_memory_size_in_M + +from ppdet.utils.logger import setup_logger +logger = setup_logger('reader') + +MAIN_PID = os.getpid() + + +class Compose(object): + def __init__(self, transforms, num_classes=80): + self.transforms = transforms + self.transforms_cls = [] + for t in self.transforms: + for k, v in t.items(): + op_cls = getattr(transform, k) + f = op_cls(**v) + if hasattr(f, 'num_classes'): + f.num_classes = num_classes + + self.transforms_cls.append(f) + + def __call__(self, data): + for f in self.transforms_cls: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map sample transform [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + return data + + +class BatchCompose(Compose): + def __init__(self, transforms, num_classes=80, collate_batch=True): + super(BatchCompose, self).__init__(transforms, num_classes) + self.collate_batch = collate_batch + + def __call__(self, data): + for f in self.transforms_cls: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map batch transform [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + # remove keys which is not needed by model + extra_key = ['h', 'w', 'flipped'] + for k in extra_key: + for sample in data: + if k in sample: + sample.pop(k) + + # batch data, if user-define batch function needed + # use user-defined here + if self.collate_batch: + batch_data = default_collate_fn(data) + else: + batch_data = {} + for k in data[0].keys(): + tmp_data = [] + for i in range(len(data)): + tmp_data.append(data[i][k]) + if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: + tmp_data = np.stack(tmp_data, axis=0) + batch_data[k] = tmp_data + return batch_data + + +class BaseDataLoader(object): + """ + Base DataLoader implementation for detection models + + Args: + sample_transforms (list): a list of transforms to perform + on each sample + batch_transforms (list): a list of transforms to perform + on batch + batch_size (int): batch size for batch collating, default 1. + shuffle (bool): whether to shuffle samples + drop_last (bool): whether to drop the last incomplete, + default False + num_classes (int): class number of dataset, default 80 + collate_batch (bool): whether to collate batch in dataloader. + If set to True, the samples will collate into batch according + to the batch size. Otherwise, the ground-truth will not collate, + which is used when the number of ground-truch is different in + samples. + use_shared_memory (bool): whether to use shared memory to + accelerate data loading, enable this only if you + are sure that the shared memory size of your OS + is larger than memory cost of input datas of model. + Note that shared memory will be automatically + disabled if the shared memory of OS is less than + 1G, which is not enough for detection models. + Default False. 
+ """ + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=False, + num_classes=80, + collate_batch=True, + use_shared_memory=False, + **kwargs): + # sample transform + self._sample_transforms = Compose( + sample_transforms, num_classes=num_classes) + + # batch transfrom + self._batch_transforms = BatchCompose(batch_transforms, num_classes, + collate_batch) + self.batch_size = batch_size + self.shuffle = shuffle + self.drop_last = drop_last + self.use_shared_memory = use_shared_memory + self.kwargs = kwargs + + def __call__(self, + dataset, + worker_num, + batch_sampler=None, + return_list=False): + self.dataset = dataset + self.dataset.check_or_download_dataset() + self.dataset.parse_dataset() + # get data + self.dataset.set_transform(self._sample_transforms) + # set kwargs + self.dataset.set_kwargs(**self.kwargs) + # batch sampler + if batch_sampler is None: + self._batch_sampler = DistributedBatchSampler( + self.dataset, + batch_size=self.batch_size, + shuffle=self.shuffle, + drop_last=self.drop_last) + else: + self._batch_sampler = batch_sampler + + # DataLoader do not start sub-process in Windows and Mac + # system, do not need to use shared memory + use_shared_memory = self.use_shared_memory and \ + sys.platform not in ['win32', 'darwin'] + # check whether shared memory size is bigger than 1G(1024M) + if use_shared_memory: + shm_size = _get_shared_memory_size_in_M() + if shm_size is not None and shm_size < 1024.: + logger.warning("Shared memory size is less than 1G, " + "disable shared_memory in DataLoader") + use_shared_memory = False + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_sampler=self._batch_sampler, + collate_fn=self._batch_transforms, + num_workers=worker_num, + return_list=return_list, + use_shared_memory=use_shared_memory) + self.loader = iter(self.dataloader) + + return self + + def __len__(self): + return len(self._batch_sampler) + + def __iter__(self): + return self + + def __next__(self): + try: + return next(self.loader) + except StopIteration: + self.loader = iter(self.dataloader) + six.reraise(*sys.exc_info()) + + def next(self): + # python2 compatibility + return self.__next__() + + +@register +class TrainReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=True, + drop_last=True, + num_classes=80, + collate_batch=True, + **kwargs): + super(TrainReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + num_classes, collate_batch, **kwargs) + + +@register +class EvalReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=True, + num_classes=80, + **kwargs): + super(EvalReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + num_classes, **kwargs) + + +@register +class TestReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=False, + num_classes=80, + **kwargs): + super(TestReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + num_classes, **kwargs) + + +@register +class EvalMOTReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + 
drop_last=False, + num_classes=1, + **kwargs): + super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + num_classes, **kwargs) + + +@register +class TestMOTReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=False, + num_classes=1, + **kwargs): + super(TestMOTReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + num_classes, **kwargs) + + +# For Semi-Supervised Object Detection (SSOD) +class Compose_SSOD(object): + def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80): + self.base_transforms = base_transforms + self.base_transforms_cls = [] + for t in self.base_transforms: + for k, v in t.items(): + op_cls = getattr(transform, k) + f = op_cls(**v) + if hasattr(f, 'num_classes'): + f.num_classes = num_classes + self.base_transforms_cls.append(f) + + self.weak_augs = weak_aug + self.weak_augs_cls = [] + for t in self.weak_augs: + for k, v in t.items(): + op_cls = getattr(transform, k) + f = op_cls(**v) + if hasattr(f, 'num_classes'): + f.num_classes = num_classes + self.weak_augs_cls.append(f) + + self.strong_augs = strong_aug + self.strong_augs_cls = [] + for t in self.strong_augs: + for k, v in t.items(): + op_cls = getattr(transform, k) + f = op_cls(**v) + if hasattr(f, 'num_classes'): + f.num_classes = num_classes + self.strong_augs_cls.append(f) + + def __call__(self, data): + for f in self.base_transforms_cls: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map sample transform [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + weak_data = deepcopy(data) + strong_data = deepcopy(data) + for f in self.weak_augs_cls: + try: + weak_data = f(weak_data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map weak aug [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + for f in self.strong_augs_cls: + try: + strong_data = f(strong_data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map strong aug [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + weak_data['strong_aug'] = strong_data + return weak_data + + +class BatchCompose_SSOD(Compose): + def __init__(self, transforms, num_classes=80, collate_batch=True): + super(BatchCompose_SSOD, self).__init__(transforms, num_classes) + self.collate_batch = collate_batch + + def __call__(self, data): + # split strong_data from data(weak_data) + strong_data = [] + for sample in data: + strong_data.append(sample['strong_aug']) + sample.pop('strong_aug') + + for f in self.transforms_cls: + try: + data = f(data) + strong_data = f(strong_data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map batch transform [{}] " + "with error: {} and stack:\n{}".format( + f, e, str(stack_info))) + raise e + + # remove keys which is not needed by model + extra_key = ['h', 'w', 'flipped'] + for k in extra_key: + for sample in data: + if k in sample: + sample.pop(k) + for sample in strong_data: + if k in sample: + sample.pop(k) + + # batch data, if user-define batch function needed + # use user-defined here + if self.collate_batch: + batch_data = default_collate_fn(data) + strong_batch_data = default_collate_fn(strong_data) + return batch_data, 
strong_batch_data + else: + batch_data = {} + for k in data[0].keys(): + tmp_data = [] + for i in range(len(data)): + tmp_data.append(data[i][k]) + if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: + tmp_data = np.stack(tmp_data, axis=0) + batch_data[k] = tmp_data + + strong_batch_data = {} + for k in strong_data[0].keys(): + tmp_data = [] + for i in range(len(strong_data)): + tmp_data.append(strong_data[i][k]) + if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: + tmp_data = np.stack(tmp_data, axis=0) + strong_batch_data[k] = tmp_data + + return batch_data, strong_batch_data + + +class CombineSSODLoader(object): + def __init__(self, label_loader, unlabel_loader): + self.label_loader = label_loader + self.unlabel_loader = unlabel_loader + + def __iter__(self): + while True: + try: + label_samples = next(self.label_loader_iter) + except: + self.label_loader_iter = iter(self.label_loader) + label_samples = next(self.label_loader_iter) + + try: + unlabel_samples = next(self.unlabel_loader_iter) + except: + self.unlabel_loader_iter = iter(self.unlabel_loader) + unlabel_samples = next(self.unlabel_loader_iter) + + yield ( + label_samples[0], # sup weak + label_samples[1], # sup strong + unlabel_samples[0], # unsup weak + unlabel_samples[1] # unsup strong + ) + + def __call__(self): + return self.__iter__() + + +class BaseSemiDataLoader(object): + def __init__(self, + sample_transforms=[], + weak_aug=[], + strong_aug=[], + sup_batch_transforms=[], + unsup_batch_transforms=[], + sup_batch_size=1, + unsup_batch_size=1, + shuffle=True, + drop_last=True, + num_classes=80, + collate_batch=True, + use_shared_memory=False, + **kwargs): + # sup transforms + self._sample_transforms_label = Compose_SSOD( + sample_transforms, weak_aug, strong_aug, num_classes=num_classes) + self._batch_transforms_label = BatchCompose_SSOD( + sup_batch_transforms, num_classes, collate_batch) + self.batch_size_label = sup_batch_size + + # unsup transforms + self._sample_transforms_unlabel = Compose_SSOD( + sample_transforms, weak_aug, strong_aug, num_classes=num_classes) + self._batch_transforms_unlabel = BatchCompose_SSOD( + unsup_batch_transforms, num_classes, collate_batch) + self.batch_size_unlabel = unsup_batch_size + + # common + self.shuffle = shuffle + self.drop_last = drop_last + self.use_shared_memory = use_shared_memory + self.kwargs = kwargs + + def __call__(self, + dataset_label, + dataset_unlabel, + worker_num, + batch_sampler_label=None, + batch_sampler_unlabel=None, + return_list=False): + # sup dataset + self.dataset_label = dataset_label + self.dataset_label.check_or_download_dataset() + self.dataset_label.parse_dataset() + self.dataset_label.set_transform(self._sample_transforms_label) + self.dataset_label.set_kwargs(**self.kwargs) + if batch_sampler_label is None: + self._batch_sampler_label = DistributedBatchSampler( + self.dataset_label, + batch_size=self.batch_size_label, + shuffle=self.shuffle, + drop_last=self.drop_last) + else: + self._batch_sampler_label = batch_sampler_label + + # unsup dataset + self.dataset_unlabel = dataset_unlabel + self.dataset_unlabel.length = self.dataset_label.__len__() + self.dataset_unlabel.check_or_download_dataset() + self.dataset_unlabel.parse_dataset() + self.dataset_unlabel.set_transform(self._sample_transforms_unlabel) + self.dataset_unlabel.set_kwargs(**self.kwargs) + if batch_sampler_unlabel is None: + self._batch_sampler_unlabel = DistributedBatchSampler( + self.dataset_unlabel, + batch_size=self.batch_size_unlabel, + 
shuffle=self.shuffle, + drop_last=self.drop_last) + else: + self._batch_sampler_unlabel = batch_sampler_unlabel + + # DataLoader do not start sub-process in Windows and Mac + # system, do not need to use shared memory + use_shared_memory = self.use_shared_memory and \ + sys.platform not in ['win32', 'darwin'] + # check whether shared memory size is bigger than 1G(1024M) + if use_shared_memory: + shm_size = _get_shared_memory_size_in_M() + if shm_size is not None and shm_size < 1024.: + logger.warning("Shared memory size is less than 1G, " + "disable shared_memory in DataLoader") + use_shared_memory = False + + self.dataloader_label = DataLoader( + dataset=self.dataset_label, + batch_sampler=self._batch_sampler_label, + collate_fn=self._batch_transforms_label, + num_workers=worker_num, + return_list=return_list, + use_shared_memory=use_shared_memory) + + self.dataloader_unlabel = DataLoader( + dataset=self.dataset_unlabel, + batch_sampler=self._batch_sampler_unlabel, + collate_fn=self._batch_transforms_unlabel, + num_workers=worker_num, + return_list=return_list, + use_shared_memory=use_shared_memory) + + self.dataloader = CombineSSODLoader(self.dataloader_label, + self.dataloader_unlabel) + self.loader = iter(self.dataloader) + return self + + def __len__(self): + return len(self._batch_sampler_label) + + def __iter__(self): + return self + + def __next__(self): + return next(self.loader) + + def next(self): + # python2 compatibility + return self.__next__() + + +@register +class SemiTrainReader(BaseSemiDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + weak_aug=[], + strong_aug=[], + sup_batch_transforms=[], + unsup_batch_transforms=[], + sup_batch_size=1, + unsup_batch_size=1, + shuffle=True, + drop_last=True, + num_classes=80, + collate_batch=True, + **kwargs): + super(SemiTrainReader, self).__init__( + sample_transforms, weak_aug, strong_aug, sup_batch_transforms, + unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle, + drop_last, num_classes, collate_batch, **kwargs) diff --git a/ppdet/data/shm_utils.py b/ppdet/data/shm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a929a809cec9bc1e6b1dd335faa0ba4f2e44ff87 --- /dev/null +++ b/ppdet/data/shm_utils.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +SIZE_UNIT = ['K', 'M', 'G', 'T'] +SHM_QUERY_CMD = 'df -h' +SHM_KEY = 'shm' +SHM_DEFAULT_MOUNT = '/dev/shm' + +# [ shared memory size check ] +# In detection models, image/target data occupies a lot of memory, and +# will occupy lots of shared memory in multi-process DataLoader, we use +# following code to get shared memory size and perform a size check to +# disable shared memory use if shared memory size is not enough. +# Shared memory getting process as follows: +# 1. use `df -h` get all mount info +# 2. pick up spaces whose mount info contains 'shm' +# 3. if 'shm' space number is only 1, return its size +# 4. 
if there are multiple 'shm' space, try to find the default mount +# directory '/dev/shm' is Linux-like system, otherwise return the +# biggest space size. + + +def _parse_size_in_M(size_str): + if size_str[-1] == 'B': + num, unit = size_str[:-2], size_str[-2] + else: + num, unit = size_str[:-1], size_str[-1] + assert unit in SIZE_UNIT, \ + "unknown shm size unit {}".format(unit) + return float(num) * \ + (1024 ** (SIZE_UNIT.index(unit) - 1)) + + +def _get_shared_memory_size_in_M(): + try: + df_infos = os.popen(SHM_QUERY_CMD).readlines() + except: + return None + else: + shm_infos = [] + for df_info in df_infos: + info = df_info.strip() + if info.find(SHM_KEY) >= 0: + shm_infos.append(info.split()) + + if len(shm_infos) == 0: + return None + elif len(shm_infos) == 1: + return _parse_size_in_M(shm_infos[0][3]) + else: + default_mount_infos = [ + si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT + ] + if default_mount_infos: + return _parse_size_in_M(default_mount_infos[0][3]) + else: + return max([_parse_size_in_M(si[3]) for si in shm_infos]) diff --git a/ppdet/data/source/__init__.py b/ppdet/data/source/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e19a7bcc3b3788ea2762ef337de0bb9159e31b7b --- /dev/null +++ b/ppdet/data/source/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import coco +from . import voc +from . import category + +from .coco import * +from .voc import * +from .category import * +from .dataset import ImageFolder diff --git a/ppdet/data/source/category.py b/ppdet/data/source/category.py new file mode 100644 index 0000000000000000000000000000000000000000..1ed02c1d570bd1104b982388204d07052ba13df5 --- /dev/null +++ b/ppdet/data/source/category.py @@ -0,0 +1,295 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from ppdet.data.source.voc import pascalvoc_label +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['get_categories'] + + +def get_categories(metric_type, anno_file=None, arch=None): + """ + Get class id to category id map and category id + to category name map from annotation file. + + Args: + metric_type (str): metric type, currently support 'coco', 'voc', 'oid' + and 'widerface'. 
+ anno_file (str): annotation file path + """ + if anno_file == None or (not os.path.isfile(anno_file)): + logger.warning( + "anno_file '{}' is None or not set or not exist, " + "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, " + "otherwise the default categories will be used by metric_type.". + format(anno_file)) + + if metric_type.lower() == 'coco': + if anno_file and os.path.isfile(anno_file): + if anno_file.endswith('json'): + # lazy import pycocotools here + from pycocotools.coco import COCO + coco = COCO(anno_file) + cats = coco.loadCats(coco.getCatIds()) + + clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} + catid2name = {cat['id']: cat['name'] for cat in cats} + + elif anno_file.endswith('txt'): + cats = [] + with open(anno_file) as f: + for line in f.readlines(): + cats.append(line.strip()) + if cats[0] == 'background': cats = cats[1:] + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + else: + raise ValueError("anno_file {} should be json or txt.".format( + anno_file)) + return clsid2catid, catid2name + + # anno file not exist, load default categories of COCO17 + else: + logger.warning("metric_type: {}, load default categories of COCO.". + format(metric_type)) + return _coco17_category() + + elif metric_type.lower() == 'voc': + if anno_file and os.path.isfile(anno_file): + cats = [] + with open(anno_file) as f: + for line in f.readlines(): + cats.append(line.strip()) + + if cats[0] == 'background': + cats = cats[1:] + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name + + # anno file not exist, load default categories of + # VOC all 20 categories + else: + logger.warning("metric_type: {}, load default categories of VOC.". 
+ format(metric_type)) + return _vocall_category() + + else: + raise ValueError("unknown metric type {}".format(metric_type)) + + +def _coco17_category(): + """ + Get class id to category id map and category id + to category name map of COCO2017 dataset + + """ + clsid2catid = { + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 12: 13, + 13: 14, + 14: 15, + 15: 16, + 16: 17, + 17: 18, + 18: 19, + 19: 20, + 20: 21, + 21: 22, + 22: 23, + 23: 24, + 24: 25, + 25: 27, + 26: 28, + 27: 31, + 28: 32, + 29: 33, + 30: 34, + 31: 35, + 32: 36, + 33: 37, + 34: 38, + 35: 39, + 36: 40, + 37: 41, + 38: 42, + 39: 43, + 40: 44, + 41: 46, + 42: 47, + 43: 48, + 44: 49, + 45: 50, + 46: 51, + 47: 52, + 48: 53, + 49: 54, + 50: 55, + 51: 56, + 52: 57, + 53: 58, + 54: 59, + 55: 60, + 56: 61, + 57: 62, + 58: 63, + 59: 64, + 60: 65, + 61: 67, + 62: 70, + 63: 72, + 64: 73, + 65: 74, + 66: 75, + 67: 76, + 68: 77, + 69: 78, + 70: 79, + 71: 80, + 72: 81, + 73: 82, + 74: 84, + 75: 85, + 76: 86, + 77: 87, + 78: 88, + 79: 89, + 80: 90 + } + + catid2name = { + 0: 'background', + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', + 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' + } + + clsid2catid = {k - 1: v for k, v in clsid2catid.items()} + catid2name.pop(0) + + return clsid2catid, catid2name + + +def _vocall_category(): + """ + Get class id to category id map and category id + to category name map of mixup voc dataset + + """ + label_map = pascalvoc_label() + label_map = sorted(label_map.items(), key=lambda x: x[1]) + cats = [l[0] for l in label_map] + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name diff --git a/ppdet/data/source/coco.py b/ppdet/data/source/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..330dae6775115bb4401e5adcdc30471b7099f3e8 --- /dev/null +++ b/ppdet/data/source/coco.py @@ -0,0 +1,587 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence +import numpy as np +from ppdet.core.workspace import register, serializable +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet'] + + +@register +@serializable +class COCODataSet(DetDataset): + """ + Load dataset with COCO format. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): coco annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + load_crowd (bool): whether to load crowded ground-truth. + False as default + allow_empty (bool): whether to load empty entry. False as default + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the + records and use all the empty entries. 1. as default + repeat (int): repeat times for dataset, use in benchmark. + """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1): + super(COCODataSet, self).__init__( + dataset_dir, + image_dir, + anno_path, + data_fields, + sample_num, + repeat=repeat) + self.load_image_only = False + self.load_semantic = False + self.load_crowd = load_crowd + self.allow_empty = allow_empty + self.empty_ratio = empty_ratio + + def _sample_empty(self, records, num): + # if empty_ratio is out of [0. ,1.), do not sample the records + if self.empty_ratio < 0. 
or self.empty_ratio >= 1.: + return records + import random + sample_num = min( + int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) + records = random.sample(records, sample_num) + return records + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + empty_records = [] + ct = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + is_empty = False + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + coco_rec = { + 'im_file': im_path, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + ins_anno_ids = coco.getAnnIds( + imgIds=[img_id], iscrowd=None if self.load_crowd else False) + instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + is_rbox_anno = False + for inst in instances: + # check gt bbox + if inst.get('ignore', False): + continue + if 'bbox' not in inst.keys(): + continue + else: + if not any(np.array(inst['bbox'])): + continue + + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h + eps = 1e-5 + if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: + inst['clean_bbox'] = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + bboxes.append(inst) + else: + logger.warning( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + + num_bbox = len(bboxes) + if num_bbox <= 0 and not self.allow_empty: + continue + elif num_bbox <= 0: + is_empty = True + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) + + has_segmentation = False + has_track_id = False + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = self.catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + is_crowd[i][0] = box['iscrowd'] + # check RLE format + if 'segmentation' in box and box['iscrowd'] == 1: + gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + elif 'segmentation' in box and box['segmentation']: + if not np.array( + box['segmentation'], + dtype=object).size > 0 and not self.allow_empty: + bboxes.pop(i) + gt_poly.pop(i) + np.delete(is_crowd, i) + np.delete(gt_class, i) + np.delete(gt_bbox, i) + else: 
+ gt_poly[i] = box['segmentation'] + has_segmentation = True + + if 'track_id' in box: + gt_track_id[i][0] = box['track_id'] + has_track_id = True + + if has_segmentation and not any( + gt_poly) and not self.allow_empty: + continue + + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } + if has_track_id: + gt_rec.update({'gt_track_id': gt_track_id}) + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + # TODO: remove load_semantic + if self.load_semantic and 'semantic' in self.data_fields: + seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', + 'train2017', im_fname[:-3] + 'png') + coco_rec.update({'semantic': seg_path}) + + logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( + im_path, img_id, im_h, im_w)) + if is_empty: + empty_records.append(coco_rec) + else: + records.append(coco_rec) + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any coco record in %s' % (anno_path) + logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. + format(ct, len(img_ids) - ct, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs = records + + +@register +@serializable +class SlicedCOCODataSet(COCODataSet): + """Sliced COCODataSet""" + + def __init__( + self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1, + sliced_size=[640, 640], + overlap_ratio=[0.25, 0.25], ): + super(SlicedCOCODataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num, + load_crowd=load_crowd, + allow_empty=allow_empty, + empty_ratio=empty_ratio, + repeat=repeat, ) + self.sliced_size = sliced_size + self.overlap_ratio = overlap_ratio + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + empty_records = [] + ct = 0 + ct_sub = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + try: + import sahi + from sahi.slicing import slice_image + except Exception as e: + logger.error( + 'sahi not found, plaese install sahi. ' + 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
+ ) + raise e + + sub_img_ids = 0 + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + is_empty = False + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + slice_image_result = sahi.slicing.slice_image( + image=im_path, + slice_height=self.sliced_size[0], + slice_width=self.sliced_size[1], + overlap_height_ratio=self.overlap_ratio[0], + overlap_width_ratio=self.overlap_ratio[1]) + + sub_img_num = len(slice_image_result) + for _ind in range(sub_img_num): + im = slice_image_result.images[_ind] + coco_rec = { + 'image': im, + 'im_id': np.array([sub_img_ids + _ind]), + 'h': im.shape[0], + 'w': im.shape[1], + 'ori_im_id': np.array([img_id]), + 'st_pix': np.array( + slice_image_result.starting_pixels[_ind], + dtype=np.float32), + 'is_last': 1 if _ind == sub_img_num - 1 else 0, + } if 'image' in self.data_fields else {} + records.append(coco_rec) + ct_sub += sub_img_num + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any coco record in %s' % (anno_path) + logger.info('{} samples and slice to {} sub_samples in file {}'.format( + ct, ct_sub, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs = records + + +@register +@serializable +class SemiCOCODataSet(COCODataSet): + """Semi-COCODataSet used for supervised and unsupervised dataSet""" + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1, + supervised=True): + super(SemiCOCODataSet, self).__init__( + dataset_dir, image_dir, anno_path, data_fields, sample_num, + load_crowd, allow_empty, empty_ratio, repeat) + self.supervised = supervised + self.length = -1 # defalut -1 means all + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + empty_records = [] + ct = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset or self.supervised == False: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + is_empty = False + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if 
im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + coco_rec = { + 'im_file': im_path, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + ins_anno_ids = coco.getAnnIds( + imgIds=[img_id], iscrowd=None if self.load_crowd else False) + instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + is_rbox_anno = False + for inst in instances: + # check gt bbox + if inst.get('ignore', False): + continue + if 'bbox' not in inst.keys(): + continue + else: + if not any(np.array(inst['bbox'])): + continue + + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h + eps = 1e-5 + if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: + inst['clean_bbox'] = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + bboxes.append(inst) + else: + logger.warning( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + + num_bbox = len(bboxes) + if num_bbox <= 0 and not self.allow_empty: + continue + elif num_bbox <= 0: + is_empty = True + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + + has_segmentation = False + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = self.catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + is_crowd[i][0] = box['iscrowd'] + # check RLE format + if 'segmentation' in box and box['iscrowd'] == 1: + gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + elif 'segmentation' in box and box['segmentation']: + if not np.array(box['segmentation'] + ).size > 0 and not self.allow_empty: + bboxes.pop(i) + gt_poly.pop(i) + np.delete(is_crowd, i) + np.delete(gt_class, i) + np.delete(gt_bbox, i) + else: + gt_poly[i] = box['segmentation'] + has_segmentation = True + + if has_segmentation and not any( + gt_poly) and not self.allow_empty: + continue + + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + # TODO: remove load_semantic + if self.load_semantic and 'semantic' in self.data_fields: + seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', + 'train2017', im_fname[:-3] + 'png') + coco_rec.update({'semantic': seg_path}) + + logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( + im_path, img_id, im_h, im_w)) + if is_empty: + empty_records.append(coco_rec) + else: + records.append(coco_rec) + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any coco record in %s' % (anno_path) + logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
+ format(ct, len(img_ids) - ct, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs = records + + if self.supervised: + logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED') + else: + if self.length > 0: # unsup length will be decide by sup length + all_roidbs = self.roidbs.copy() + selected_idxs = [ + np.random.choice(len(all_roidbs)) + for _ in range(self.length) + ] + self.roidbs = [all_roidbs[i] for i in selected_idxs] + logger.info( + f'Use {len(self.roidbs)} unsup_samples data as UNLABELED') + + def __getitem__(self, idx): + n = len(self.roidbs) + if self.repeat > 1: + idx %= n + # data batch + roidb = copy.deepcopy(self.roidbs[idx]) + if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: + roidb = [roidb, ] + [ + copy.deepcopy(self.roidbs[np.random.randint(n)]) + for _ in range(4) + ] + if isinstance(roidb, Sequence): + for r in roidb: + r['curr_iter'] = self._curr_iter + else: + roidb['curr_iter'] = self._curr_iter + self._curr_iter += 1 + + return self.transform(roidb) diff --git a/ppdet/data/source/dataset.py b/ppdet/data/source/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4f22b222aa1a99bf1239db5c379cc4bd1a6632e0 --- /dev/null +++ b/ppdet/data/source/dataset.py @@ -0,0 +1,307 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import numpy as np +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence +from paddle.io import Dataset +from ppdet.core.workspace import register, serializable +from ppdet.utils.download import get_dataset_path +from ppdet.data import source + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@serializable +class DetDataset(Dataset): + """ + Load detection dataset. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + use_default_label (bool): whether to load default label list. + repeat (int): repeat times for dataset, use in benchmark. 
+ """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + use_default_label=None, + repeat=1, + **kwargs): + super(DetDataset, self).__init__() + self.dataset_dir = dataset_dir if dataset_dir is not None else '' + self.anno_path = anno_path + self.image_dir = image_dir if image_dir is not None else '' + self.data_fields = data_fields + self.sample_num = sample_num + self.use_default_label = use_default_label + self.repeat = repeat + self._epoch = 0 + self._curr_iter = 0 + + def __len__(self, ): + return len(self.roidbs) * self.repeat + + def __call__(self, *args, **kwargs): + return self + + def __getitem__(self, idx): + n = len(self.roidbs) + if self.repeat > 1: + idx %= n + # data batch + roidb = copy.deepcopy(self.roidbs[idx]) + if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: + roidb = [roidb, ] + [ + copy.deepcopy(self.roidbs[np.random.randint(n)]) + for _ in range(4) + ] + elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch: + # Add previous image as input, only used in CenterTrack + idx_pre_img = idx - 1 + if idx_pre_img < 0: + idx_pre_img = idx + 1 + roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])] + if isinstance(roidb, Sequence): + for r in roidb: + r['curr_iter'] = self._curr_iter + else: + roidb['curr_iter'] = self._curr_iter + self._curr_iter += 1 + + return self.transform(roidb) + + def check_or_download_dataset(self): + self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, + self.image_dir) + + def set_kwargs(self, **kwargs): + self.mixup_epoch = kwargs.get('mixup_epoch', -1) + self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) + self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) + self.pre_img_epoch = kwargs.get('pre_img_epoch', -1) + + def set_transform(self, transform): + self.transform = transform + + def set_epoch(self, epoch_id): + self._epoch = epoch_id + + def parse_dataset(self, ): + raise NotImplementedError( + "Need to implement parse_dataset method of Dataset") + + def get_anno(self): + if self.anno_path is None: + return + return os.path.join(self.dataset_dir, self.anno_path) + + +def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): + return f.lower().endswith(extensions) + + +def _make_dataset(dir): + dir = os.path.expanduser(dir) + if not os.path.isdir(dir): + raise ('{} should be a dir'.format(dir)) + images = [] + for root, _, fnames in sorted(os.walk(dir, followlinks=True)): + for fname in sorted(fnames): + path = os.path.join(root, fname) + if _is_valid_file(path): + images.append(path) + return images + + +@register +@serializable +class ImageFolder(DetDataset): + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + sample_num=-1, + use_default_label=None, + **kwargs): + super(ImageFolder, self).__init__( + dataset_dir, + image_dir, + anno_path, + sample_num=sample_num, + use_default_label=use_default_label) + self._imid2path = {} + self.roidbs = None + self.sample_num = sample_num + + def check_or_download_dataset(self): + return + + def get_anno(self): + if self.anno_path is None: + return + if self.dataset_dir: + return os.path.join(self.dataset_dir, self.anno_path) + else: + return self.anno_path + + def 
parse_dataset(self, ): + if not self.roidbs: + self.roidbs = self._load_images() + + def _parse(self): + image_dir = self.image_dir + if not isinstance(image_dir, Sequence): + image_dir = [image_dir] + images = [] + for im_dir in image_dir: + if os.path.isdir(im_dir): + im_dir = os.path.join(self.dataset_dir, im_dir) + images.extend(_make_dataset(im_dir)) + elif os.path.isfile(im_dir) and _is_valid_file(im_dir): + images.append(im_dir) + return images + + def _load_images(self): + images = self._parse() + ct = 0 + records = [] + for image in images: + assert image != '' and os.path.isfile(image), \ + "Image {} not found".format(image) + if self.sample_num > 0 and ct >= self.sample_num: + break + rec = {'im_id': np.array([ct]), 'im_file': image} + self._imid2path[ct] = image + ct += 1 + records.append(rec) + assert len(records) > 0, "No image file found" + return records + + def get_imid2path(self): + return self._imid2path + + def set_images(self, images): + self.image_dir = images + self.roidbs = self._load_images() + + def set_slice_images(self, + images, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25]): + self.image_dir = images + ori_records = self._load_images() + try: + import sahi + from sahi.slicing import slice_image + except Exception as e: + logger.error( + 'sahi not found, plaese install sahi. ' + 'for example: `pip install sahi`, see https://github.com/obss/sahi.' + ) + raise e + + sub_img_ids = 0 + ct = 0 + ct_sub = 0 + records = [] + for i, ori_rec in enumerate(ori_records): + im_path = ori_rec['im_file'] + slice_image_result = sahi.slicing.slice_image( + image=im_path, + slice_height=slice_size[0], + slice_width=slice_size[1], + overlap_height_ratio=overlap_ratio[0], + overlap_width_ratio=overlap_ratio[1]) + + sub_img_num = len(slice_image_result) + for _ind in range(sub_img_num): + im = slice_image_result.images[_ind] + rec = { + 'image': im, + 'im_id': np.array([sub_img_ids + _ind]), + 'h': im.shape[0], + 'w': im.shape[1], + 'ori_im_id': np.array([ori_rec['im_id'][0]]), + 'st_pix': np.array( + slice_image_result.starting_pixels[_ind], + dtype=np.float32), + 'is_last': 1 if _ind == sub_img_num - 1 else 0, + } if 'image' in self.data_fields else {} + records.append(rec) + ct_sub += sub_img_num + ct += 1 + logger.info('{} samples and slice to {} sub_samples.'.format(ct, + ct_sub)) + self.roidbs = records + + def get_label_list(self): + # Only VOC dataset needs label list in ImageFold + return self.anno_path + + +@register +class CommonDataset(object): + def __init__(self, **dataset_args): + super(CommonDataset, self).__init__() + dataset_args = copy.deepcopy(dataset_args) + type = dataset_args.pop("name") + self.dataset = getattr(source, type)(**dataset_args) + + def __call__(self): + return self.dataset + + +@register +class TrainDataset(CommonDataset): + pass + + +@register +class EvalMOTDataset(CommonDataset): + pass + + +@register +class TestMOTDataset(CommonDataset): + pass + + +@register +class EvalDataset(CommonDataset): + pass + + +@register +class TestDataset(CommonDataset): + pass diff --git a/ppdet/data/source/voc.py b/ppdet/data/source/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..2f103588537c5499ef83133fe3f8d4ba7303e685 --- /dev/null +++ b/ppdet/data/source/voc.py @@ -0,0 +1,234 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np + +import xml.etree.ElementTree as ET + +from ppdet.core.workspace import register, serializable + +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class VOCDataSet(DetDataset): + """ + Load dataset with PascalVOC format. + + Notes: + `anno_path` must contains xml file and image file path for annotations. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): voc annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + label_list (str): if use_default_label is False, will load + mapping between category and class index. + allow_empty (bool): whether to load empty entry. False as default + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the + records and use all the empty entries. 1. as default + repeat (int): repeat times for dataset, use in benchmark. + """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + label_list=None, + allow_empty=False, + empty_ratio=1., + repeat=1): + super(VOCDataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num, + repeat=repeat) + self.label_list = label_list + self.allow_empty = allow_empty + self.empty_ratio = empty_ratio + + def _sample_empty(self, records, num): + # if empty_ratio is out of [0. ,1.), do not sample the records + if self.empty_ratio < 0. or self.empty_ratio >= 1.: + return records + import random + sample_num = min( + int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) + records = random.sample(records, sample_num) + return records + + def parse_dataset(self, ): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + # mapping category name to class id + # first_class:0, second_class:1, ... 
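+        # Expected on-disk layout (as implied by the parsing code below): the
+        # optional label_list file holds one class name per line, e.g.
+        #     aeroplane
+        #     bicycle
+        # and each line of anno_path holds an image path and an xml path
+        # separated by whitespace, both resolved relative to image_dir.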
+ records = [] + empty_records = [] + ct = 0 + cname2cid = {} + if self.label_list: + label_path = os.path.join(self.dataset_dir, self.label_list) + if not os.path.exists(label_path): + raise ValueError("label_list {} does not exists".format( + label_path)) + with open(label_path, 'r') as fr: + label_id = 0 + for line in fr.readlines(): + cname2cid[line.strip()] = label_id + label_id += 1 + else: + cname2cid = pascalvoc_label() + + with open(anno_path, 'r') as fr: + while True: + line = fr.readline() + if not line: + break + img_file, xml_file = [os.path.join(image_dir, x) \ + for x in line.strip().split()[:2]] + if not os.path.exists(img_file): + logger.warning( + 'Illegal image file: {}, and it will be ignored'.format( + img_file)) + continue + if not os.path.isfile(xml_file): + logger.warning( + 'Illegal xml file: {}, and it will be ignored'.format( + xml_file)) + continue + tree = ET.parse(xml_file) + if tree.find('id') is None: + im_id = np.array([ct]) + else: + im_id = np.array([int(tree.find('id').text)]) + + objs = tree.findall('object') + im_w = float(tree.find('size').find('width').text) + im_h = float(tree.find('size').find('height').text) + if im_w < 0 or im_h < 0: + logger.warning( + 'Illegal width: {} or height: {} in annotation, ' + 'and {} will be ignored'.format(im_w, im_h, xml_file)) + continue + + num_bbox, i = len(objs), 0 + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + gt_score = np.zeros((num_bbox, 1), dtype=np.float32) + difficult = np.zeros((num_bbox, 1), dtype=np.int32) + for obj in objs: + cname = obj.find('name').text + + # user dataset may not contain difficult field + _difficult = obj.find('difficult') + _difficult = int( + _difficult.text) if _difficult is not None else 0 + + x1 = float(obj.find('bndbox').find('xmin').text) + y1 = float(obj.find('bndbox').find('ymin').text) + x2 = float(obj.find('bndbox').find('xmax').text) + y2 = float(obj.find('bndbox').find('ymax').text) + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(im_w - 1, x2) + y2 = min(im_h - 1, y2) + if x2 > x1 and y2 > y1: + gt_bbox[i, :] = [x1, y1, x2, y2] + gt_class[i, 0] = cname2cid[cname] + gt_score[i, 0] = 1. 
+ difficult[i, 0] = _difficult + i += 1 + else: + logger.warning( + 'Found an invalid bbox in annotations: xml_file: {}' + ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( + xml_file, x1, y1, x2, y2)) + gt_bbox = gt_bbox[:i, :] + gt_class = gt_class[:i, :] + gt_score = gt_score[:i, :] + difficult = difficult[:i, :] + + voc_rec = { + 'im_file': img_file, + 'im_id': im_id, + 'h': im_h, + 'w': im_w + } if 'image' in self.data_fields else {} + + gt_rec = { + 'gt_class': gt_class, + 'gt_score': gt_score, + 'gt_bbox': gt_bbox, + 'difficult': difficult + } + for k, v in gt_rec.items(): + if k in self.data_fields: + voc_rec[k] = v + + if len(objs) == 0: + empty_records.append(voc_rec) + else: + records.append(voc_rec) + + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any voc record in %s' % (self.anno_path) + logger.debug('{} samples in file {}'.format(ct, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs, self.cname2cid = records, cname2cid + + def get_label_list(self): + return os.path.join(self.dataset_dir, self.label_list) + + +def pascalvoc_label(): + labels_map = { + 'aeroplane': 0, + 'bicycle': 1, + 'bird': 2, + 'boat': 3, + 'bottle': 4, + 'bus': 5, + 'car': 6, + 'cat': 7, + 'chair': 8, + 'cow': 9, + 'diningtable': 10, + 'dog': 11, + 'horse': 12, + 'motorbike': 13, + 'person': 14, + 'pottedplant': 15, + 'sheep': 16, + 'sofa': 17, + 'train': 18, + 'tvmonitor': 19 + } + return labels_map diff --git a/ppdet/data/transform/__init__.py b/ppdet/data/transform/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5deb535a64394d4614137b072cf677a6b6010ca --- /dev/null +++ b/ppdet/data/transform/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import operators +from . import batch_operators + +from .operators import * +from .batch_operators import * + +__all__ = [] +__all__ += registered_ops diff --git a/ppdet/data/transform/autoaugment_utils.py b/ppdet/data/transform/autoaugment_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa89d374d94260c881566c12ef6a6afd5e823b9 --- /dev/null +++ b/ppdet/data/transform/autoaugment_utils.py @@ -0,0 +1,1586 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Reference: +# https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py +"""AutoAugment util file.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import inspect +import math +from PIL import Image, ImageEnhance +import numpy as np +import cv2 +from copy import deepcopy + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +# Represents an invalid bounding box that is used for checking for padding +# lists of bounding box coordinates for a few augmentation operations +_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] + + +def policy_v0(): + """Autoaugment policy that was used in AutoAugment Detection Paper.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], + [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], + [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], + [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], + ] + return policy + + +def policy_v1(): + """Autoaugment policy that was used in AutoAugment Detection Paper.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], + [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], + [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], + [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], + [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], + [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], + [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], + [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], + [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , + [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], + [('Color', 1.0, 6), ('Equalize', 1.0, 2)], + [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], + [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], + [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], + [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], + [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], + [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], + [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], + [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], + ] + return policy + + +def policy_vtest(): + """Autoaugment test policy for debugging.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] + return policy + + +def policy_v2(): + """Additional policy that performs well on object detection.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. 
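+    # For instance, the first sub-policy below, [('Color', 0.0, 6),
+    # ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], applies Color with
+    # probability 0.0 at magnitude 6, then Cutout with probability 0.6 at
+    # magnitude 8, then Sharpness with probability 0.4 at magnitude 8,
+    # in that order on the same image.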
+ policy = [ + [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], + [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), + ('Rotate_BBox', 0.8, 10)], + [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], + [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), + ('Brightness', 0.0, 10)], + [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), + ('AutoContrast', 0.6, 0)], + [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], + [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), + ('Solarize', 0.0, 10)], + [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), + ('Rotate_BBox', 0.8, 8)], + [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], + [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), + ('Rotate_BBox', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], + [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), + ('ShearY_BBox', 0.6, 8)], + [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), + ('Brightness', 0.2, 2)], + [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), + ('SolarizeAdd', 0.2, 10)], + [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], + ] + return policy + + +def policy_v3(): + """"Additional policy that performs well on object detection.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], + [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], + [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], + [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], + [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], + [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], + [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], + [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], + [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], + [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], + [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], + [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], + [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], + [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], + [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], + ] + return policy + + +def _equal(val1, val2, eps=1e-8): + return abs(val1 - val2) <= eps + + +def blend(image1, image2, factor): + """Blend image1 and image2 using 'factor'. + + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 255. + + Args: + image1: An image Tensor of type uint8. + image2: An image Tensor of type uint8. + factor: A floating point value above 0.0. + + Returns: + A blended image Tensor of type uint8. + """ + if factor == 0.0: + return image1 + if factor == 1.0: + return image2 + + image1 = image1.astype(np.float32) + image2 = image2.astype(np.float32) + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = image1 + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return temp.astype(np.uint8) + + # Extrapolate: + # + # We need to clip and then cast. 
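+    # For example, factor=1.5 moves the result 50% beyond image2 along the
+    # image1 -> image2 direction, which can produce values outside [0, 255],
+    # hence the clip before casting back to uint8.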
+ return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) + + +def cutout(image, pad_size, replace=0): + """Apply cutout (https://arxiv.org/abs/1708.04552) to image. + + This operation applies a (2*pad_size x 2*pad_size) mask of zeros to + a random location within `img`. The pixel values filled in will be of the + value `replace`. The located where the mask will be applied is randomly + chosen uniformly over the whole image. + + Args: + image: An image Tensor of type uint8. + pad_size: Specifies how big the zero mask that will be generated is that + is applied to the image. The mask will be of size + (2*pad_size x 2*pad_size). + replace: What pixel value to fill in the image in the area that has + the cutout mask applied to it. + + Returns: + An image Tensor that is of type uint8. + Example: + img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) + new_img = cutout(img, pad_size=50, replace=0) + """ + image_height, image_width = image.shape[0], image.shape[1] + + cutout_center_height = np.random.randint(low=0, high=image_height) + cutout_center_width = np.random.randint(low=0, high=image_width) + + lower_pad = np.maximum(0, cutout_center_height - pad_size) + upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) + left_pad = np.maximum(0, cutout_center_width - pad_size) + right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + mask = np.pad(np.zeros( + cutout_shape, dtype=image.dtype), + padding_dims, + 'constant', + constant_values=1) + mask = np.expand_dims(mask, -1) + mask = np.tile(mask, [1, 1, 3]) + image = np.where( + np.equal(mask, 0), + np.ones_like( + image, dtype=image.dtype) * replace, + image) + return image.astype(np.uint8) + + +def solarize(image, threshold=128): + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return np.where(image < threshold, image, 255 - image) + + +def solarize_add(image, addition=0, threshold=128): + # For each pixel in the image less than threshold + # we add 'addition' amount to it and then clip the + # pixel value to be between 0 and 255. The value + # of 'addition' is between -128 and 128. + added_image = image.astype(np.int64) + addition + added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) + return np.where(image < threshold, added_image, image) + + +def color(image, factor): + """use cv2 to deal""" + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) + return blend(degenerate, image, factor) + + +# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 +def contrast(img, factor): + img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) + return np.array(img) + + +def brightness(image, factor): + """Equivalent of PIL Brightness.""" + degenerate = np.zeros_like(image) + return blend(degenerate, image, factor) + + +def posterize(image, bits): + """Equivalent of PIL Posterize.""" + shift = 8 - bits + return np.left_shift(np.right_shift(image, shift), shift) + + +def rotate(image, degrees, replace): + """Rotates the image by degrees either clockwise or counterclockwise. + + Args: + image: An image Tensor of type uint8. 
+ degrees: Float, a scalar angle in degrees to rotate all images by. If + degrees is positive the image will be rotated clockwise otherwise it will + be rotated counterclockwise. + replace: A one or three value 1D tensor to fill empty pixels caused by + the rotate operation. + + Returns: + The rotated version of image. + """ + image = wrap(image) + image = Image.fromarray(image) + image = image.rotate(degrees) + image = np.array(image, dtype=np.uint8) + return unwrap(image, replace) + + +def random_shift_bbox(image, + bbox, + pixel_scaling, + replace, + new_min_bbox_coords=None): + """Move the bbox and the image content to a slightly new random location. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + The potential values for the new min corner of the bbox will be between + [old_min - pixel_scaling * bbox_height/2, + old_min - pixel_scaling * bbox_height/2]. + pixel_scaling: A float between 0 and 1 that specifies the pixel range + that the new bbox location will be sampled from. + replace: A one or three value 1D tensor to fill empty pixels. + new_min_bbox_coords: If not None, then this is a tuple that specifies the + (min_y, min_x) coordinates of the new bbox. Normally this is randomly + specified, but this allows it to be manually set. The coordinates are + the absolute coordinates between 0 and image height/width and are int32. + + Returns: + The new image that will have the shifted bbox location in it along with + the new bbox that contains the new coordinates. + """ + # Obtains image height and width and create helper clip functions. + image_height, image_width = image.shape[0], image.shape[1] + image_height = float(image_height) + image_width = float(image_width) + + def clip_y(val): + return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) + + def clip_x(val): + return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) + + # Convert bbox to pixel coordinates. + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = clip_y(image_height * bbox[2]) + max_x = clip_x(image_width * bbox[3]) + + bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) + image_height = int(image_height) + image_width = int(image_width) + + # Select the new min/max bbox ranges that are used for sampling the + # new min x/y coordinates of the shifted bbox. + minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / + 2.0)) + maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / + 2.0)) + minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) + maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) + + # Sample and calculate the new unclipped min/max coordinates of the new bbox. + if new_min_bbox_coords is None: + unclipped_new_min_y = np.random.randint( + low=minval_y, high=maxval_y, dtype=np.int32) + unclipped_new_min_x = np.random.randint( + low=minval_x, high=maxval_x, dtype=np.int32) + else: + unclipped_new_min_y, unclipped_new_min_x = ( + clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) + unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 + unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 + + # Determine if any of the new bbox was shifted outside the current image. + # This is used for determining if any of the original bbox content should be + # discarded. 
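+    # Clipping the shifted corners to the image bounds below also tells us how
+    # much of the box fell off the edge; that clipped amount is folded back
+    # into the original corner coordinates so only the still-visible portion
+    # of the bbox content is copied.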
+ new_min_y, new_min_x, new_max_y, new_max_x = ( + clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), + clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) + shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y + shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) + shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x + shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) + + # Create the new bbox tensor by converting pixel integer values to floats. + new_bbox = np.stack([ + float(new_min_y) / float(image_height), float(new_min_x) / + float(image_width), float(new_max_y) / float(image_height), + float(new_max_x) / float(image_width) + ]) + + # Copy the contents in the bbox and fill the old bbox location + # with gray (128). + bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: + shifted_max_x + 1, :] + + def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, + image_): + """Applies mask to bbox region in image then adds content_tensor to it.""" + mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], + [min_x_, (image_width - 1) - max_x_], [0, 0]], + 'constant', + constant_values=1) + + content_tensor = np.pad(content_tensor, + [[min_y_, (image_height - 1) - max_y_], + [min_x_, (image_width - 1) - max_x_], [0, 0]], + 'constant', + constant_values=0) + return image_ * mask + content_tensor + + # Zero out original bbox location. + mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] + grey_tensor = np.zeros_like(mask) + replace[0] + image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, + image) + + # Fill in bbox content to new bbox location. + mask = np.zeros_like(bbox_content) + image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, + bbox_content, image) + + return image.astype(np.uint8), new_bbox + + +def _clip_bbox(min_y, min_x, max_y, max_x): + """Clip bounding box coordinates between 0 and 1. + + Args: + min_y: Normalized bbox coordinate of type float between 0 and 1. + min_x: Normalized bbox coordinate of type float between 0 and 1. + max_y: Normalized bbox coordinate of type float between 0 and 1. + max_x: Normalized bbox coordinate of type float between 0 and 1. + + Returns: + Clipped coordinate values between 0 and 1. + """ + min_y = np.clip(min_y, a_min=0, a_max=1.0) + min_x = np.clip(min_x, a_min=0, a_max=1.0) + max_y = np.clip(max_y, a_min=0, a_max=1.0) + max_x = np.clip(max_x, a_min=0, a_max=1.0) + return min_y, min_x, max_y, max_x + + +def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): + """Adjusts bbox coordinates to make sure the area is > 0. + + Args: + min_y: Normalized bbox coordinate of type float between 0 and 1. + min_x: Normalized bbox coordinate of type float between 0 and 1. + max_y: Normalized bbox coordinate of type float between 0 and 1. + max_x: Normalized bbox coordinate of type float between 0 and 1. + delta: Float, this is used to create a gap of size 2 * delta between + bbox min/max coordinates that are the same on the boundary. + This prevents the bbox from having an area of zero. + + Returns: + Tuple of new bbox coordinates between 0 and 1 that will now have a + guaranteed area > 0. + """ + height = max_y - min_y + width = max_x - min_x + + def _adjust_bbox_boundaries(min_coord, max_coord): + # Make sure max is never 0 and min is never 1. 
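+        # e.g. with the default delta=0.05, a degenerate box whose min and max
+        # coordinates are both 0.0 becomes (0.0, 0.05), restoring a small
+        # positive extent.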
+ max_coord = np.maximum(max_coord, 0.0 + delta) + min_coord = np.minimum(min_coord, 1.0 - delta) + return min_coord, max_coord + + if _equal(height, 0): + min_y, max_y = _adjust_bbox_boundaries(min_y, max_y) + + if _equal(width, 0): + min_x, max_x = _adjust_bbox_boundaries(min_x, max_x) + + return min_y, min_x, max_y, max_x + + +def _scale_bbox_only_op_probability(prob): + """Reduce the probability of the bbox-only operation. + + Probability is reduced so that we do not distort the content of too many + bounding boxes that are close to each other. The value of 3.0 was a chosen + hyper parameter when designing the autoaugment algorithm that we found + empirically to work well. + + Args: + prob: Float that is the probability of applying the bbox-only operation. + + Returns: + Reduced probability. + """ + return prob / 3.0 + + +def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): + """Applies augmentation_func to the subsection of image indicated by bbox. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + augmentation_func: Augmentation function that will be applied to the + subsection of image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A modified version of image, where the bbox location in the image will + have `ugmentation_func applied to it. + """ + image_height = image.shape[0] + image_width = image.shape[1] + + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = int(image_height * bbox[2]) + max_x = int(image_width * bbox[3]) + + # Clip to be sure the max values do not fall out of range. + max_y = np.minimum(max_y, image_height - 1) + max_x = np.minimum(max_x, image_width - 1) + + # Get the sub-tensor that is the image within the bounding box region. + bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :] + + # Apply the augmentation function to the bbox portion of the image. + augmented_bbox_content = augmentation_func(bbox_content, *args) + + # Pad the augmented_bbox_content and the mask to match the shape of original + # image. + augmented_bbox_content = np.pad( + augmented_bbox_content, [[min_y, (image_height - 1) - max_y], + [min_x, (image_width - 1) - max_x], [0, 0]], + 'constant', + constant_values=1) + + # Create a mask that will be used to zero out a part of the original image. + mask_tensor = np.zeros_like(bbox_content) + + mask_tensor = np.pad(mask_tensor, + [[min_y, (image_height - 1) - max_y], + [min_x, (image_width - 1) - max_x], [0, 0]], + 'constant', + constant_values=1) + # Replace the old bbox content with the new augmented content. + image = image * mask_tensor + augmented_bbox_content + return image.astype(np.uint8) + + +def _concat_bbox(bbox, bboxes): + """Helper function that concates bbox to bboxes along the first dimension.""" + + # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means + # we discard bboxes and start the bboxes Tensor with the current bbox. + bboxes_sum_check = np.sum(bboxes) + bbox = np.expand_dims(bbox, 0) + # This check will be true when it is an _INVALID_BOX + if _equal(bboxes_sum_check, -4): + bboxes = bbox + else: + bboxes = np.concatenate([bboxes, bbox], 0) + return bboxes + + +def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob, + augmentation_func, func_changes_bbox, + *args): + """Applies _apply_bbox_augmentation with probability prob. 
+ + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + new_bboxes: 2D Tensor that is a list of the bboxes in the image after they + have been altered by aug_func. These will only be changed when + func_changes_bbox is set to true. Each bbox has 4 elements + (min_y, min_x, max_y, max_x) of type float that are the normalized + bbox coordinates between 0 and 1. + prob: Float that is the probability of applying _apply_bbox_augmentation. + augmentation_func: Augmentation function that will be applied to the + subsection of image. + func_changes_bbox: Boolean. Does augmentation_func return bbox in addition + to image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A tuple. Fist element is a modified version of image, where the bbox + location in the image will have augmentation_func applied to it if it is + chosen to be called with probability `prob`. The second element is a + Tensor of Tensors of length 4 that will contain the altered bbox after + applying augmentation_func. + """ + should_apply_op = (np.random.rand() + prob >= 1) + if func_changes_bbox: + if should_apply_op: + augmented_image, bbox = augmentation_func(image, bbox, *args) + else: + augmented_image, bbox = (image, bbox) + else: + if should_apply_op: + augmented_image = _apply_bbox_augmentation(image, bbox, + augmentation_func, *args) + else: + augmented_image = image + new_bboxes = _concat_bbox(bbox, new_bboxes) + return augmented_image.astype(np.uint8), new_bboxes + + +def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, + func_changes_bbox, *args): + """Applies aug_func to the image for each bbox in bboxes. + + Args: + image: 3D uint8 Tensor. + bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float. + prob: Float that is the probability of applying aug_func to a specific + bounding box within the image. + aug_func: Augmentation function that will be applied to the + subsections of image indicated by the bbox values in bboxes. + func_changes_bbox: Boolean. Does augmentation_func return bbox in addition + to image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A modified version of image, where each bbox location in the image will + have augmentation_func applied to it if it is chosen to be called with + probability prob independently across all bboxes. Also the final + bboxes are returned that will be unchanged if func_changes_bbox is set to + false and if true, the new altered ones will be returned. + """ + # Will keep track of the new altered bboxes after aug_func is repeatedly + # applied. The -1 values are a dummy value and this first Tensor will be + # removed upon appending the first real bbox. + new_bboxes = np.array(_INVALID_BOX) + + # If the bboxes are empty, then just give it _INVALID_BOX. The result + # will be thrown away. + bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes + + assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!" + + # pylint:disable=g-long-lambda + # pylint:disable=line-too-long + wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args) + # pylint:enable=g-long-lambda + # pylint:enable=line-too-long + + # Setup the while_loop. 
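+    # The manual loop below stands in for tf.while_loop in the original TF
+    # implementation: `cond` stops once every bbox has been visited and `body`
+    # applies the wrapped augmentation to one bbox at a time, threading the
+    # running (image, new_bboxes) pair through each iteration.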
+ num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. + idx = 0 # Counter for the while loop. + + # Conditional function when to end the loop once we go over all bboxes + # images_and_bboxes contain (_image, _new_bboxes) + def cond(_idx, _images_and_bboxes): + return _idx < num_bboxes + + # Shuffle the bboxes so that the augmentation order is not deterministic if + # we are not changing the bboxes with aug_func. + # if not func_changes_bbox: + # print(bboxes) + # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) + # print(loop_bboxes) + # else: + # loop_bboxes = bboxes + # we can not shuffle the bbox because it does not contain class information here + loop_bboxes = deepcopy(bboxes) + + # Main function of while_loop where we repeatedly apply augmentation on the + # bboxes in the image. + # pylint:disable=g-long-lambda + body = lambda _idx, _images_and_bboxes: [ + _idx + 1, wrapped_aug_func(_images_and_bboxes[0], + loop_bboxes[_idx], + _images_and_bboxes[1])] + while (cond(idx, (image, new_bboxes))): + idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) + + # Either return the altered bboxes or the original ones depending on if + # we altered them in anyway. + if func_changes_bbox: + final_bboxes = new_bboxes + else: + final_bboxes = bboxes + return image, final_bboxes + + +def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, + func_changes_bbox, *args): + """Checks to be sure num bboxes > 0 before calling inner function.""" + num_bboxes = len(bboxes) + new_image = deepcopy(image) + new_bboxes = deepcopy(bboxes) + if num_bboxes != 0: + new_image, new_bboxes = _apply_multi_bbox_augmentation( + new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) + return new_image, new_bboxes + + +def rotate_only_bboxes(image, bboxes, prob, degrees, replace): + """Apply rotate to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) + + +def shear_x_only_bboxes(image, bboxes, prob, level, replace): + """Apply shear_x to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, shear_x, func_changes_bbox, level, replace) + + +def shear_y_only_bboxes(image, bboxes, prob, level, replace): + """Apply shear_y to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, shear_y, func_changes_bbox, level, replace) + + +def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): + """Apply translate_x to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) + + +def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): + """Apply translate_y to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) + + +def flip_only_bboxes(image, bboxes, prob): + """Apply flip_lr to each bbox in the image 
with probability prob."""
+    func_changes_bbox = False
+    prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
+                                                  np.fliplr, func_changes_bbox)
+
+
+def solarize_only_bboxes(image, bboxes, prob, threshold):
+    """Apply solarize to each bbox in the image with probability prob."""
+    func_changes_bbox = False
+    prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize,
+                                                  func_changes_bbox, threshold)
+
+
+def equalize_only_bboxes(image, bboxes, prob):
+    """Apply equalize to each bbox in the image with probability prob."""
+    func_changes_bbox = False
+    prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize,
+                                                  func_changes_bbox)
+
+
+def cutout_only_bboxes(image, bboxes, prob, pad_size, replace):
+    """Apply cutout to each bbox in the image with probability prob."""
+    func_changes_bbox = False
+    prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace)
+
+
+def _rotate_bbox(bbox, image_height, image_width, degrees):
+    """Rotates the bbox coordinates by degrees.
+
+    Args:
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        image_height: Int, height of the image.
+        image_width: Int, width of the image.
+        degrees: Float, a scalar angle in degrees to rotate all images by. If
+            degrees is positive the image will be rotated clockwise otherwise it will
+            be rotated counterclockwise.
+
+    Returns:
+        A tensor of the same shape as bbox, but now with the rotated coordinates.
+    """
+    image_height, image_width = (float(image_height), float(image_width))
+
+    # Convert from degrees to radians.
+    degrees_to_radians = math.pi / 180.0
+    radians = degrees * degrees_to_radians
+
+    # Translate the bbox to the center of the image and turn the normalized 0-1
+    # coordinates to absolute pixel locations.
+    # Y coordinates are made negative as the y axis of images goes down with
+    # increasing pixel values, so we negate to make sure the x and y axes point
+    # in the traditionally positive direction.
+    min_y = -int(image_height * (bbox[0] - 0.5))
+    min_x = int(image_width * (bbox[1] - 0.5))
+    max_y = -int(image_height * (bbox[2] - 0.5))
+    max_x = int(image_width * (bbox[3] - 0.5))
+    coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
+                            [max_y, max_x]]).astype(np.float32)
+    # Rotate the coordinates according to the rotation matrix, clockwise if
+    # radians is positive, else counterclockwise.
+    rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)],
+                                [-math.sin(radians), math.cos(radians)]])
+    new_coords = np.matmul(rotation_matrix,
+                           np.transpose(coordinates)).astype(np.int32)
+
+    # Find min/max values and convert them back to normalized 0-1 floats.
+    min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5)
+    min_x = float(np.min(new_coords[1, :])) / image_width + 0.5
+    max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5)
+    max_x = float(np.max(new_coords[1, :])) / image_width + 0.5
+
+    # Clip the bboxes to be sure they fall between [0, 1].
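+    # (_clip_bbox keeps the coordinates inside [0, 1]; _check_bbox_area then
+    # enforces a small minimum box size so a fully clipped box does not end up
+    # with zero area.)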
+ min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def rotate_with_bboxes(image, bboxes, degrees, replace): + # Rotate the image. + image = rotate(image, degrees, replace) + + # Convert bbox coordinates to pixel values. + image_height, image_width = image.shape[:2] + # pylint:disable=g-long-lambda + wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees) + # pylint:enable=g-long-lambda + new_bboxes = np.zeros_like(bboxes) + for idx in range(len(bboxes)): + new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx]) + return image, new_bboxes + + +def translate_x(image, pixels, replace): + """Equivalent of PIL Translate in X dimension.""" + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0)) + return unwrap(np.array(image), replace) + + +def translate_y(image, pixels, replace): + """Equivalent of PIL Translate in Y dimension.""" + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels)) + return unwrap(np.array(image), replace) + + +def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): + """Shifts the bbox coordinates by pixels. + + Args: + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + image_height: Int, height of the image. + image_width: Int, width of the image. + pixels: An int. How many pixels to shift the bbox. + shift_horizontal: Boolean. If true then shift in X dimension else shift in + Y dimension. + + Returns: + A tensor of the same shape as bbox, but now with the shifted coordinates. + """ + pixels = int(pixels) + # Convert bbox to integer pixel locations. + min_y = int(float(image_height) * bbox[0]) + min_x = int(float(image_width) * bbox[1]) + max_y = int(float(image_height) * bbox[2]) + max_x = int(float(image_width) * bbox[3]) + + if shift_horizontal: + min_x = np.maximum(0, min_x - pixels) + max_x = np.minimum(image_width, max_x - pixels) + else: + min_y = np.maximum(0, min_y - pixels) + max_y = np.minimum(image_height, max_y - pixels) + + # Convert bbox back to floats. + min_y = float(min_y) / float(image_height) + min_x = float(min_x) / float(image_width) + max_y = float(max_y) / float(image_height) + max_x = float(max_x) / float(image_width) + + # Clip the bboxes to be sure the fall between [0, 1]. + min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): + """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. + + Args: + image: 3D uint8 Tensor. + bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float with values + between [0, 1]. + pixels: An int. How many pixels to shift the image and bboxes + replace: A one or three value 1D tensor to fill empty pixels. + shift_horizontal: Boolean. If true then shift in X dimension else shift in + Y dimension. + + Returns: + A tuple containing a 3D uint8 Tensor that will be the result of translating + image by pixels. 
The second element of the tuple is bboxes, where now + the coordinates will be shifted to reflect the shifted image. + """ + if shift_horizontal: + image = translate_x(image, pixels, replace) + else: + image = translate_y(image, pixels, replace) + + # Convert bbox coordinates to pixel values. + image_height, image_width = image.shape[0], image.shape[1] + # pylint:disable=g-long-lambda + wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal) + # pylint:enable=g-long-lambda + new_bboxes = deepcopy(bboxes) + num_bboxes = len(bboxes) + for idx in range(num_bboxes): + new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx]) + return image.astype(np.uint8), new_bboxes + + +def shear_x(image, level, replace): + """Equivalent of PIL Shearing in X dimension.""" + # Shear parallel to x axis is a projective transform + # with a matrix form of: + # [1 level + # 0 1]. + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0)) + return unwrap(np.array(image), replace) + + +def shear_y(image, level, replace): + """Equivalent of PIL Shearing in Y dimension.""" + # Shear parallel to y axis is a projective transform + # with a matrix form of: + # [1 0 + # level 1]. + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0)) + return unwrap(np.array(image), replace) + + +def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): + """Shifts the bbox according to how the image was sheared. + + Args: + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + image_height: Int, height of the image. + image_width: Int, height of the image. + level: Float. How much to shear the image. + shear_horizontal: If true then shear in X dimension else shear in + the Y dimension. + + Returns: + A tensor of the same shape as bbox, but now with the shifted coordinates. + """ + image_height, image_width = (float(image_height), float(image_width)) + + # Change bbox coordinates to be pixels. + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = int(image_height * bbox[2]) + max_x = int(image_width * bbox[3]) + coordinates = np.stack( + [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) + coordinates = coordinates.astype(np.float32) + + # Shear the coordinates according to the translation matrix. + if shear_horizontal: + translation_matrix = np.stack([[1, 0], [-level, 1]]) + else: + translation_matrix = np.stack([[1, -level], [0, 1]]) + translation_matrix = translation_matrix.astype(np.float32) + new_coords = np.matmul(translation_matrix, + np.transpose(coordinates)).astype(np.int32) + + # Find min/max values and convert them back to floats. + min_y = float(np.min(new_coords[0, :])) / image_height + min_x = float(np.min(new_coords[1, :])) / image_width + max_y = float(np.max(new_coords[0, :])) / image_height + max_x = float(np.max(new_coords[1, :])) / image_width + + # Clip the bboxes to be sure the fall between [0, 1]. + min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): + """Applies Shear Transformation to the image and shifts the bboxes. + + Args: + image: 3D uint8 Tensor. 
+    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+        has 4 elements (min_y, min_x, max_y, max_x) of type float with values
+        between [0, 1].
+    level: Float. How much to shear the image. This value will be between
+        -0.3 and 0.3.
+    replace: A one or three value 1D tensor to fill empty pixels.
+    shear_horizontal: Boolean. If true then shear in X dimension else shear in
+        the Y dimension.
+
+    Returns:
+        A tuple containing a 3D uint8 Tensor that will be the result of shearing
+        image by level. The second element of the tuple is bboxes, where now
+        the coordinates will be shifted to reflect the sheared image.
+    """
+    if shear_horizontal:
+        image = shear_x(image, level, replace)
+    else:
+        image = shear_y(image, level, replace)
+
+    # Convert bbox coordinates to pixel values.
+    image_height, image_width = image.shape[:2]
+    # pylint:disable=g-long-lambda
+    wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal)
+    # pylint:enable=g-long-lambda
+    new_bboxes = deepcopy(bboxes)
+    num_bboxes = len(bboxes)
+    for idx in range(num_bboxes):
+        new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx])
+    return image.astype(np.uint8), new_bboxes
+
+
+def autocontrast(image):
+    """Implements Autocontrast function from PIL.
+
+    Args:
+        image: A 3D uint8 tensor.
+
+    Returns:
+        The image after it has had autocontrast applied to it and will be of type
+        uint8.
+    """
+
+    def scale_channel(image):
+        """Scale the 2D image using the autocontrast rule."""
+        # A possibly cheaper version can be done using cumsum/unique_with_counts
+        # over the histogram values, rather than iterating over the entire image
+        # to compute mins and maxes.
+        lo = float(np.min(image))
+        hi = float(np.max(image))
+
+        # Scale the image, making the lowest value 0 and the highest value 255.
+        def scale_values(im):
+            scale = 255.0 / (hi - lo)
+            offset = -lo * scale
+            im = im.astype(np.float32) * scale + offset
+            # Clip before the uint8 cast so out-of-range values do not wrap around.
+            im = np.clip(im, a_min=0, a_max=255.0)
+            return im.astype(np.uint8)
+
+        result = scale_values(image) if hi > lo else image
+        return result
+
+    # Assumes RGB for now. Scales each channel independently
+    # and then stacks the result.
+    s1 = scale_channel(image[:, :, 0])
+    s2 = scale_channel(image[:, :, 1])
+    s3 = scale_channel(image[:, :, 2])
+    image = np.stack([s1, s2, s3], 2)
+    return image
+
+
+def sharpness(image, factor):
+    """Implements Sharpness function from PIL."""
+    orig_image = image
+    image = image.astype(np.float32)
+    # Smooth the image with the PIL SMOOTH kernel.
+    kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.
+    result = cv2.filter2D(image, -1, kernel).astype(np.uint8)
+
+    # Blend the final result.
+    return blend(result, orig_image, factor)
+
+
+def equalize(image):
+    """Implements Equalize function from PIL."""
+
+    def scale_channel(im, c):
+        """Scale the data in the channel to implement equalize."""
+        im = im[:, :, c].astype(np.int32)
+        # Compute the histogram of the image channel.
+        histo, _ = np.histogram(im, range=[0, 255], bins=256)
+
+        # For the purposes of computing the step, filter out the zeros.
+        nonzero = np.where(np.not_equal(histo, 0))
+        nonzero_histo = np.reshape(np.take(histo, nonzero), [-1])
+        step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
+
+        def build_lut(histo, step):
+            # Compute the cumulative sum, shifting by step // 2
+            # and then normalizing by step.
+            lut = (np.cumsum(histo) + (step // 2)) // step
+            # Shift lut, prepending with 0.
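+            # (Same construction as PIL's ImageOps.equalize: quantize the
+            # cumulative histogram by `step`, then shift by one bin so the
+            # mapping starts at 0.)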
+ lut = np.concatenate([[0], lut[:-1]], 0) + # Clip the counts to be in range. This is done + # in the C code for image.point. + return np.clip(lut, a_min=0, a_max=255).astype(np.uint8) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + if step == 0: + result = im + else: + result = np.take(build_lut(histo, step), im) + + return result.astype(np.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = np.stack([s1, s2, s3], 2) + return image + + +def wrap(image): + """Returns 'image' with an extra channel set to all 1s.""" + shape = image.shape + extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype) + extended = np.concatenate([image, extended_channel], 2).astype(image.dtype) + return extended + + +def unwrap(image, replace): + """Unwraps an image produced by wrap. + + Where there is a 0 in the last channel for every spatial position, + the rest of the three channels in that spatial dimension are grayed + (set to 128). Operations like translate and shear on a wrapped + Tensor will leave 0s in empty locations. Some transformations look + at the intensity of values to do preprocessing, and we want these + empty pixels to assume the 'average' value, rather than pure black. + + + Args: + image: A 3D Image Tensor with 4 channels. + replace: A one or three value 1D tensor to fill empty pixels. + + Returns: + image: A 3D image Tensor with 3 channels. + """ + image_shape = image.shape + # Flatten the spatial dimensions. + flattened_image = np.reshape(image, [-1, image_shape[2]]) + + # Find all pixels where the last channel is zero. + alpha_channel = flattened_image[:, 3] + + replace = np.concatenate([replace, np.ones([1], image.dtype)], 0) + + # Where they are zero, fill them in with 'replace'. + alpha_channel = np.reshape(alpha_channel, (-1, 1)) + alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1])) + + flattened_image = np.where( + np.equal(alpha_channel, 0), + np.ones_like( + flattened_image, dtype=image.dtype) * replace, + flattened_image) + + image = np.reshape(flattened_image, image_shape) + image = image[:, :, :3] + return image.astype(np.uint8) + + +def _cutout_inside_bbox(image, bbox, pad_fraction): + """Generates cutout mask and the mean pixel value of the bbox. + + First a location is randomly chosen within the image as the center where the + cutout mask will be applied. Note this can be towards the boundaries of the + image, so the full cutout mask may not be applied. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + pad_fraction: Float that specifies how large the cutout mask should be in + in reference to the size of the original bbox. If pad_fraction is 0.25, + then the cutout mask will be of shape + (0.25 * bbox height, 0.25 * bbox width). + + Returns: + A tuple. Fist element is a tensor of the same shape as image where each + element is either a 1 or 0 that is used to determine where the image + will have cutout applied. The second element is the mean of the pixels + in the image where the bbox is located. + mask value: [0,1] + """ + image_height, image_width = image.shape[0], image.shape[1] + # Transform from shape [1, 4] to [4]. 
+    bbox = np.squeeze(bbox)
+
+    min_y = int(float(image_height) * bbox[0])
+    min_x = int(float(image_width) * bbox[1])
+    max_y = int(float(image_height) * bbox[2])
+    max_x = int(float(image_width) * bbox[3])
+
+    # Calculate the mean pixel values in the bounding box, which will be used
+    # to fill the cutout region.
+    mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))
+    # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the
+    # region lies entirely within the bbox.
+    box_height = max_y - min_y + 1
+    box_width = max_x - min_x + 1
+    pad_size_height = int(pad_fraction * (box_height / 2))
+    pad_size_width = int(pad_fraction * (box_width / 2))
+
+    # Sample the center location in the image where the zero mask will be applied.
+    cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32)
+    cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32)
+
+    lower_pad = np.maximum(0, cutout_center_height - pad_size_height)
+    upper_pad = np.maximum(
+        0, image_height - cutout_center_height - pad_size_height)
+    left_pad = np.maximum(0, cutout_center_width - pad_size_width)
+    right_pad = np.maximum(0,
+                           image_width - cutout_center_width - pad_size_width)
+
+    cutout_shape = [
+        image_height - (lower_pad + upper_pad),
+        image_width - (left_pad + right_pad)
+    ]
+    padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
+
+    mask = np.pad(np.zeros(
+        cutout_shape, dtype=image.dtype),
+                  padding_dims,
+                  'constant',
+                  constant_values=1)
+
+    mask = np.expand_dims(mask, 2)
+    mask = np.tile(mask, [1, 1, 3])
+    return mask, mean
+
+
+def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):
+    """Applies cutout to the image according to bbox information.
+
+    This is a cutout variant that uses bbox information to make more informed
+    decisions on where to place the cutout mask.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float with values
+            between [0, 1].
+        pad_fraction: Float that specifies how large the cutout mask should be
+            in reference to the size of the original bbox. If pad_fraction is 0.25,
+            then the cutout mask will be of shape
+            (0.25 * bbox height, 0.25 * bbox width).
+        replace_with_mean: Boolean that specifies what value should be filled in
+            where the cutout mask is applied. Since the incoming image will be of
+            uint8 and will not have had any mean normalization applied, by default
+            we set the value to be 128. If replace_with_mean is True then we find
+            the mean pixel values across the channel dimension and use those to fill
+            in where the cutout mask is applied.
+
+    Returns:
+        A tuple. First element is a tensor of the same shape as image that has
+        cutout applied to it. Second element is the bboxes that were passed in
+        that will be unchanged.
+    """
+
+    def apply_bbox_cutout(image, bboxes, pad_fraction):
+        """Applies cutout to a single bounding box within image."""
+        # Choose a single bounding box to apply cutout to.
+        random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)
+        # Select the corresponding bbox and apply cutout.
+        chosen_bbox = np.take(bboxes, random_index, axis=0)
+        mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)
+
+        # When applying cutout we either set the pixel value to 128 or to the
+        # mean value inside the bbox.
+        replace = mean if replace_with_mean else [128] * 3
+
+        # Apply the cutout mask to the image. Where the mask is 0 we fill it with
+        # `replace`.
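+        # (mask is 0 inside the sampled cutout window and 1 elsewhere, so the
+        # np.where below only overwrites the cutout region; `replace`
+        # broadcasts across the channel axis.)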
+ image = np.where( + np.equal(mask, 0), + np.ones_like( + image, dtype=image.dtype) * replace, + image).astype(image.dtype) + return image + + # Check to see if there are boxes, if so then apply boxcutout. + if len(bboxes) != 0: + image = apply_bbox_cutout(image, bboxes, pad_fraction) + + return image, bboxes + + +NAME_TO_FUNC = { + 'AutoContrast': autocontrast, + 'Equalize': equalize, + 'Posterize': posterize, + 'Solarize': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, + 'Cutout': cutout, + 'BBox_Cutout': bbox_cutout, + 'Rotate_BBox': rotate_with_bboxes, + # pylint:disable=g-long-lambda + 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( + image, bboxes, pixels, replace, shift_horizontal=True), + 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( + image, bboxes, pixels, replace, shift_horizontal=False), + 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( + image, bboxes, level, replace, shear_horizontal=True), + 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( + image, bboxes, level, replace, shear_horizontal=False), + # pylint:enable=g-long-lambda + 'Rotate_Only_BBoxes': rotate_only_bboxes, + 'ShearX_Only_BBoxes': shear_x_only_bboxes, + 'ShearY_Only_BBoxes': shear_y_only_bboxes, + 'TranslateX_Only_BBoxes': translate_x_only_bboxes, + 'TranslateY_Only_BBoxes': translate_y_only_bboxes, + 'Flip_Only_BBoxes': flip_only_bboxes, + 'Solarize_Only_BBoxes': solarize_only_bboxes, + 'Equalize_Only_BBoxes': equalize_only_bboxes, + 'Cutout_Only_BBoxes': cutout_only_bboxes, +} + + +def _randomly_negate_tensor(tensor): + """With 50% prob turn the tensor negative.""" + should_flip = np.floor(np.random.rand() + 0.5) >= 1 + final_tensor = tensor if should_flip else -tensor + return final_tensor + + +def _rotate_level_to_arg(level): + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate_tensor(level) + return (level, ) + + +def _shrink_level_to_arg(level): + """Converts level to ratio by which we shrink the image content.""" + if level == 0: + return (1.0, ) # if level is zero, do not shrink the image + # Maximum shrinking ratio is 2.9. + level = 2. / (_MAX_LEVEL / level) + 0.9 + return (level, ) + + +def _enhance_level_to_arg(level): + return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) + + +def _shear_level_to_arg(level): + level = (level / _MAX_LEVEL) * 0.3 + # Flip level to negative with 50% chance. + level = _randomly_negate_tensor(level) + return (level, ) + + +def _translate_level_to_arg(level, translate_const): + level = (level / _MAX_LEVEL) * float(translate_const) + # Flip level to negative with 50% chance. 
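+    # (level in [0, _MAX_LEVEL] maps linearly to a pixel offset in
+    # [0, translate_const]; the sign is then randomized.)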
+ level = _randomly_negate_tensor(level) + return (level, ) + + +def _bbox_cutout_level_to_arg(level, hparams): + cutout_pad_fraction = (level / + _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction + return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean + + +def level_to_arg(hparams): + return { + 'AutoContrast': lambda level: (), + 'Equalize': lambda level: (), + 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), + 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), + 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), + 'Color': _enhance_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'Cutout': + lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 + # pylint:disable=g-long-lambda + 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), + 'TranslateX_BBox': + lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 + 'TranslateY_BBox': + lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons + # pylint:enable=g-long-lambda + 'ShearX_BBox': _shear_level_to_arg, + 'ShearY_BBox': _shear_level_to_arg, + 'Rotate_BBox': _rotate_level_to_arg, + 'Rotate_Only_BBoxes': _rotate_level_to_arg, + 'ShearX_Only_BBoxes': _shear_level_to_arg, + 'ShearY_Only_BBoxes': _shear_level_to_arg, + # pylint:disable=g-long-lambda + 'TranslateX_Only_BBoxes': + lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const + 'TranslateY_Only_BBoxes': + lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const + # pylint:enable=g-long-lambda + 'Flip_Only_BBoxes': lambda level: (), + 'Solarize_Only_BBoxes': + lambda level: (int((level / _MAX_LEVEL) * 256), ), + 'Equalize_Only_BBoxes': lambda level: (), + # pylint:disable=g-long-lambda + 'Cutout_Only_BBoxes': + lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const + # pylint:enable=g-long-lambda + } + + +def bbox_wrapper(func): + """Adds a bboxes function argument to func and returns unchanged bboxes.""" + + def wrapper(images, bboxes, *args, **kwargs): + return (func(images, *args, **kwargs), bboxes) + + return wrapper + + +def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): + """Return the function that corresponds to `name` and update `level` param.""" + func = NAME_TO_FUNC[name] + args = level_to_arg(augmentation_hparams)[name](level) + + # Check to see if prob is passed into function. This is used for operations + # where we alter bboxes independently. + # pytype:disable=wrong-arg-types + if 'prob' in inspect.getfullargspec(func)[0]: + args = tuple([prob] + list(args)) + # pytype:enable=wrong-arg-types + + # Add in replace arg if it is required for the function that is being called. + if 'replace' in inspect.getfullargspec(func)[0]: + # Make sure replace is the final argument + assert 'replace' == inspect.getfullargspec(func)[0][-1] + args = tuple(list(args) + [replace_value]) + + # Add bboxes as the second positional argument for the function if it does + # not already exist. 
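+    # (Color ops such as Equalize or Posterize only take an image, so
+    # bbox_wrapper gives them a pass-through `bboxes` argument to unify the
+    # call signature with the bbox-aware ops.)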
+ if 'bboxes' not in inspect.getfullargspec(func)[0]: + func = bbox_wrapper(func) + return (func, prob, args) + + +def _apply_func_with_prob(func, image, args, prob, bboxes): + """Apply `func` to image w/ `args` as input with probability `prob`.""" + assert isinstance(args, tuple) + assert 'bboxes' == inspect.getfullargspec(func)[0][1] + + # If prob is a function argument, then this randomness is being handled + # inside the function, so make sure it is always called. + if 'prob' in inspect.getfullargspec(func)[0]: + prob = 1.0 + + # Apply the function with probability `prob`. + should_apply_op = np.floor(np.random.rand() + 0.5) >= 1 + if should_apply_op: + augmented_image, augmented_bboxes = func(image, bboxes, *args) + else: + augmented_image, augmented_bboxes = (image, bboxes) + return augmented_image, augmented_bboxes + + +def select_and_apply_random_policy(policies, image, bboxes): + """Select a random policy from `policies` and apply it to `image`.""" + policy_to_select = np.random.randint(0, len(policies), dtype=np.int32) + # policy_to_select = 6 # for test + for (i, policy) in enumerate(policies): + if i == policy_to_select: + image, bboxes = policy(image, bboxes) + return (image, bboxes) + + +def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams): + """Build a policy from the given policies passed in and apply to image. + + Args: + policies: list of lists of tuples in the form `(func, prob, level)`, `func` + is a string name of the augmentation function, `prob` is the probability + of applying the `func` operation, `level` is the input argument for + `func`. + image: numpy array that the resulting policy will be applied to. + bboxes: + augmentation_hparams: Hparams associated with the NAS learned policy. + + Returns: + A version of image that now has data augmentation applied to it based on + the `policies` pass into the function. Additionally, returns bboxes if + a value for them is passed in that is not None + """ + replace_value = [128, 128, 128] + + # func is the string name of the augmentation function, prob is the + # probability of applying the operation and level is the parameter associated + + # tf_policies are functions that take in an image and return an augmented + # image. + tf_policies = [] + for policy in policies: + tf_policy = [] + # Link string name to the correct python function and make sure the correct + # argument is passed into that function. + for policy_info in policy: + policy_info = list( + policy_info) + [replace_value, augmentation_hparams] + + tf_policy.append(_parse_policy_info(*policy_info)) + # Now build the tf policy that will apply the augmentation procedue + # on image. + def make_final_policy(tf_policy_): + def final_policy(image_, bboxes_): + for func, prob, args in tf_policy_: + image_, bboxes_ = _apply_func_with_prob(func, image_, args, + prob, bboxes_) + return image_, bboxes_ + + return final_policy + + tf_policies.append(make_final_policy(tf_policy)) + + augmented_images, augmented_bboxes = select_and_apply_random_policy( + tf_policies, image, bboxes) + # If no bounding boxes were specified, then just return the images. + return (augmented_images, augmented_bboxes) + + +# TODO(barretzoph): Add in ArXiv link once paper is out. +def distort_image_with_autoaugment(image, bboxes, augmentation_name): + """Applies the AutoAugment policy to `image` and `bboxes`. + + Args: + image: `Tensor` of shape [height, width, 3] representing an image. 
+ bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are + normalized between [0, 1]. + augmentation_name: The name of the AutoAugment policy to use. The available + options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for + all of the results in the paper and was found to achieve the best results + on the COCO dataset. `v1`, `v2` and `v3` are additional good policies + found on the COCO dataset that have slight variation in what operations + were used during the search procedure along with how many operations are + applied in parallel to a single image (2 vs 3). + + Returns: + A tuple containing the augmented versions of `image` and `bboxes`. + """ + available_policies = { + 'v0': policy_v0, + 'v1': policy_v1, + 'v2': policy_v2, + 'v3': policy_v3, + 'test': policy_vtest + } + if augmentation_name not in available_policies: + raise ValueError('Invalid augmentation_name: {}'.format( + augmentation_name)) + + policy = available_policies[augmentation_name]() + augmentation_hparams = {} + return build_and_apply_nas_policy(policy, image, bboxes, + augmentation_hparams) diff --git a/ppdet/data/transform/batch_operators.py b/ppdet/data/transform/batch_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..fca49d4532af810d008978078ae7feb5793a8d1e --- /dev/null +++ b/ppdet/data/transform/batch_operators.py @@ -0,0 +1,439 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import typing + +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + +import cv2 +import math +import numpy as np +from .operators import register_op, BaseOperator, Resize +from .op_helper import jaccard_overlap + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'PadBatch', + 'BatchRandomResize', + 'Gt2YoloTarget', + 'PadGT', +] + + +@register_op +class PadBatch(BaseOperator): + """ + Pad a batch of samples so they can be divisible by a stride. + The layout of each image should be 'CHW'. + Args: + pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure + height and width is divisible by `pad_to_stride`. + """ + + def __init__(self, pad_to_stride=0): + super(PadBatch, self).__init__() + self.pad_to_stride = pad_to_stride + + def __call__(self, samples, context=None): + """ + Args: + samples (list): a batch of sample, each is dict. 
+ """ + coarsest_stride = self.pad_to_stride + + # multi scale input is nested list + if isinstance(samples, + typing.Sequence) and len(samples) > 0 and isinstance( + samples[0], typing.Sequence): + inner_samples = samples[0] + else: + inner_samples = samples + + max_shape = np.array( + [data['image'].shape for data in inner_samples]).max(axis=0) + if coarsest_stride > 0: + max_shape[1] = int( + np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) + max_shape[2] = int( + np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) + + for data in inner_samples: + im = data['image'] + im_c, im_h, im_w = im.shape[:] + padding_im = np.zeros( + (im_c, max_shape[1], max_shape[2]), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + data['image'] = padding_im + if 'semantic' in data and data['semantic'] is not None: + semantic = data['semantic'] + padding_sem = np.zeros( + (1, max_shape[1], max_shape[2]), dtype=np.float32) + padding_sem[:, :im_h, :im_w] = semantic + data['semantic'] = padding_sem + if 'gt_segm' in data and data['gt_segm'] is not None: + gt_segm = data['gt_segm'] + padding_segm = np.zeros( + (gt_segm.shape[0], max_shape[1], max_shape[2]), + dtype=np.uint8) + padding_segm[:, :im_h, :im_w] = gt_segm + data['gt_segm'] = padding_segm + + return samples + + +@register_op +class BatchRandomResize(BaseOperator): + """ + Resize image to target size randomly. random target_size and interpolation method + Args: + target_size (int, list, tuple): image target size, if random size is True, must be list or tuple + keep_ratio (bool): whether keep_raio or not, default true + interp (int): the interpolation method + random_size (bool): whether random select target size of image + random_interp (bool): whether random select interpolation method + """ + + def __init__(self, + target_size, + keep_ratio, + interp=cv2.INTER_NEAREST, + random_size=True, + random_interp=False): + super(BatchRandomResize, self).__init__() + self.keep_ratio = keep_ratio + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + self.interp = interp + assert isinstance(target_size, ( + int, Sequence)), "target_size must be int, list or tuple" + if random_size and not isinstance(target_size, list): + raise TypeError( + "Type of target_size is invalid when random_size is True. Must be List, now is {}". 
+ format(type(target_size))) + self.target_size = target_size + self.random_size = random_size + self.random_interp = random_interp + + def __call__(self, samples, context=None): + if self.random_size: + index = np.random.choice(len(self.target_size)) + target_size = self.target_size[index] + else: + target_size = self.target_size + + if self.random_interp: + interp = np.random.choice(self.interps) + else: + interp = self.interp + + resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) + return resizer(samples, context=context) + + +@register_op +class Gt2YoloTarget(BaseOperator): + __shared__ = ['num_classes'] + """ + Generate YOLOv3 targets by groud truth data, this operator is only used in + fine grained YOLOv3 loss mode + """ + + def __init__(self, + anchors, + anchor_masks, + downsample_ratios, + num_classes=80, + iou_thresh=1.): + super(Gt2YoloTarget, self).__init__() + self.anchors = anchors + self.anchor_masks = anchor_masks + self.downsample_ratios = downsample_ratios + self.num_classes = num_classes + self.iou_thresh = iou_thresh + + def __call__(self, samples, context=None): + assert len(self.anchor_masks) == len(self.downsample_ratios), \ + "anchor_masks', and 'downsample_ratios' should have same length." + + h, w = samples[0]['image'].shape[1:3] + an_hw = np.array(self.anchors) / np.array([[w, h]]) + for sample in samples: + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + if 'gt_score' not in sample: + sample['gt_score'] = np.ones( + (gt_bbox.shape[0], 1), dtype=np.float32) + gt_score = sample['gt_score'] + for i, ( + mask, downsample_ratio + ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): + grid_h = int(h / downsample_ratio) + grid_w = int(w / downsample_ratio) + target = np.zeros( + (len(mask), 6 + self.num_classes, grid_h, grid_w), + dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx, gy, gw, gh = gt_bbox[b, :] + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. or score <= 0.: + continue + + # find best match anchor index + best_iou = 0. + best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + + gi = int(gx * grid_w) + gj = int(gy * grid_h) + + # gtbox should be regresed in this layes if best match + # anchor index in anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log( + gw * w / self.anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log( + gh * h / self.anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + target[best_n, 5, gj, gi] = score + + # classification + target[best_n, 6 + cls, gj, gi] = 1. 
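+                    # NOTE: target layout along axis 1 is
+                    # [tx, ty, tw, th, box scale, objectness score, one-hot class].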
+ + # For non-matched anchors, calculate the target if the iou + # between anchor and gt is larger than iou_thresh + if self.iou_thresh < 1: + for idx, mask_i in enumerate(mask): + if mask_i == best_idx: continue + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) + if iou > self.iou_thresh and target[idx, 5, gj, + gi] == 0.: + # x, y, w, h, scale + target[idx, 0, gj, gi] = gx * grid_w - gi + target[idx, 1, gj, gi] = gy * grid_h - gj + target[idx, 2, gj, gi] = np.log( + gw * w / self.anchors[mask_i][0]) + target[idx, 3, gj, gi] = np.log( + gh * h / self.anchors[mask_i][1]) + target[idx, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + target[idx, 5, gj, gi] = score + + # classification + target[idx, 6 + cls, gj, gi] = 1. + sample['target{}'.format(i)] = target + + # remove useless gt_class and gt_score after target calculated + sample.pop('gt_class') + sample.pop('gt_score') + + return samples + + +@register_op +class PadGT(BaseOperator): + """ + Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... + The num_max_boxes is the largest for batch. + Args: + return_gt_mask (bool): If true, return `pad_gt_mask`, + 1 means bbox, 0 means no bbox. + """ + + def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0): + super(PadGT, self).__init__() + self.return_gt_mask = return_gt_mask + self.pad_img = pad_img + self.minimum_gtnum = minimum_gtnum + + def _impad(self, + img: np.ndarray, + *, + shape=None, + padding=None, + pad_val=0, + padding_mode='constant') -> np.ndarray: + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. + """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + width = max(shape[1] - img.shape[1], 0) + height = max(shape[0] - img.shape[0], 0) + padding = (0, 0, int(width), int(height)) + + # check pad_val + import numbers + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. 
' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' + f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + def checkmaxshape(self, samples): + maxh, maxw = 0, 0 + for sample in samples: + h, w = sample['im_shape'] + if h > maxh: + maxh = h + if w > maxw: + maxw = w + return (maxh, maxw) + + def __call__(self, samples, context=None): + num_max_boxes = max([len(s['gt_bbox']) for s in samples]) + num_max_boxes = max(self.minimum_gtnum, num_max_boxes) + if self.pad_img: + maxshape = self.checkmaxshape(samples) + for sample in samples: + if self.pad_img: + img = sample['image'] + padimg = self._impad(img, shape=maxshape) + sample['image'] = padimg + if self.return_gt_mask: + sample['pad_gt_mask'] = np.zeros( + (num_max_boxes, 1), dtype=np.float32) + if num_max_boxes == 0: + continue + + num_gt = len(sample['gt_bbox']) + pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) + pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) + if num_gt > 0: + pad_gt_class[:num_gt] = sample['gt_class'] + pad_gt_bbox[:num_gt] = sample['gt_bbox'] + sample['gt_class'] = pad_gt_class + sample['gt_bbox'] = pad_gt_bbox + # pad_gt_mask + if 'pad_gt_mask' in sample: + sample['pad_gt_mask'][:num_gt] = 1 + # gt_score + if 'gt_score' in sample: + pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) + if num_gt > 0: + pad_gt_score[:num_gt] = sample['gt_score'] + sample['gt_score'] = pad_gt_score + if 'is_crowd' in sample: + pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) + if num_gt > 0: + pad_is_crowd[:num_gt] = sample['is_crowd'] + sample['is_crowd'] = pad_is_crowd + if 'difficult' in sample: + pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32) + if num_gt > 0: + pad_diff[:num_gt] = sample['difficult'] + sample['difficult'] = pad_diff + if 'gt_joints' in sample: + num_joints = sample['gt_joints'].shape[1] + pad_gt_joints = np.zeros( + (num_max_boxes, num_joints, 3), dtype=np.float32) + if num_gt > 0: + pad_gt_joints[:num_gt] = sample['gt_joints'] + sample['gt_joints'] = pad_gt_joints + if 'gt_areas' in sample: + pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32) + if num_gt > 0: + pad_gt_areas[:num_gt, 0] = sample['gt_areas'] + sample['gt_areas'] = pad_gt_areas + return samples diff --git a/ppdet/data/transform/gridmask_utils.py b/ppdet/data/transform/gridmask_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c18701556efa793a2d9bbced5f333059b4ab6236 --- /dev/null +++ b/ppdet/data/transform/gridmask_utils.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import numpy as np +from PIL import Image + + +class Gridmask(object): + def __init__(self, + use_h=True, + use_w=True, + rotate=1, + offset=False, + ratio=0.5, + mode=1, + prob=0.7, + upper_iter=360000): + super(Gridmask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.prob = prob + self.st_prob = prob + self.upper_iter = upper_iter + + def __call__(self, x, curr_iter): + self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) + if np.random.rand() > self.prob: + return x + h, w, _ = x.shape + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(2, h) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 + if self.use_w: + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + + w].astype(np.float32) + + if self.mode == 1: + mask = 1 - mask + mask = np.expand_dims(mask, axis=-1) + if self.offset: + offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) + x = (x * mask + offset * (1 - mask)).astype(x.dtype) + else: + x = (x * mask).astype(x.dtype) + + return x diff --git a/ppdet/data/transform/op_helper.py b/ppdet/data/transform/op_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..6c400306da8ec3ff605c0efac3e725ffd2e267a3 --- /dev/null +++ b/ppdet/data/transform/op_helper.py @@ -0,0 +1,494 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# this file contains helper methods for BBOX processing + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import random +import math +import cv2 + + +def meet_emit_constraint(src_bbox, sample_bbox): + center_x = (src_bbox[2] + src_bbox[0]) / 2 + center_y = (src_bbox[3] + src_bbox[1]) / 2 + if center_x >= sample_bbox[0] and \ + center_x <= sample_bbox[2] and \ + center_y >= sample_bbox[1] and \ + center_y <= sample_bbox[3]: + return True + return False + + +def clip_bbox(src_bbox): + src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) + src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) + src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) + src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) + return src_bbox + + +def bbox_area(src_bbox): + if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: + return 0. + else: + width = src_bbox[2] - src_bbox[0] + height = src_bbox[3] - src_bbox[1] + return width * height + + +def is_overlap(object_bbox, sample_bbox): + if object_bbox[0] >= sample_bbox[2] or \ + object_bbox[2] <= sample_bbox[0] or \ + object_bbox[1] >= sample_bbox[3] or \ + object_bbox[3] <= sample_bbox[1]: + return False + else: + return True + + +def filter_and_process(sample_bbox, bboxes, labels, scores=None, + keypoints=None): + new_bboxes = [] + new_labels = [] + new_scores = [] + new_keypoints = [] + new_kp_ignore = [] + for i in range(len(bboxes)): + new_bbox = [0, 0, 0, 0] + obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] + if not meet_emit_constraint(obj_bbox, sample_bbox): + continue + if not is_overlap(obj_bbox, sample_bbox): + continue + sample_width = sample_bbox[2] - sample_bbox[0] + sample_height = sample_bbox[3] - sample_bbox[1] + new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width + new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height + new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width + new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height + new_bbox = clip_bbox(new_bbox) + if bbox_area(new_bbox) > 0: + new_bboxes.append(new_bbox) + new_labels.append([labels[i][0]]) + if scores is not None: + new_scores.append([scores[i][0]]) + if keypoints is not None: + sample_keypoint = keypoints[0][i] + for j in range(len(sample_keypoint)): + kp_len = sample_height if j % 2 else sample_width + sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] + sample_keypoint[j] = ( + sample_keypoint[j] - sample_coord) / kp_len + sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) + new_keypoints.append(sample_keypoint) + new_kp_ignore.append(keypoints[1][i]) + + bboxes = np.array(new_bboxes) + labels = np.array(new_labels) + scores = np.array(new_scores) + if keypoints is not None: + keypoints = np.array(new_keypoints) + new_kp_ignore = np.array(new_kp_ignore) + return bboxes, labels, scores, (keypoints, new_kp_ignore) + return bboxes, labels, scores + + +def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): + new_bboxes = [] + new_labels = [] + new_scores = [] + for i, bbox in enumerate(bboxes): + w = float((bbox[2] - bbox[0]) * target_size) + h = float((bbox[3] - bbox[1]) * target_size) + if w * h < float(min_size * min_size): + continue + else: + new_bboxes.append(bbox) + new_labels.append(labels[i]) + if scores is not None and scores.size != 0: + new_scores.append(scores[i]) + bboxes = np.array(new_bboxes) + labels = np.array(new_labels) + scores = np.array(new_scores) + return bboxes, labels, scores + + +def 
generate_sample_bbox(sampler): + scale = np.random.uniform(sampler[2], sampler[3]) + aspect_ratio = np.random.uniform(sampler[4], sampler[5]) + aspect_ratio = max(aspect_ratio, (scale**2.0)) + aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) + bbox_width = scale * (aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = np.random.uniform(0, xmin_bound) + ymin = np.random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = [xmin, ymin, xmax, ymax] + return sampled_bbox + + +def generate_sample_bbox_square(sampler, image_width, image_height): + scale = np.random.uniform(sampler[2], sampler[3]) + aspect_ratio = np.random.uniform(sampler[4], sampler[5]) + aspect_ratio = max(aspect_ratio, (scale**2.0)) + aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) + bbox_width = scale * (aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + if image_height < image_width: + bbox_width = bbox_height * image_height / image_width + else: + bbox_height = bbox_width * image_width / image_height + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = np.random.uniform(0, xmin_bound) + ymin = np.random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = [xmin, ymin, xmax, ymax] + return sampled_bbox + + +def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, + resize_width): + num_gt = len(bbox_labels) + # np.random.randint range: [low, high) + rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 + + if num_gt != 0: + norm_xmin = bbox_labels[rand_idx][0] + norm_ymin = bbox_labels[rand_idx][1] + norm_xmax = bbox_labels[rand_idx][2] + norm_ymax = bbox_labels[rand_idx][3] + + xmin = norm_xmin * image_width + ymin = norm_ymin * image_height + wid = image_width * (norm_xmax - norm_xmin) + hei = image_height * (norm_ymax - norm_ymin) + range_size = 0 + + area = wid * hei + for scale_ind in range(0, len(scale_array) - 1): + if area > scale_array[scale_ind] ** 2 and area < \ + scale_array[scale_ind + 1] ** 2: + range_size = scale_ind + 1 + break + + if area > scale_array[len(scale_array) - 2]**2: + range_size = len(scale_array) - 2 + + scale_choose = 0.0 + if range_size == 0: + rand_idx_size = 0 + else: + # np.random.randint range: [low, high) + rng_rand_size = np.random.randint(0, range_size + 1) + rand_idx_size = rng_rand_size % (range_size + 1) + + if rand_idx_size == range_size: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = min(2.0 * scale_array[rand_idx_size], + 2 * math.sqrt(wid * hei)) + scale_choose = random.uniform(min_resize_val, max_resize_val) + else: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = 2.0 * scale_array[rand_idx_size] + scale_choose = random.uniform(min_resize_val, max_resize_val) + + sample_bbox_size = wid * resize_width / scale_choose + + w_off_orig = 0.0 + h_off_orig = 0.0 + if sample_bbox_size < max(image_height, image_width): + if wid <= sample_bbox_size: + w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, + xmin) + else: + w_off_orig = np.random.uniform(xmin, + xmin + wid - sample_bbox_size) + + if hei <= sample_bbox_size: + h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, + ymin) + else: + h_off_orig = np.random.uniform(ymin, + ymin + hei - sample_bbox_size) + + else: + w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) + h_off_orig = np.random.uniform(image_height - 
sample_bbox_size, 0.0) + + w_off_orig = math.floor(w_off_orig) + h_off_orig = math.floor(h_off_orig) + + # Figure out top left coordinates. + w_off = float(w_off_orig / image_width) + h_off = float(h_off_orig / image_height) + + sampled_bbox = [ + w_off, h_off, w_off + float(sample_bbox_size / image_width), + h_off + float(sample_bbox_size / image_height) + ] + return sampled_bbox + else: + return 0 + + +def jaccard_overlap(sample_bbox, object_bbox): + if sample_bbox[0] >= object_bbox[2] or \ + sample_bbox[2] <= object_bbox[0] or \ + sample_bbox[1] >= object_bbox[3] or \ + sample_bbox[3] <= object_bbox[1]: + return 0 + intersect_xmin = max(sample_bbox[0], object_bbox[0]) + intersect_ymin = max(sample_bbox[1], object_bbox[1]) + intersect_xmax = min(sample_bbox[2], object_bbox[2]) + intersect_ymax = min(sample_bbox[3], object_bbox[3]) + intersect_size = (intersect_xmax - intersect_xmin) * ( + intersect_ymax - intersect_ymin) + sample_bbox_size = bbox_area(sample_bbox) + object_bbox_size = bbox_area(object_bbox) + overlap = intersect_size / ( + sample_bbox_size + object_bbox_size - intersect_size) + return overlap + + +def intersect_bbox(bbox1, bbox2): + if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ + bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: + intersection_box = [0.0, 0.0, 0.0, 0.0] + else: + intersection_box = [ + max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), + min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) + ] + return intersection_box + + +def bbox_coverage(bbox1, bbox2): + inter_box = intersect_bbox(bbox1, bbox2) + intersect_size = bbox_area(inter_box) + + if intersect_size > 0: + bbox1_size = bbox_area(bbox1) + return intersect_size / bbox1_size + else: + return 0. + + +def satisfy_sample_constraint(sampler, + sample_bbox, + gt_bboxes, + satisfy_all=False): + if sampler[6] == 0 and sampler[7] == 0: + return True + satisfied = [] + for i in range(len(gt_bboxes)): + object_bbox = [ + gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] + ] + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler[6] != 0 and \ + overlap < sampler[6]: + satisfied.append(False) + continue + if sampler[7] != 0 and \ + overlap > sampler[7]: + satisfied.append(False) + continue + satisfied.append(True) + if not satisfy_all: + return True + + if satisfy_all: + return np.all(satisfied) + else: + return False + + +def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): + if sampler[6] == 0 and sampler[7] == 0: + has_jaccard_overlap = False + else: + has_jaccard_overlap = True + if sampler[8] == 0 and sampler[9] == 0: + has_object_coverage = False + else: + has_object_coverage = True + + if not has_jaccard_overlap and not has_object_coverage: + return True + found = False + for i in range(len(gt_bboxes)): + object_bbox = [ + gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] + ] + if has_jaccard_overlap: + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler[6] != 0 and \ + overlap < sampler[6]: + continue + if sampler[7] != 0 and \ + overlap > sampler[7]: + continue + found = True + if has_object_coverage: + object_coverage = bbox_coverage(object_bbox, sample_bbox) + if sampler[8] != 0 and \ + object_coverage < sampler[8]: + continue + if sampler[9] != 0 and \ + object_coverage > sampler[9]: + continue + found = True + if found: + return True + return found + + +def crop_image_sampling(img, sample_bbox, image_width, image_height, + target_size): + # no clipping here + xmin = int(sample_bbox[0] * image_width) + xmax = 
int(sample_bbox[2] * image_width) + ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + + w_off = xmin + h_off = ymin + width = xmax - xmin + height = ymax - ymin + cross_xmin = max(0.0, float(w_off)) + cross_ymin = max(0.0, float(h_off)) + cross_xmax = min(float(w_off + width - 1.0), float(image_width)) + cross_ymax = min(float(h_off + height - 1.0), float(image_height)) + cross_width = cross_xmax - cross_xmin + cross_height = cross_ymax - cross_ymin + + roi_xmin = 0 if w_off >= 0 else abs(w_off) + roi_ymin = 0 if h_off >= 0 else abs(h_off) + roi_width = cross_width + roi_height = cross_height + + roi_y1 = int(roi_ymin) + roi_y2 = int(roi_ymin + roi_height) + roi_x1 = int(roi_xmin) + roi_x2 = int(roi_xmin + roi_width) + + cross_y1 = int(cross_ymin) + cross_y2 = int(cross_ymin + cross_height) + cross_x1 = int(cross_xmin) + cross_x2 = int(cross_xmin + cross_width) + + sample_img = np.zeros((height, width, 3)) + sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ + img[cross_y1: cross_y2, cross_x1: cross_x2] + + sample_img = cv2.resize( + sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) + + return sample_img + + +def is_poly(segm): + assert isinstance(segm, (list, dict)), \ + "Invalid segm type: {}".format(type(segm)) + return isinstance(segm, list) + + +def gaussian_radius(bbox_size, min_overlap): + height, width = bbox_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1**2 - 4 * a1 * c1) + radius1 = (b1 + sq1) / (2 * a1) + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2**2 - 4 * a2 * c2) + radius2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3**2 - 4 * a3 * c3) + radius3 = (b3 + sq3) / 2 + return min(radius1, radius2, radius3) + + +def draw_gaussian(heatmap, center, radius, k=1, delte=6): + diameter = 2 * radius + 1 + sigma = diameter / delte + gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) + + x, y = center + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: + radius + right] + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + + +def gaussian2D(shape, sigma_x=1, sigma_y=1): + m, n = [(ss - 1.) / 2. 
for ss in shape] + y, x = np.ogrid[-m:m + 1, -n:n + 1] + + h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * + sigma_y))) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_umich_gaussian(heatmap, center, radius, k=1): + """ + draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 + """ + diameter = 2 * radius + 1 + gaussian = gaussian2D( + (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: + radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + + +def get_border(border, size): + i = 1 + while size - border // i <= border // i: + i *= 2 + return border // i diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..4adb01245a25c31857f51c8cf23b8847fcb795a4 --- /dev/null +++ b/ppdet/data/transform/operators.py @@ -0,0 +1,4703 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
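+
+# A minimal usage sketch of the operators defined in this file (illustrative
+# only; in practice these ops are composed by the reader from config, and the
+# image path below is a placeholder):
+#
+#     sample = {'im_file': 'demo.jpg'}
+#     pipeline = [Decode(),
+#                 Resize(target_size=[800, 1333], keep_ratio=True),
+#                 NormalizeImage(),
+#                 Permute()]
+#     for op in pipeline:
+#         sample = op(sample)   # BaseOperator.__call__ dispatches to apply()
+#     # sample['image'] is now a normalized float32 array in (C, H, W) order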
+ +# function: +# operators to process sample, +# eg: decode/resize/crop image + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + +from numbers import Number, Integral + +import uuid +import random +import math +import numpy as np +import os +import copy +import logging +import cv2 +from PIL import Image, ImageDraw +import pickle +import threading +MUTEX = threading.Lock() + +import paddle +from ppdet.core.workspace import serializable +from ..reader import Compose + +from .op_helper import (satisfy_sample_constraint, filter_and_process, + generate_sample_bbox, clip_bbox, data_anchor_sampling, + satisfy_sample_constraint_coverage, crop_image_sampling, + generate_sample_bbox_square, bbox_area_sampling, + is_poly, get_border) + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +registered_ops = [] + + +def register_op(cls): + registered_ops.append(cls.__name__) + if not hasattr(BaseOperator, cls.__name__): + setattr(BaseOperator, cls.__name__, cls) + else: + raise KeyError("The {} class has been registered.".format(cls.__name__)) + return serializable(cls) + + +class BboxError(ValueError): + pass + + +class ImageError(ValueError): + pass + + +class BaseOperator(object): + def __init__(self, name=None): + if name is None: + name = self.__class__.__name__ + self._id = name + '_' + str(uuid.uuid4())[-6:] + + def apply(self, sample, context=None): + """ Process a sample. + Args: + sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} + context (dict): info about this sample processing + Returns: + result (dict): a processed sample + """ + return sample + + def __call__(self, sample, context=None): + """ Process a sample. 
+ Args: + sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} + context (dict): info about this sample processing + Returns: + result (dict): a processed sample + """ + if isinstance(sample, Sequence): + for i in range(len(sample)): + sample[i] = self.apply(sample[i], context) + else: + sample = self.apply(sample, context) + return sample + + def __str__(self): + return str(self._id) + + +@register_op +class RGBReverse(BaseOperator): + """RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine + """ + + def __init__(self): + super(RGBReverse, self).__init__() + + def apply(self, sample, context=None): + im = sample['image'] + sample['image'] = np.ascontiguousarray(im[:, :, ::-1]) + return sample + + +@register_op +class RandomHSV(BaseOperator): + """ + HSV color-space augmentation + """ + + def __init__(self, hgain=0.015, sgain=0.7, vgain=0.4): + super(RandomHSV, self).__init__() + self.gains = [hgain, sgain, vgain] + + def __call__(self, sample, context=None): + im = sample['image'] + r = np.random.uniform(-1, 1, 3) * self.gains + 1 + hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV)) + + x = np.arange(0, 256, dtype=r.dtype) + lut_hue = ((x * r[0]) % 180).astype(np.uint8) + lut_sat = np.clip(x * r[1], 0, 255).astype(np.uint8) + lut_val = np.clip(x * r[2], 0, 255).astype(np.uint8) + + im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), + cv2.LUT(val, lut_val))) + im_hsv_ = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR) + sample['image'] = im_hsv_.astype(np.float32, copy=False) + return sample + + +@register_op +class MosaicPerspective(BaseOperator): + """ + Mosaic Data Augmentation and Perspective + The code is based on https://github.com/ultralytics/yolov5 + and https://github.com/WongKinYiu/yolov7 + + 1. get mosaic coords, _mosaic_preprocess, get mosaic_labels + 2. random_perspective augment + 3. copy_paste,mixup,paste_in + """ + + def __init__(self, + target_size=[640, 640], + mosaic_prob=1.0, + mixup_prob=0.0, + copy_paste_prob=0.0, + paste_in_prob=0.0, + fill_value=114, + degrees=0.0, + translate=0.1, + scale=0.5, + shear=0.0, + perspective=0.0): + super(MosaicPerspective, self).__init__() + self.mosaic_prob = mosaic_prob + self.mixup_prob = mixup_prob + self.copy_paste_prob = copy_paste_prob # no use + self.paste_in_prob = paste_in_prob + + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + self.mosaic_border = (-target_size[0] // 2, -target_size[1] // 2) + self.fill_value = fill_value + self.degrees = degrees + self.translate = translate + self.scale = scale + self.shear = shear + self.perspective = perspective + + def xywhn2xyxy(self, x, w=640, h=640, padw=0, padh=0): + # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + y = np.copy(x) + y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x + y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y + y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x + y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y + return y + + def _mosaic4_preprocess(self, sample): + s = self.target_size[0] + # select mosaic center (x, y) + yc, xc = (int(random.uniform(-x, 2 * s + x)) + for x in self.mosaic_border) + gt_bboxes = [x['gt_bbox'] for x in sample] + for i in range(len(sample)): + im = sample[i]['image'] + h, w, c = im.shape + # x1a, y1a, x2a, y2a: large image + # x1b, y1b, x2b, y2b: small image + if i == 0: + # top left. 
background + image = np.full( + (s * 2, s * 2, c), self.fill_value, dtype=np.uint8) + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h + elif i == 1: + # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: + # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: + # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, + yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + image[y1a:y2a, x1a:x2a] = im[y1b:y2b, x1b:x2b] + padw = x1a - x1b + padh = y1a - y1b + gt_bboxes[i] = self.xywhn2xyxy(gt_bboxes[i], w, h, padw, padh) + + gt_bboxes = np.concatenate(gt_bboxes, axis=0) + gt_bboxes = np.clip(gt_bboxes, 0, s * 2) + gt_classes = [x['gt_class'] for x in sample] + gt_classes = np.concatenate(gt_classes, axis=0) + return image, gt_classes, gt_bboxes + + def letterbox_resize(self, + img, + gt_bboxes, + new_shape=(640, 640), + color=(114, 114, 114)): + shape = img.shape[:2] # [height, width] + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + # r = min(r, 1.0) + ratio = r, r + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] + dw /= 2 + dh /= 2 + if shape[::-1] != new_unpad: + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) + + gt_bboxes[:, 0] = ratio[0] * gt_bboxes[:, 0] + dw + gt_bboxes[:, 1] = ratio[1] * gt_bboxes[:, 1] + dh + gt_bboxes[:, 2] = ratio[0] * gt_bboxes[:, 2] + dw + gt_bboxes[:, 3] = ratio[1] * gt_bboxes[:, 3] + dh + return img, gt_bboxes + + def random_perspective(self, + im, + gt_classes, + gt_box, + degrees=10, + translate=.1, + scale=.1, + shear=10, + perspective=0.0, + border=(0, 0)): + targets = np.concatenate((gt_classes, gt_box), 1) + height = im.shape[0] + border[0] * 2 # shape(h,w,c) + width = im.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -im.shape[1] / 2 # x translation (pixels) + C[1, 2] = -im.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, + perspective) # x perspective (about y) + P[2, 1] = random.uniform(-perspective, + perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / + 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / + 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform( + 0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform( + 0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @S @R @P @C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or ( + M != 
np.eye(3)).any(): # image changed + if perspective: + im = cv2.warpPerspective( + im, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + im = cv2.warpAffine( + im, + M[:2], + dsize=(width, height), + borderValue=(114, 114, 114)) + + # Transform label coordinates + n = len(targets) + if n: + use_segments = False + if 1: # warp boxes + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape( + n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @M.T # transform + xy = (xy[:, :2] / xy[:, 2:3] if perspective else + xy[:, :2]).reshape(n, 8) # perspective rescale or affine + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + new = np.concatenate( + (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # clip + new[:, [0, 2]] = new[:, [0, 2]].clip(0, width) + new[:, [1, 3]] = new[:, [1, 3]].clip(0, height) + + # filter candidates + i = self.box_candidates( + box1=targets[:, 1:5].T * s, + box2=new.T, + area_thr=0.01 if use_segments else 0.10) + targets = targets[i] + targets[:, 1:5] = new[i] + + return im, targets[:, 0:1], targets[:, 1:5] + + def box_candidates(self, + box1, + box2, + wh_thr=2, + ar_thr=100, + area_thr=0.1, + eps=1e-16): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & ( + w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr) # candidates + + def clip_coords(self, boxes, shape): + # Clip bounding xyxy bounding boxes to image shape (height, width + boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 + boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 + return boxes + + def __call__(self, sample, context=None): + # current sample and other 3 samples to a new sample + if not isinstance(sample, Sequence): + return sample + # assert len(sample) == 5 or len( + # sample) == 10, 'YOLOv5 Mosaic need 4 or 9 samples and 1 for mixup' + + # 0.no mosaic + if random.random() >= self.mosaic_prob: + sample0 = sample[0] + sample0['image'], sample0['gt_bbox'] = self.letterbox_resize( + sample0['image'], sample0['gt_bbox'], self.target_size) + return sample0 + + random.shuffle(sample) + + # 1._mosaic_preprocess + mosaic_img, mosaic_gt_classes, mosaic_gt_bboxes = self._mosaic4_preprocess( + sample[:4]) + + # 2.random_perspective + mosaic_img, mosaic_gt_classes, mosaic_gt_bboxes = self.random_perspective( + mosaic_img, mosaic_gt_classes, mosaic_gt_bboxes, self.degrees, + self.translate, self.scale, self.shear, self.perspective, + self.mosaic_border) + + # 3.copy_paste + # 4.mixup + if len(mosaic_gt_bboxes) and random.random() < self.mixup_prob: + sample4 = sample[4] + img4, gt_bboxes = self.letterbox_resize( + sample4['image'], sample4['gt_bbox'], self.target_size) + + r = np.random.beta(8.0, 8.0) + mosaic_img = (mosaic_img * r + img4 * (1 - r)) #.astype(np.uint8) + mosaic_gt_classes = np.concatenate( + (mosaic_gt_classes, sample4['gt_class']), 0) + mosaic_gt_bboxes = np.concatenate((mosaic_gt_bboxes, gt_bboxes), 0) + + # 5.paste_in + # 6.clip + nl = len(mosaic_gt_bboxes) + eps = 1E-3 + if nl: + mosaic_gt_bboxes[:, 0:4] = self.clip_coords( + mosaic_gt_bboxes[:, 0:4], + (mosaic_img.shape[0] - eps, mosaic_img.shape[1] - eps)) + + sample = sample[0] # list to one sample + sample['image'] = mosaic_img.astype(np.uint8) + 
sample['gt_bbox'] = mosaic_gt_bboxes + sample['gt_class'] = mosaic_gt_classes + + if 'difficult' in sample: + sample.pop('difficult') + if 'is_crowd' in sample: + sample.pop('is_crowd') + return sample + + +@register_op +class Decode(BaseOperator): + def __init__(self): + """ Transform the image data to numpy format following the rgb format + """ + super(Decode, self).__init__() + + def apply(self, sample, context=None): + """ load image if 'im_file' field is not empty but 'image' is""" + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + sample.pop('im_file') + + try: + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + except: + im = sample['image'] + + sample['image'] = im + if 'h' not in sample: + sample['h'] = im.shape[0] + elif sample['h'] != im.shape[0]: + logger.warning( + "The actual image height: {} is not equal to the " + "height: {} in annotation, and update sample['h'] by actual " + "image height.".format(im.shape[0], sample['h'])) + sample['h'] = im.shape[0] + if 'w' not in sample: + sample['w'] = im.shape[1] + elif sample['w'] != im.shape[1]: + logger.warning( + "The actual image width: {} is not equal to the " + "width: {} in annotation, and update sample['w'] by actual " + "image width.".format(im.shape[1], sample['w'])) + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return sample + + +def _make_dirs(dirname): + try: + from pathlib import Path + except ImportError: + from pathlib2 import Path + Path(dirname).mkdir(exist_ok=True) + + +@register_op +class DecodeCache(BaseOperator): + def __init__(self, cache_root=None): + '''decode image and caching + ''' + super(DecodeCache, self).__init__() + + self.use_cache = False if cache_root is None else True + self.cache_root = cache_root + + if cache_root is not None: + _make_dirs(cache_root) + + def apply(self, sample, context=None): + + if self.use_cache and os.path.exists( + self.cache_path(self.cache_root, sample['im_file'])): + path = self.cache_path(self.cache_root, sample['im_file']) + im = self.load(path) + + else: + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + + if self.use_cache and not os.path.exists( + self.cache_path(self.cache_root, sample['im_file'])): + path = self.cache_path(self.cache_root, sample['im_file']) + self.dump(im, path) + + sample['image'] = im + sample['h'] = im.shape[0] + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + + sample.pop('im_file') + + return sample + + @staticmethod + def cache_path(dir_oot, im_file): + return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') + + @staticmethod + def load(path): + with open(path, 'rb') as f: + im = pickle.load(f) + return im + + @staticmethod + def dump(obj, path): + MUTEX.acquire() + try: + with open(path, 'wb') as f: + pickle.dump(obj, f) + + except Exception as e: + 
logger.warning('dump {} occurs exception {}'.format(path, str(e))) + + finally: + MUTEX.release() + + +@register_op +class SniperDecodeCrop(BaseOperator): + def __init__(self): + super(SniperDecodeCrop, self).__init__() + + def __call__(self, sample, context=None): + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + sample.pop('im_file') + + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + + chip = sample['chip'] + x1, y1, x2, y2 = [int(xi) for xi in chip] + im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[ + 1]), :] + + sample['image'] = im + h = im.shape[0] + w = im.shape[1] + # sample['im_info'] = [h, w, 1.0] + sample['h'] = h + sample['w'] = w + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return sample + + +@register_op +class Permute(BaseOperator): + def __init__(self): + """ + Change the channel to be (C, H, W) + """ + super(Permute, self).__init__() + + def apply(self, sample, context=None): + im = sample['image'] + im = im.transpose((2, 0, 1)) + sample['image'] = im + + if 'pre_image' in sample: + pre_im = sample['pre_image'] + pre_im = pre_im.transpose((2, 0, 1)) + sample['pre_image'] = pre_im + return sample + + +@register_op +class Lighting(BaseOperator): + """ + Lighting the image by eigenvalues and eigenvectors + Args: + eigval (list): eigenvalues + eigvec (list): eigenvectors + alphastd (float): random weight of lighting, 0.1 by default + """ + + def __init__(self, eigval, eigvec, alphastd=0.1): + super(Lighting, self).__init__() + self.alphastd = alphastd + self.eigval = np.array(eigval).astype('float32') + self.eigvec = np.array(eigvec).astype('float32') + + def apply(self, sample, context=None): + alpha = np.random.normal(scale=self.alphastd, size=(3, )) + sample['image'] += np.dot(self.eigvec, self.eigval * alpha) + + if 'pre_image' in sample: + sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha) + return sample + + +@register_op +class RandomErasingImage(BaseOperator): + def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): + """ + Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 + Args: + prob (float): probability to carry out random erasing + lower (float): lower limit of the erasing area ratio + higher (float): upper limit of the erasing area ratio + aspect_ratio (float): aspect ratio of the erasing region + """ + super(RandomErasingImage, self).__init__() + self.prob = prob + self.lower = lower + self.higher = higher + self.aspect_ratio = aspect_ratio + + def apply(self, sample, context=None): + gt_bbox = sample['gt_bbox'] + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image is not a numpy array.".format(self)) + if len(im.shape) != 3: + raise ImageError("{}: image is not 3-dimensional.".format(self)) + + for idx in range(gt_bbox.shape[0]): + if self.prob <= np.random.rand(): + continue + + x1, y1, x2, y2 = gt_bbox[idx, :] + w_bbox = x2 - x1 + h_bbox = y2 - y1 + area = w_bbox * h_bbox + + target_area = random.uniform(self.lower, self.higher) * area + aspect_ratio = random.uniform(self.aspect_ratio, + 1 / self.aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = 
int(round(math.sqrt(target_area / aspect_ratio))) + + if w < w_bbox and h < h_bbox: + off_y1 = random.randint(0, int(h_bbox - h)) + off_x1 = random.randint(0, int(w_bbox - w)) + im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( + x1 + off_x1 + w), :] = 0 + sample['image'] = im + return sample + + +@register_op +class NormalizeImage(BaseOperator): + def __init__(self, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + is_scale=True, + norm_type='mean_std'): + """ + Args: + mean (list): the pixel mean + std (list): the pixel variance + is_scale (bool): scale the pixel to [0,1] + norm_type (str): type in ['mean_std', 'none'] + """ + super(NormalizeImage, self).__init__() + self.mean = mean + self.std = std + self.is_scale = is_scale + self.norm_type = norm_type + if not (isinstance(self.mean, list) and isinstance(self.std, list) and + isinstance(self.is_scale, bool) and + self.norm_type in ['mean_std', 'none']): + raise TypeError("{}: input type is invalid.".format(self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def apply(self, sample, context=None): + """Normalize the image. + Operators: + 1.(optional) Scale the pixel to [0,1] + 2.(optional) Each pixel minus mean and is divided by std + """ + im = sample['image'] + + im = im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std + + sample['image'] = im + + if 'pre_image' in sample: + pre_im = sample['pre_image'] + pre_im = pre_im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + pre_im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + pre_im -= mean + pre_im /= std + sample['pre_image'] = pre_im + + return sample + + +@register_op +class GridMask(BaseOperator): + def __init__(self, + use_h=True, + use_w=True, + rotate=1, + offset=False, + ratio=0.5, + mode=1, + prob=0.7, + upper_iter=360000): + """ + GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086 + Args: + use_h (bool): whether to mask vertically + use_w (boo;): whether to mask horizontally + rotate (float): angle for the mask to rotate + offset (float): mask offset + ratio (float): mask ratio + mode (int): gridmask mode + prob (float): max probability to carry out gridmask + upper_iter (int): suggested to be equal to global max_iter + """ + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.prob = prob + self.upper_iter = upper_iter + + from .gridmask_utils import Gridmask + self.gridmask_op = Gridmask( + use_h, + use_w, + rotate=rotate, + offset=offset, + ratio=ratio, + mode=mode, + prob=prob, + upper_iter=upper_iter) + + def apply(self, sample, context=None): + sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter']) + return sample + + +@register_op +class RandomDistort(BaseOperator): + """Random color distortion. + Args: + hue (list): hue settings. in [lower, upper, probability] format. + saturation (list): saturation settings. in [lower, upper, probability] format. + contrast (list): contrast settings. in [lower, upper, probability] format. + brightness (list): brightness settings. 
in [lower, upper, probability] format. + random_apply (bool): whether to apply in random (yolo) or fixed (SSD) + order. + count (int): the number of doing distrot + random_channel (bool): whether to swap channels randomly + """ + + def __init__(self, + hue=[-18, 18, 0.5], + saturation=[0.5, 1.5, 0.5], + contrast=[0.5, 1.5, 0.5], + brightness=[0.5, 1.5, 0.5], + random_apply=True, + count=4, + random_channel=False, + prob=1.0): + super(RandomDistort, self).__init__() + self.hue = hue + self.saturation = saturation + self.contrast = contrast + self.brightness = brightness + self.random_apply = random_apply + self.count = count + self.random_channel = random_channel + self.prob = prob + + def apply_hue(self, img): + low, high, prob = self.hue + if np.random.uniform(0., 1.) < prob: + return img + + img = img.astype(np.float32) + # it works, but result differ from HSV version + delta = np.random.uniform(low, high) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321], + [0.211, -0.523, 0.311]]) + ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647], + [1.0, -1.107, 1.705]]) + t = np.dot(np.dot(ityiq, bt), tyiq).T + img = np.dot(img, t) + return img + + def apply_saturation(self, img): + low, high, prob = self.saturation + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + # it works, but result differ from HSV version + gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + return img + + def apply_contrast(self, img): + low, high, prob = self.contrast + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + img *= delta + return img + + def apply_brightness(self, img): + low, high, prob = self.brightness + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + img += delta + return img + + def apply(self, sample, context=None): + if random.random() > self.prob: + return sample + img = sample['image'] + if self.random_apply: + functions = [ + self.apply_brightness, self.apply_contrast, + self.apply_saturation, self.apply_hue + ] + distortions = np.random.permutation(functions)[:self.count] + for func in distortions: + img = func(img) + sample['image'] = img + return sample + + img = self.apply_brightness(img) + mode = np.random.randint(0, 2) + + if mode: + img = self.apply_contrast(img) + + img = self.apply_saturation(img) + img = self.apply_hue(img) + + if not mode: + img = self.apply_contrast(img) + + if self.random_channel: + if np.random.randint(0, 2): + img = img[..., np.random.permutation(3)] + sample['image'] = img + return sample + + +@register_op +class PhotoMetricDistortion(BaseOperator): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. 
+ hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + super(PhotoMetricDistortion, self).__init__() + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def apply(self, results, context=None): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + + img = results['image'] + img = img.astype(np.float32) + # random brightness + if np.random.randint(2): + delta = np.random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = np.random.randint(2) + if mode == 1: + if np.random.randint(2): + alpha = np.random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + + # random saturation + if np.random.randint(2): + img[..., 1] *= np.random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if np.random.randint(2): + img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR) + + # random contrast + if mode == 0: + if np.random.randint(2): + alpha = np.random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if np.random.randint(2): + img = img[..., np.random.permutation(3)] + + results['image'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +@register_op +class AutoAugment(BaseOperator): + def __init__(self, autoaug_type="v1"): + """ + Args: + autoaug_type (str): autoaug type, support v0, v1, v2, v3, test + """ + super(AutoAugment, self).__init__() + self.autoaug_type = autoaug_type + + def apply(self, sample, context=None): + """ + Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 + """ + im = sample['image'] + gt_bbox = sample['gt_bbox'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image is not a numpy array.".format(self)) + if len(im.shape) != 3: + raise ImageError("{}: image is not 3-dimensional.".format(self)) + if len(gt_bbox) == 0: + return sample + + height, width, _ = im.shape + norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) + norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) + norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) + norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) + norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) + + from .autoaugment_utils import distort_image_with_autoaugment + im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, + self.autoaug_type) + + gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) + gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) + gt_bbox[:, 2] = norm_gt_bbox[:, 3] * 
float(width) + gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height) + + sample['image'] = im + sample['gt_bbox'] = gt_bbox + return sample + + +@register_op +class RandomFlip(BaseOperator): + def __init__(self, prob=0.5): + """ + Args: + prob (float): the probability of flipping image + """ + super(RandomFlip, self).__init__() + self.prob = prob + if not (isinstance(self.prob, float)): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply_segm(self, segms, height, width): + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + flipped_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + import pycocotools.mask as mask_util + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + def apply_keypoint(self, gt_keypoint, width): + for i in range(gt_keypoint.shape[1]): + if i % 2 == 0: + old_x = gt_keypoint[:, i].copy() + gt_keypoint[:, i] = width - old_x + return gt_keypoint + + def apply_image(self, image): + return image[:, ::-1, :] + + def apply_bbox(self, bbox, width): + oldx1 = bbox[:, 0].copy() + oldx2 = bbox[:, 2].copy() + bbox[:, 0] = width - oldx2 + bbox[:, 2] = width - oldx1 + return bbox + + def apply(self, sample, context=None): + """Filp the image and bounding box. + Operators: + 1. Flip the image numpy. + 2. Transform the bboxes' x coordinates. + (Must judge whether the coordinates are normalized!) + 3. Transform the segmentations' x coordinates. + (Must judge whether the coordinates are normalized!) + Output: + sample: the image, bounding box and segmentation part + in sample are flipped. + """ + if np.random.uniform(0, 1) < self.prob: + im = sample['image'] + height, width = im.shape[:2] + im = self.apply_image(im) + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, + width) + if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: + sample['gt_keypoint'] = self.apply_keypoint( + sample['gt_keypoint'], width) + + if 'semantic' in sample and sample['semantic']: + sample['semantic'] = sample['semantic'][:, ::-1] + + if 'gt_segm' in sample and sample['gt_segm'].any(): + sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] + + sample['flipped'] = True + sample['image'] = im + return sample + + +@register_op +class Resize(BaseOperator): + def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): + """ + Resize image to target size. if keep_ratio is True, + resize the image's long side to the maximum of target_size + if keep_ratio is False, resize the image to target size(h, w) + Args: + target_size (int|list): image target size + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): the interpolation method + """ + super(Resize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. 
Must be Integer or List or Tuple, now is {}". + format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + + def apply_image(self, image, scale): + im_scale_x, im_scale_y = scale + + return cv2.resize( + image, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + + def apply_bbox(self, bbox, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + bbox[:, 0::2] *= im_scale_x + bbox[:, 1::2] *= im_scale_y + bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) + bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) + return bbox + + def apply_area(self, area, scale): + im_scale_x, im_scale_y = scale + return area * im_scale_x * im_scale_y + + def apply_joints(self, joints, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + joints[..., 0] *= im_scale_x + joints[..., 1] *= im_scale_y + joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) + joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) + return joints + + def apply_segm(self, segms, im_size, scale): + def _resize_poly(poly, im_scale_x, im_scale_y): + resized_poly = np.array(poly).astype('float32') + resized_poly[0::2] *= im_scale_x + resized_poly[1::2] *= im_scale_y + return resized_poly.tolist() + + def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, im_h, im_w) + + mask = mask_util.decode(rle) + mask = cv2.resize( + mask, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + im_h, im_w = im_size + im_scale_x, im_scale_y = scale + resized_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + resized_segms.append([ + _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm + ]) + else: + # RLE format + import pycocotools.mask as mask_util + resized_segms.append( + _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) + + return resized_segms + + def apply(self, sample, context=None): + """ Resize the image numpy. 
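+
+        When keep_ratio is True, the scale is chosen so that the short side
+        does not exceed min(target_size) and the long side does not exceed
+        max(target_size), i.e. im_scale = min(target_min / im_min,
+        target_max / im_max). Worked example (illustrative numbers only):
+        a 480x640 image with target_size=[800, 1333] gives
+        im_scale = min(800 / 480, 1333 / 640) = 1.6667, so the image is
+        resized to 800x1067.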
+ """ + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + + # apply image + if len(im.shape) == 3: + im_shape = im.shape + else: + im_shape = im[0].shape + + if self.keep_ratio: + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = int(im_scale * float(im_shape[0]) + 0.5) + resize_w = int(im_scale * float(im_shape[1]) + 0.5) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + if len(im.shape) == 3: + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im.astype(np.float32) + else: + resized_images = [] + for one_im in im: + applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) + resized_images.append(applied_im) + + sample['image'] = np.array(resized_images) + + # 2d keypoints resize + if 'kps2d' in sample.keys(): + kps2d = sample['kps2d'] + kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x + kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y + + sample['kps2d'] = kps2d + + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply areas + if 'gt_areas' in sample: + sample['gt_areas'] = self.apply_area(sample['gt_areas'], + [im_scale_x, im_scale_y]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], + [im_scale_x, im_scale_y]) + + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + if 'gt_joints' in sample: + sample['gt_joints'] = self.apply_joints(sample['gt_joints'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + return sample + + +@register_op +class MultiscaleTestResize(BaseOperator): + def __init__(self, + origin_target_size=[800, 1333], + target_size=[], + interp=cv2.INTER_LINEAR, + use_flip=True): + """ + Rescale image to the each size in target size, and capped at max_size. + Args: + origin_target_size (list): origin target size of image + target_size (list): A list of target sizes of image. + interp (int): the interpolation method. + use_flip (bool): whether use flip augmentation. 
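+
+        A minimal usage sketch (illustrative sizes; assumes `sample` has
+        already been decoded so 'image' is a numpy array):
+
+            ms_resize = MultiscaleTestResize(
+                origin_target_size=[800, 1333],
+                target_size=[400, 600],
+                use_flip=True)
+            samples = ms_resize(sample)
+            # returns a list: the origin-scale resize, a flipped copy of the
+            # input (RandomFlip is built with prob > 1, so it always flips),
+            # then one keep-ratio resize per entry in target_size.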
+ """ + super(MultiscaleTestResize, self).__init__() + self.interp = interp + self.use_flip = use_flip + + if not isinstance(target_size, Sequence): + raise TypeError( + "Type of target_size is invalid. Must be List or Tuple, now is {}". + format(type(target_size))) + self.target_size = target_size + + if not isinstance(origin_target_size, Sequence): + raise TypeError( + "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". + format(type(origin_target_size))) + + self.origin_target_size = origin_target_size + + def apply(self, sample, context=None): + """ Resize the image numpy for multi-scale test. + """ + samples = [] + resizer = Resize( + self.origin_target_size, keep_ratio=True, interp=self.interp) + samples.append(resizer(sample.copy(), context)) + if self.use_flip: + flipper = RandomFlip(1.1) + samples.append(flipper(sample.copy(), context=context)) + + for size in self.target_size: + resizer = Resize(size, keep_ratio=True, interp=self.interp) + samples.append(resizer(sample.copy(), context)) + + return samples + + +@register_op +class RandomResize(BaseOperator): + def __init__(self, + target_size, + keep_ratio=True, + interp=cv2.INTER_LINEAR, + random_range=False, + random_size=True, + random_interp=False): + """ + Resize image to target size randomly. random target_size and interpolation method + Args: + target_size (int, list, tuple): image target size, if random size is True, must be list or tuple + keep_ratio (bool): whether keep_raio or not, default true + interp (int): the interpolation method + random_range (bool): whether random select target size of image, the target_size must be + a [[min_short_edge, long_edge], [max_short_edge, long_edge]] + random_size (bool): whether random select target size of image + random_interp (bool): whether random select interpolation method + """ + super(RandomResize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + assert isinstance(target_size, ( + Integral, Sequence)), "target_size must be Integer, List or Tuple" + if (random_range or random_size) and not isinstance(target_size, + Sequence): + raise TypeError( + "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". + format(type(target_size))) + if random_range and not len(target_size) == 2: + raise TypeError( + "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." + ) + self.target_size = target_size + self.random_range = random_range + self.random_size = random_size + self.random_interp = random_interp + + def apply(self, sample, context=None): + """ Resize the image numpy. + """ + if self.random_range: + short_edge = np.random.randint(self.target_size[0][0], + self.target_size[1][0] + 1) + long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) + target_size = [short_edge, long_edge] + else: + if self.random_size: + target_size = random.choice(self.target_size) + else: + target_size = self.target_size + + if self.random_interp: + interp = random.choice(self.interps) + else: + interp = self.interp + + resizer = Resize(target_size, self.keep_ratio, interp) + return resizer(sample, context=context) + + +@register_op +class RandomExpand(BaseOperator): + """Random expand the canvas. + Args: + ratio (float): maximum expansion ratio. + prob (float): probability to expand. 
+ fill_value (list): color value used to fill the canvas. in RGB order. + """ + + def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): + super(RandomExpand, self).__init__() + assert ratio > 1.01, "expand ratio must be larger than 1.01" + self.ratio = ratio + self.prob = prob + assert isinstance(fill_value, (Number, Sequence)), \ + "fill value must be either float or sequence" + if isinstance(fill_value, Number): + fill_value = (fill_value, ) * 3 + if not isinstance(fill_value, tuple): + fill_value = tuple(fill_value) + self.fill_value = fill_value + + def apply(self, sample, context=None): + if np.random.uniform(0., 1.) < self.prob: + return sample + + im = sample['image'] + height, width = im.shape[:2] + ratio = np.random.uniform(1., self.ratio) + h = int(height * ratio) + w = int(width * ratio) + if not h > height or not w > width: + return sample + y = np.random.randint(0, h - height) + x = np.random.randint(0, w - width) + offsets, size = [x, y], [h, w] + + pad = Pad(size, + pad_mode=-1, + offsets=offsets, + fill_value=self.fill_value) + + return pad(sample, context=context) + + +@register_op +class CropWithSampling(BaseOperator): + def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True): + """ + Args: + batch_sampler (list): Multiple sets of different + parameters for cropping. + satisfy_all (bool): whether all boxes must satisfy. + e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap] + avoid_no_bbox (bool): whether to avoid the + situation where the box does not appear. + """ + super(CropWithSampling, self).__init__() + self.batch_sampler = batch_sampler + self.satisfy_all = satisfy_all + self.avoid_no_bbox = avoid_no_bbox + + def apply(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. + Returns: + sample: the image, bounding box are replaced. 
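+
+        Note: for every sampler entry, up to `max trial` candidate crops are
+        generated and at most `max sample` of them are kept (those meeting
+        the overlap constraints). The kept candidates are then tried in a
+        random order, and the first crop that still keeps at least one
+        ground-truth box (when avoid_no_bbox is True) is applied; if no
+        candidate qualifies, the sample is returned unchanged.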
+ """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + im_height, im_width = im.shape[:2] + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox(sampler) + if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, + self.satisfy_all): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + crop_bbox, crop_class, crop_score = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + xmin = int(sample_bbox[0] * im_width) + xmax = int(sample_bbox[2] * im_width) + ymin = int(sample_bbox[1] * im_height) + ymax = int(sample_bbox[3] * im_height) + im = im[ymin:ymax, xmin:xmax] + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + sample['gt_score'] = crop_score + return sample + return sample + + +@register_op +class CropWithDataAchorSampling(BaseOperator): + def __init__(self, + batch_sampler, + anchor_sampler=None, + target_size=None, + das_anchor_scales=[16, 32, 64, 128], + sampling_prob=0.5, + min_size=8., + avoid_no_bbox=True): + """ + Args: + anchor_sampler (list): anchor_sampling sets of different + parameters for cropping. + batch_sampler (list): Multiple sets of different + parameters for cropping. + e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] + [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap, min coverage, max coverage] + target_size (int): target image size. + das_anchor_scales (list[float]): a list of anchor scales in data + anchor smapling. + min_size (float): minimum size of sampled bbox. + avoid_no_bbox (bool): whether to avoid the + situation where the box does not appear. + """ + super(CropWithDataAchorSampling, self).__init__() + self.anchor_sampler = anchor_sampler + self.batch_sampler = batch_sampler + self.target_size = target_size + self.sampling_prob = sampling_prob + self.min_size = min_size + self.avoid_no_bbox = avoid_no_bbox + self.das_anchor_scales = np.array(das_anchor_scales) + + def apply(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. + Returns: + sample: the image, bounding box are replaced. 
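+
+        Note: a uniform draw picks the branch: with probability
+        (1 - sampling_prob) the data-anchor path is taken (anchor_sampler +
+        data_anchor_sampling), otherwise square crops are generated from
+        batch_sampler via generate_sample_bbox_square. In both branches the
+        candidates are filtered with satisfy_sample_constraint_coverage, and
+        ground-truth boxes whose area at target_size resolution falls below
+        min_size ** 2 are dropped by bbox_area_sampling.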
+ """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + image_height, image_width = im.shape[:2] + gt_bbox[:, 0] /= image_width + gt_bbox[:, 1] /= image_height + gt_bbox[:, 2] /= image_width + gt_bbox[:, 3] /= image_height + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + + prob = np.random.uniform(0., 1.) + if prob > self.sampling_prob: # anchor sampling + assert self.anchor_sampler + for sampler in self.anchor_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = data_anchor_sampling( + gt_bbox, image_width, image_height, + self.das_anchor_scales, self.target_size) + if sample_bbox == 0: + break + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + im = crop_image_sampling(im, sample_bbox, image_width, + image_height, self.target_size) + height, width = im.shape[:2] + crop_bbox[:, 0] *= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + else: + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox_square( + sampler, image_width, image_height) + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + # sampling bbox according the bbox area + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + xmin = int(sample_bbox[0] * image_width) + xmax = int(sample_bbox[2] * image_width) + ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + im = im[ymin:ymax, xmin:xmax] + height, width = im.shape[:2] + crop_bbox[:, 0] 
*= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + +@register_op +class RandomCrop(BaseOperator): + """Random crop image and bboxes. + Args: + aspect_ratio (list): aspect ratio of cropped region. + in [min, max] format. + thresholds (list): iou thresholds for decide a valid bbox crop. + scaling (list): ratio between a cropped region and the original image. + in [min, max] format. + num_attempts (int): number of tries before giving up. + allow_no_crop (bool): allow return without actually cropping them. + cover_all_box (bool): ensure all bboxes are covered in the final crop. + is_mask_crop(bool): whether crop the segmentation. + """ + + def __init__(self, + aspect_ratio=[.5, 2.], + thresholds=[.0, .1, .3, .5, .7, .9], + scaling=[.3, 1.], + num_attempts=50, + allow_no_crop=True, + cover_all_box=False, + is_mask_crop=False, + ioumode="iou", + prob=1.0): + super(RandomCrop, self).__init__() + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + self.is_mask_crop = is_mask_crop + self.ioumode = ioumode + self.prob = prob + + def crop_segms(self, segms, valid_ids, crop, height, width): + def _crop_poly(segm, crop): + xmin, ymin, xmax, ymax = crop + crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] + crop_p = np.array(crop_coord).reshape(4, 2) + crop_p = Polygon(crop_p) + + crop_segm = list() + for poly in segm: + poly = np.array(poly).reshape(len(poly) // 2, 2) + polygon = Polygon(poly) + if not polygon.is_valid: + exterior = polygon.exterior + multi_lines = exterior.intersection(exterior) + polygons = shapely.ops.polygonize(multi_lines) + polygon = MultiPolygon(polygons) + multi_polygon = list() + if isinstance(polygon, MultiPolygon): + multi_polygon = copy.deepcopy(polygon) + else: + multi_polygon.append(copy.deepcopy(polygon)) + for per_polygon in multi_polygon: + inter = per_polygon.intersection(crop_p) + if not inter: + continue + if isinstance(inter, (MultiPolygon, GeometryCollection)): + for part in inter: + if not isinstance(part, Polygon): + continue + part = np.squeeze( + np.array(part.exterior.coords[:-1]).reshape(1, + -1)) + part[0::2] -= xmin + part[1::2] -= ymin + crop_segm.append(part.tolist()) + elif isinstance(inter, Polygon): + crop_poly = np.squeeze( + np.array(inter.exterior.coords[:-1]).reshape(1, -1)) + crop_poly[0::2] -= xmin + crop_poly[1::2] -= ymin + crop_segm.append(crop_poly.tolist()) + else: + continue + return crop_segm + + def _crop_rle(rle, crop, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[crop[1]:crop[3], crop[0]:crop[2]] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + crop_segms = [] + for id in valid_ids: + segm = segms[id] + if is_poly(segm): + import copy + import shapely.ops + from shapely.geometry import Polygon, MultiPolygon, GeometryCollection + logging.getLogger("shapely").setLevel(logging.WARNING) + # Polygon format + crop_segms.append(_crop_poly(segm, crop)) + else: + # RLE format + import 
pycocotools.mask as mask_util + crop_segms.append(_crop_rle(segm, crop, height, width)) + return crop_segms + + def set_fake_bboxes(self, sample): + sample['gt_bbox'] = np.array( + [ + [32, 32, 128, 128], + [32, 32, 128, 256], + [32, 64, 128, 128], + [32, 64, 128, 256], + [64, 64, 128, 256], + [64, 64, 256, 256], + [64, 32, 128, 256], + [64, 32, 128, 256], + [96, 32, 128, 256], + [96, 32, 128, 256], + ], + dtype=np.float32) + sample['gt_class'] = np.array( + [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32) + return sample + + def apply(self, sample, context=None): + if random.random() > self.prob: + return sample + + if 'gt_bbox' not in sample: + # only used in semi-det as unsup data + sample = self.set_fake_bboxes(sample) + sample = self.random_crop(sample, fake_bboxes=True) + return sample + + if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: + return sample + sample = self.random_crop(sample) + return sample + + def random_crop(self, sample, fake_bboxes=False): + h, w = sample['image'].shape[:2] + gt_bbox = sample['gt_bbox'] + + # NOTE Original method attempts to generate one candidate for each + # threshold then randomly sample one from the resulting list. + # Here a short circuit approach is taken, i.e., randomly choose a + # threshold and attempt to find a valid crop, and simply return the + # first one found. + # The probability is not exactly the same, kinda resembling the + # "Monty Hall" problem. Actually carrying out the attempts will affect + # observability (just like opening doors in the "Monty Hall" game). + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + np.random.shuffle(thresholds) + + for thresh in thresholds: + if thresh == 'no_crop': + return sample + + found = False + for i in range(self.num_attempts): + scale = np.random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = np.random.uniform(*self.scaling) + w_scale = np.random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + crop_h = int(crop_h) + crop_w = int(crop_w) + crop_y = np.random.randint(0, h - crop_h) + crop_x = np.random.randint(0, w - crop_w) + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + if self.ioumode == "iof": + iou = self._gtcropiou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + elif self.ioumode == "iou": + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + if self.is_mask_crop and 'gt_poly' in sample and len(sample[ + 'gt_poly']) > 0: + crop_polys = self.crop_segms( + sample['gt_poly'], + valid_ids, + np.array( + crop_box, dtype=np.int64), + h, + w) + if [] in crop_polys: + delete_id = list() + valid_polys = list() + for id, crop_poly in enumerate(crop_polys): + if crop_poly == []: + delete_id.append(id) + else: + valid_polys.append(crop_poly) + valid_ids = np.delete(valid_ids, delete_id) + if len(valid_polys) == 0: + return sample + sample['gt_poly'] 
= valid_polys + else: + sample['gt_poly'] = crop_polys + + if 'gt_segm' in sample: + sample['gt_segm'] = self._crop_segm(sample['gt_segm'], + crop_box) + sample['gt_segm'] = np.take( + sample['gt_segm'], valid_ids, axis=0) + + sample['image'] = self._crop_image(sample['image'], crop_box) + if fake_bboxes == True: + return sample + + sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) + sample['gt_class'] = np.take( + sample['gt_class'], valid_ids, axis=0) + if 'gt_score' in sample: + sample['gt_score'] = np.take( + sample['gt_score'], valid_ids, axis=0) + + if 'is_crowd' in sample: + sample['is_crowd'] = np.take( + sample['is_crowd'], valid_ids, axis=0) + + if 'difficult' in sample: + sample['difficult'] = np.take( + sample['difficult'], valid_ids, axis=0) + + if 'gt_joints' in sample: + sample['gt_joints'] = self._crop_joints(sample['gt_joints'], + crop_box) + + return sample + + return sample + + def _iou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_o + 1e-10) + + def _gtcropiou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_a + 1e-10) + + def _crop_box_with_center_constraint(self, box, crop): + cropped_box = box.copy() + + cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) + cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) + cropped_box[:, :2] -= crop[:2] + cropped_box[:, 2:] -= crop[:2] + + centers = (box[:, :2] + box[:, 2:]) / 2 + valid = np.logical_and(crop[:2] <= centers, + centers < crop[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return cropped_box, np.where(valid)[0] + + def _crop_image(self, img, crop): + x1, y1, x2, y2 = crop + return img[y1:y2, x1:x2, :] + + def _crop_segm(self, segm, crop): + x1, y1, x2, y2 = crop + return segm[:, y1:y2, x1:x2] + + def _crop_joints(self, joints, crop): + x1, y1, x2, y2 = crop + joints[joints[..., 0] > x2, :] = 0 + joints[joints[..., 1] > y2, :] = 0 + joints[joints[..., 0] < x1, :] = 0 + joints[joints[..., 1] < y1, :] = 0 + joints[..., 0] -= x1 + joints[..., 1] -= y1 + return joints + + +@register_op +class RandomScaledCrop(BaseOperator): + """Resize image and bbox based on long side (with optional random scaling), + then crop or pad image to target size. + Args: + target_size (int|list): target size, "hw" format. + scale_range (list): random scale range. + interp (int): interpolation method, default to `cv2.INTER_LINEAR`. + fill_value (float|list|tuple): color value used to fill the canvas, + in RGB order. 
+ """ + + def __init__(self, + target_size=512, + scale_range=[.1, 2.], + interp=cv2.INTER_LINEAR, + fill_value=(123.675, 116.28, 103.53)): + super(RandomScaledCrop, self).__init__() + assert isinstance(target_size, ( + Integral, Sequence)), "target_size must be Integer, List or Tuple" + if isinstance(target_size, Integral): + target_size = [target_size, ] * 2 + + self.target_size = target_size + self.scale_range = scale_range + self.interp = interp + assert isinstance(fill_value, (Number, Sequence)), \ + "fill value must be either float or sequence" + if isinstance(fill_value, Number): + fill_value = (fill_value, ) * 3 + if not isinstance(fill_value, tuple): + fill_value = tuple(fill_value) + self.fill_value = fill_value + + def apply_image(self, img, output_size, offset_x, offset_y): + th, tw = self.target_size + rh, rw = output_size + img = cv2.resize( + img, (rw, rh), interpolation=self.interp).astype(np.float32) + canvas = np.ones([th, tw, 3], dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[:min(th, rh), :min(tw, rw)] = \ + img[offset_y:offset_y + th, offset_x:offset_x + tw] + return canvas + + def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y): + th, tw = self.target_size + shift_array = np.array( + [ + offset_x, + offset_y, + ] * 2, dtype=np.float32) + boxes = gt_bbox * scale - shift_array + boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw) + boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th) + # filter boxes with no area + area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1) + valid = (area > 1.).nonzero()[0] + return boxes[valid], gt_class[valid], valid + + def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None): + th, tw = self.target_size + rh, rw = output_size + out_segms = [] + for segm in segms: + segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST) + segm = segm.astype(np.float32) + canvas = np.zeros([th, tw], dtype=segm.dtype) + canvas[:min(th, rh), :min(tw, rw)] = \ + segm[offset_y:offset_y + th, offset_x:offset_x + tw] + out_segms.append(canvas) + out_segms = np.stack(out_segms) + return out_segms if valid is None else out_segms[valid] + + def apply(self, sample, context=None): + img = sample['image'] + h, w = img.shape[:2] + random_scale = np.random.uniform(*self.scale_range) + target_scale_size = [t * random_scale for t in self.target_size] + # Compute actual rescaling applied to image. 
+ scale = min(target_scale_size[0] / h, target_scale_size[1] / w) + output_size = [int(round(h * scale)), int(round(w * scale))] + # get offset + offset_x = int( + max(0, np.random.uniform(0., output_size[1] - self.target_size[1]))) + offset_y = int( + max(0, np.random.uniform(0., output_size[0] - self.target_size[0]))) + + # apply to image + sample['image'] = self.apply_image(img, output_size, offset_x, offset_y) + + # apply to bbox + valid = None + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox( + sample['gt_bbox'], sample['gt_class'], scale, offset_x, + offset_y) + + # apply to segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size, + offset_x, offset_y, valid) + + sample['im_shape'] = np.asarray(output_size, dtype=np.float32) + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * scale, scale_factor[1] * scale], + dtype=np.float32) + + return sample + + +@register_op +class Cutmix(BaseOperator): + def __init__(self, alpha=1.5, beta=1.5): + """ + CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899 + Cutmix image and gt_bbbox/gt_score + Args: + alpha (float): alpha parameter of beta distribute + beta (float): beta parameter of beta distribute + """ + super(Cutmix, self).__init__() + self.alpha = alpha + self.beta = beta + if self.alpha <= 0.0: + raise ValueError("alpha shold be positive in {}".format(self)) + if self.beta <= 0.0: + raise ValueError("beta shold be positive in {}".format(self)) + + def apply_image(self, img1, img2, factor): + """ _rand_bbox """ + h = max(img1.shape[0], img2.shape[0]) + w = max(img1.shape[1], img2.shape[1]) + cut_rat = np.sqrt(1. - factor) + + cut_w = np.int32(w * cut_rat) + cut_h = np.int32(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w - 1) + bby1 = np.clip(cy - cut_h // 2, 0, h - 1) + bbx2 = np.clip(cx + cut_w // 2, 0, w - 1) + bby2 = np.clip(cy + cut_h // 2, 0, h - 1) + + img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32') + img_1_pad[:img1.shape[0], :img1.shape[1], :] = \ + img1.astype('float32') + img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32') + img_2_pad[:img2.shape[0], :img2.shape[1], :] = \ + img2.astype('float32') + img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :] + return img_1_pad + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len(sample) == 2, 'cutmix need two samples' + + factor = np.random.beta(self.alpha, self.beta) + factor = max(0.0, min(1.0, factor)) + if factor >= 1.0: + return sample[0] + if factor <= 0.0: + return sample[1] + img1 = sample[0]['image'] + img2 = sample[1]['image'] + img = self.apply_image(img1, img2, factor) + gt_bbox1 = sample[0]['gt_bbox'] + gt_bbox2 = sample[1]['gt_bbox'] + gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) + gt_class1 = sample[0]['gt_class'] + gt_class2 = sample[1]['gt_class'] + gt_class = np.concatenate((gt_class1, gt_class2), axis=0) + gt_score1 = np.ones_like(sample[0]['gt_class']) + gt_score2 = np.ones_like(sample[1]['gt_class']) + gt_score = np.concatenate( + (gt_score1 * factor, gt_score2 * (1. 
- factor)), axis=0) + result = copy.deepcopy(sample[0]) + result['image'] = img + result['gt_bbox'] = gt_bbox + result['gt_score'] = gt_score + result['gt_class'] = gt_class + if 'is_crowd' in sample[0]: + is_crowd1 = sample[0]['is_crowd'] + is_crowd2 = sample[1]['is_crowd'] + is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) + result['is_crowd'] = is_crowd + if 'difficult' in sample[0]: + is_difficult1 = sample[0]['difficult'] + is_difficult2 = sample[1]['difficult'] + is_difficult = np.concatenate( + (is_difficult1, is_difficult2), axis=0) + result['difficult'] = is_difficult + return result + + +@register_op +class Mixup(BaseOperator): + def __init__(self, alpha=1.5, beta=1.5): + """ Mixup image and gt_bbbox/gt_score + Args: + alpha (float): alpha parameter of beta distribute + beta (float): beta parameter of beta distribute + """ + super(Mixup, self).__init__() + self.alpha = alpha + self.beta = beta + if self.alpha <= 0.0: + raise ValueError("alpha shold be positive in {}".format(self)) + if self.beta <= 0.0: + raise ValueError("beta shold be positive in {}".format(self)) + + def apply_image(self, img1, img2, factor): + h = max(img1.shape[0], img2.shape[0]) + w = max(img1.shape[1], img2.shape[1]) + img = np.zeros((h, w, img1.shape[2]), 'float32') + img[:img1.shape[0], :img1.shape[1], :] = \ + img1.astype('float32') * factor + img[:img2.shape[0], :img2.shape[1], :] += \ + img2.astype('float32') * (1.0 - factor) + return img.astype('uint8') + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len(sample) == 2, 'mixup need two samples' + + factor = np.random.beta(self.alpha, self.beta) + factor = max(0.0, min(1.0, factor)) + if factor >= 1.0: + return sample[0] + if factor <= 0.0: + return sample[1] + im = self.apply_image(sample[0]['image'], sample[1]['image'], factor) + result = copy.deepcopy(sample[0]) + result['image'] = im + # apply bbox and score + if 'gt_bbox' in sample[0]: + gt_bbox1 = sample[0]['gt_bbox'] + gt_bbox2 = sample[1]['gt_bbox'] + gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) + result['gt_bbox'] = gt_bbox + if 'gt_class' in sample[0]: + gt_class1 = sample[0]['gt_class'] + gt_class2 = sample[1]['gt_class'] + gt_class = np.concatenate((gt_class1, gt_class2), axis=0) + result['gt_class'] = gt_class + + gt_score1 = np.ones_like(sample[0]['gt_class']) + gt_score2 = np.ones_like(sample[1]['gt_class']) + gt_score = np.concatenate( + (gt_score1 * factor, gt_score2 * (1. 
- factor)), axis=0) + result['gt_score'] = gt_score.astype('float32') + if 'is_crowd' in sample[0]: + is_crowd1 = sample[0]['is_crowd'] + is_crowd2 = sample[1]['is_crowd'] + is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) + result['is_crowd'] = is_crowd + if 'difficult' in sample[0]: + is_difficult1 = sample[0]['difficult'] + is_difficult2 = sample[1]['difficult'] + is_difficult = np.concatenate( + (is_difficult1, is_difficult2), axis=0) + result['difficult'] = is_difficult + + if 'gt_ide' in sample[0]: + gt_ide1 = sample[0]['gt_ide'] + gt_ide2 = sample[1]['gt_ide'] + gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0) + result['gt_ide'] = gt_ide + return result + + +@register_op +class NormalizeBox(BaseOperator): + """Transform the bounding box's coornidates to [0,1].""" + + def __init__(self): + super(NormalizeBox, self).__init__() + + def apply(self, sample, context): + im = sample['image'] + gt_bbox = sample['gt_bbox'] + height, width, _ = im.shape + for i in range(gt_bbox.shape[0]): + gt_bbox[i][0] = gt_bbox[i][0] / width + gt_bbox[i][1] = gt_bbox[i][1] / height + gt_bbox[i][2] = gt_bbox[i][2] / width + gt_bbox[i][3] = gt_bbox[i][3] / height + sample['gt_bbox'] = gt_bbox + + if 'gt_keypoint' in sample.keys(): + gt_keypoint = sample['gt_keypoint'] + + for i in range(gt_keypoint.shape[1]): + if i % 2: + gt_keypoint[:, i] = gt_keypoint[:, i] / height + else: + gt_keypoint[:, i] = gt_keypoint[:, i] / width + sample['gt_keypoint'] = gt_keypoint + + return sample + + +@register_op +class BboxXYXY2XYWH(BaseOperator): + """ + Convert bbox XYXY format to XYWH format. + """ + + def __init__(self): + super(BboxXYXY2XYWH, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2] + bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2. + sample['gt_bbox'] = bbox + return sample + + +@register_op +class PadBox(BaseOperator): + def __init__(self, num_max_boxes=50): + """ + Pad zeros to bboxes if number of bboxes is less than num_max_boxes. 
+ Args: + num_max_boxes (int): the max number of bboxes + """ + self.num_max_boxes = num_max_boxes + super(PadBox, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + gt_num = min(self.num_max_boxes, len(bbox)) + num_max = self.num_max_boxes + # fields = context['fields'] if context else [] + pad_bbox = np.zeros((num_max, 4), dtype=np.float32) + if gt_num > 0: + pad_bbox[:gt_num, :] = bbox[:gt_num, :] + sample['gt_bbox'] = pad_bbox + if 'gt_class' in sample: + pad_class = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_class[:gt_num] = sample['gt_class'][:gt_num, 0] + sample['gt_class'] = pad_class + if 'gt_score' in sample: + pad_score = np.zeros((num_max, ), dtype=np.float32) + if gt_num > 0: + pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] + sample['gt_score'] = pad_score + # in training, for example in op ExpandImage, + # the bbox and gt_class is expandded, but the difficult is not, + # so, judging by it's length + if 'difficult' in sample: + pad_diff = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] + sample['difficult'] = pad_diff + if 'is_crowd' in sample: + pad_crowd = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] + sample['is_crowd'] = pad_crowd + if 'gt_ide' in sample: + pad_ide = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] + sample['gt_ide'] = pad_ide + return sample + + +@register_op +class DebugVisibleImage(BaseOperator): + """ + In debug mode, visualize images according to `gt_box`. + (Currently only supported when not cropping and flipping image.) + """ + + def __init__(self, output_dir='output/debug', is_normalized=False): + super(DebugVisibleImage, self).__init__() + self.is_normalized = is_normalized + self.output_dir = output_dir + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + if not isinstance(self.is_normalized, bool): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply(self, sample, context=None): + image = Image.fromarray(sample['image'].astype(np.uint8)) + out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) + width = sample['w'] + height = sample['h'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + draw = ImageDraw.Draw(image) + for i in range(gt_bbox.shape[0]): + if self.is_normalized: + gt_bbox[i][0] = gt_bbox[i][0] * width + gt_bbox[i][1] = gt_bbox[i][1] * height + gt_bbox[i][2] = gt_bbox[i][2] * width + gt_bbox[i][3] = gt_bbox[i][3] * height + + xmin, ymin, xmax, ymax = gt_bbox[i] + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill='green') + # draw label + text = str(gt_class[i][0]) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + if 'gt_keypoint' in sample.keys(): + gt_keypoint = sample['gt_keypoint'] + if self.is_normalized: + for i in range(gt_keypoint.shape[1]): + if i % 2: + gt_keypoint[:, i] = gt_keypoint[:, i] * height + else: + gt_keypoint[:, i] = gt_keypoint[:, i] * width + for i in range(gt_keypoint.shape[0]): + keypoint = gt_keypoint[i] + for j in range(int(keypoint.shape[0] / 2)): + x1 = round(keypoint[2 * j]).astype(np.int32) + y1 = round(keypoint[2 * j + 1]).astype(np.int32) + draw.ellipse( + (x1, y1, x1 + 5, y1 + 5), fill='green', 
outline='green') + save_path = os.path.join(self.output_dir, out_file_name) + image.save(save_path, quality=95) + return sample + + +@register_op +class Pad(BaseOperator): + def __init__(self, + size=None, + size_divisor=32, + pad_mode=0, + offsets=None, + fill_value=(127.5, 127.5, 127.5)): + """ + Pad image to a specified size or multiple of size_divisor. + Args: + size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None + size_divisor (int): size divisor, default 32 + pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets + if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top + offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1 + fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5) + """ + super(Pad, self).__init__() + + if not isinstance(size, (int, Sequence)): + raise TypeError( + "Type of target_size is invalid when random_size is True. \ + Must be List, now is {}".format(type(size))) + + if isinstance(size, int): + size = [size, size] + + assert pad_mode in [ + -1, 0, 1, 2 + ], 'currently only supports four modes [-1, 0, 1, 2]' + if pad_mode == -1: + assert offsets, 'if pad_mode is -1, offsets should not be None' + + self.size = size + self.size_divisor = size_divisor + self.pad_mode = pad_mode + self.fill_value = fill_value + self.offsets = offsets + + def apply_segm(self, segms, offsets, im_size, size): + def _expand_poly(poly, x, y): + expanded_poly = np.array(poly) + expanded_poly[0::2] += x + expanded_poly[1::2] += y + return expanded_poly.tolist() + + def _expand_rle(rle, x, y, height, width, h, w): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + expanded_mask = np.full((h, w), 0).astype(mask.dtype) + expanded_mask[y:y + height, x:x + width] = mask + rle = mask_util.encode( + np.array( + expanded_mask, order='F', dtype=np.uint8)) + return rle + + x, y = offsets + height, width = im_size + h, w = size + expanded_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + expanded_segms.append( + [_expand_poly(poly, x, y) for poly in segm]) + else: + # RLE format + import pycocotools.mask as mask_util + expanded_segms.append( + _expand_rle(segm, x, y, height, width, h, w)) + return expanded_segms + + def apply_bbox(self, bbox, offsets): + return bbox + np.array(offsets * 2, dtype=np.float32) + + def apply_keypoint(self, keypoints, offsets): + n = len(keypoints[0]) // 2 + return keypoints + np.array(offsets * n, dtype=np.float32) + + def apply_image(self, image, offsets, im_size, size): + x, y = offsets + im_h, im_w = im_size + h, w = size + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32) + return canvas + + def apply(self, sample, context=None): + im = sample['image'] + im_h, im_w = im.shape[:2] + if self.size: + h, w = self.size + assert ( + im_h <= h and im_w <= w + ), '(h, w) of target size should be greater than (im_h, im_w)' + else: + h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) + w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) + + if h == im_h and w == im_w: + sample['image'] = im.astype(np.float32) + return sample + + if self.pad_mode == -1: + offset_x, offset_y = self.offsets + elif self.pad_mode == 0: + offset_y, offset_x = 0, 0 + 
elif self.pad_mode == 1: + offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 + else: + offset_y, offset_x = h - im_h, w - im_w + + offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] + + sample['image'] = self.apply_image(im, offsets, im_size, size) + + if self.pad_mode == 0: + return sample + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) + + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, + im_size, size) + + if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: + sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], + offsets) + + return sample + + +@register_op +class Poly2Mask(BaseOperator): + """ + gt poly to mask annotations. + Args: + del_poly (bool): Whether to delete poly after generating mask. Default: False. + """ + + def __init__(self, del_poly=False): + super(Poly2Mask, self).__init__() + import pycocotools.mask as maskUtils + self.maskutils = maskUtils + self.del_poly = del_poly + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + rle = self.maskutils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = self.maskutils.decode(rle) + return mask + + def apply(self, sample, context=None): + assert 'gt_poly' in sample + im_h, im_w = sample['im_shape'] + masks = [ + self._poly2mask(gt_poly, im_h, im_w) + for gt_poly in sample['gt_poly'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + if self.del_poly: + del (sample['gt_poly']) + + return sample + + +@register_op +class AugmentHSV(BaseOperator): + """ + Augment the SV channel of image data. + Args: + fraction (float): the fraction for augment. Default: 0.5. + is_bgr (bool): whether the image is BGR mode. Default: True. 
+ hgain (float): H channel gains + sgain (float): S channel gains + vgain (float): V channel gains + """ + + def __init__(self, + fraction=0.50, + is_bgr=True, + hgain=None, + sgain=None, + vgain=None): + super(AugmentHSV, self).__init__() + self.fraction = fraction + self.is_bgr = is_bgr + self.hgain = hgain + self.sgain = sgain + self.vgain = vgain + self.use_hsvgain = False if hgain is None else True + + def apply(self, sample, context=None): + img = sample['image'] + if self.is_bgr: + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + else: + img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) + + if self.use_hsvgain: + hsv_augs = np.random.uniform( + -1, 1, 3) * [self.hgain, self.sgain, self.vgain] + # random selection of h, s, v + hsv_augs *= np.random.randint(0, 2, 3) + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) + + else: + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * self.fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * self.fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) + + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) + + if self.is_bgr: + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) + else: + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img) + + sample['image'] = img.astype(np.float32) + return sample + + +@register_op +class Norm2PixelBbox(BaseOperator): + """ + Transform the bounding box's coornidates which is in [0,1] to pixels. + """ + + def __init__(self): + super(Norm2PixelBbox, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + height, width = sample['image'].shape[:2] + bbox[:, 0::2] = bbox[:, 0::2] * width + bbox[:, 1::2] = bbox[:, 1::2] * height + sample['gt_bbox'] = bbox + return sample + + +@register_op +class BboxCXCYWH2XYXY(BaseOperator): + """ + Convert bbox CXCYWH format to XYXY format. + [center_x, center_y, width, height] -> [x0, y0, x1, y1] + """ + + def __init__(self): + super(BboxCXCYWH2XYXY, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox0 = sample['gt_bbox'] + bbox = bbox0.copy() + + bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2. + bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2. + sample['gt_bbox'] = bbox + return sample + + +@register_op +class RandomResizeCrop(BaseOperator): + """Random resize and crop image and bboxes. + Args: + resizes (list): resize image to one of resizes. if keep_ratio is True and mode is + 'long', resize the image's long side to the maximum of target_size, if keep_ratio is + True and mode is 'short', resize the image's short side to the minimum of target_size. + cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...] + mode (str): resize mode, `long` or `short`. Details see resizes. + prob (float): probability of this op. + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): the interpolation method + thresholds (list): iou thresholds for decide a valid bbox crop. + num_attempts (int): number of tries before giving up. + allow_no_crop (bool): allow return without actually cropping them. + cover_all_box (bool): ensure all bboxes are covered in the final crop. + is_mask_crop(bool): whether crop the segmentation. 
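+ ioumode (str): overlap measure used to validate a candidate crop,
+ "iou" (default) or "iof", where "iof" computes intersection over
+ the gt box area instead of over the union.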
+ """ + + def __init__(self, + resizes, + cropsizes, + prob=0.5, + mode='short', + keep_ratio=True, + interp=cv2.INTER_LINEAR, + num_attempts=3, + cover_all_box=False, + allow_no_crop=False, + thresholds=[0.3, 0.5, 0.7], + is_mask_crop=False, + ioumode="iou"): + super(RandomResizeCrop, self).__init__() + + self.resizes = resizes + self.cropsizes = cropsizes + self.prob = prob + self.mode = mode + self.ioumode = ioumode + + self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp) + self.croper = RandomCrop( + num_attempts=num_attempts, + cover_all_box=cover_all_box, + thresholds=thresholds, + allow_no_crop=allow_no_crop, + is_mask_crop=is_mask_crop) + + def _format_size(self, size): + if isinstance(size, Integral): + size = (size, size) + return size + + def apply(self, sample, context=None): + if random.random() < self.prob: + _resize = self._format_size(random.choice(self.resizes)) + _cropsize = self._format_size(random.choice(self.cropsizes)) + sample = self._resize( + self.resizer, + sample, + size=_resize, + mode=self.mode, + context=context) + sample = self._random_crop( + self.croper, sample, size=_cropsize, context=context) + return sample + + @staticmethod + def _random_crop(croper, sample, size, context=None): + if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: + return sample + + self = croper + h, w = sample['image'].shape[:2] + gt_bbox = sample['gt_bbox'] + cropsize = size + min_crop = min(cropsize) + max_crop = max(cropsize) + + thresholds = list(self.thresholds) + np.random.shuffle(thresholds) + + for thresh in thresholds: + found = False + for _ in range(self.num_attempts): + + crop_h = random.randint(min_crop, min(h, max_crop)) + crop_w = random.randint(min_crop, min(w, max_crop)) + + crop_y = random.randint(0, h - crop_h) + crop_x = random.randint(0, w - crop_w) + + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + if self.ioumode == "iof": + iou = self._gtcropiou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + elif self.ioumode == "iou": + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + if self.is_mask_crop and 'gt_poly' in sample and len(sample[ + 'gt_poly']) > 0: + crop_polys = self.crop_segms( + sample['gt_poly'], + valid_ids, + np.array( + crop_box, dtype=np.int64), + h, + w) + if [] in crop_polys: + delete_id = list() + valid_polys = list() + for id, crop_poly in enumerate(crop_polys): + if crop_poly == []: + delete_id.append(id) + else: + valid_polys.append(crop_poly) + valid_ids = np.delete(valid_ids, delete_id) + if len(valid_polys) == 0: + return sample + sample['gt_poly'] = valid_polys + else: + sample['gt_poly'] = crop_polys + + if 'gt_segm' in sample: + sample['gt_segm'] = self._crop_segm(sample['gt_segm'], + crop_box) + sample['gt_segm'] = np.take( + sample['gt_segm'], valid_ids, axis=0) + + sample['image'] = self._crop_image(sample['image'], crop_box) + sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) + sample['gt_class'] = np.take( + sample['gt_class'], valid_ids, axis=0) + if 'gt_score' in sample: + sample['gt_score'] = np.take( + sample['gt_score'], valid_ids, axis=0) + + if 'is_crowd' in sample: + sample['is_crowd'] = np.take( + sample['is_crowd'], valid_ids, axis=0) + + if 'gt_areas' in sample: + 
sample['gt_areas'] = np.take( + sample['gt_areas'], valid_ids, axis=0) + + if 'gt_joints' in sample: + gt_joints = self._crop_joints(sample['gt_joints'], crop_box) + sample['gt_joints'] = gt_joints[valid_ids] + return sample + + return sample + + @staticmethod + def _resize(resizer, sample, size, mode='short', context=None): + self = resizer + im = sample['image'] + target_size = size + + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + # apply image + im_shape = im.shape + if self.keep_ratio: + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(target_size) + target_size_max = np.max(target_size) + + if mode == 'long': + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + else: + im_scale = max(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = int(im_scale * float(im_shape[0]) + 0.5) + resize_w = int(im_scale * float(im_shape[1]) + 0.5) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], + [im_scale_x, im_scale_y]) + + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + if 'gt_joints' in sample: + sample['gt_joints'] = self.apply_joints(sample['gt_joints'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + return sample + + +@register_op +class RandomSelect(BaseOperator): + """ + Randomly choose a transformation between transforms1 and transforms2, + and the probability of choosing transforms1 is p. 
+ + The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py + + """ + + def __init__(self, transforms1, transforms2, p=0.5): + super(RandomSelect, self).__init__() + self.transforms1 = Compose(transforms1) + self.transforms2 = Compose(transforms2) + self.p = p + + def apply(self, sample, context=None): + if random.random() < self.p: + return self.transforms1(sample) + return self.transforms2(sample) + + +@register_op +class RandomShortSideResize(BaseOperator): + def __init__(self, + short_side_sizes, + max_size=None, + interp=cv2.INTER_LINEAR, + random_interp=False): + """ + Resize the image randomly according to the short side. If max_size is not None, + the long side is scaled according to max_size. The whole process will be keep ratio. + Args: + short_side_sizes (list|tuple): Image target short side size. + max_size (int): The size of the longest side of image after resize. + interp (int): The interpolation method. + random_interp (bool): Whether random select interpolation method. + """ + super(RandomShortSideResize, self).__init__() + + assert isinstance(short_side_sizes, + Sequence), "short_side_sizes must be List or Tuple" + + self.short_side_sizes = short_side_sizes + self.max_size = max_size + self.interp = interp + self.random_interp = random_interp + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + + def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): + h, w = image_shape + max_clip = False + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(max_size * min_original_size / max_original_size) + max_clip = True + + if (w <= h and w == size) or (h <= w and h == size): + return (w, h) + + if w < h: + ow = size + oh = int(round(size * h / w)) if not max_clip else max_size + else: + oh = size + ow = int(round(size * w / h)) if not max_clip else max_size + + return (ow, oh) + + def resize(self, + sample, + target_size, + max_size=None, + interp=cv2.INTER_LINEAR): + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, + max_size) + im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ + 0] / im.shape[1] + + sample['image'] = cv2.resize(im, target_size, interpolation=interp) + sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox( + sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], + [im_scale_x, im_scale_y]) + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + target_size, + interpolation=self.interp) + semantic = 
np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, target_size, interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + if 'gt_joints' in sample: + sample['gt_joints'] = self.apply_joints( + sample['gt_joints'], [im_scale_x, im_scale_y], target_size) + + # apply areas + if 'gt_areas' in sample: + sample['gt_areas'] = self.apply_area(sample['gt_areas'], + [im_scale_x, im_scale_y]) + + return sample + + def apply_bbox(self, bbox, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + bbox[:, 0::2] *= im_scale_x + bbox[:, 1::2] *= im_scale_y + bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) + bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) + return bbox.astype('float32') + + def apply_joints(self, joints, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + joints[..., 0] *= im_scale_x + joints[..., 1] *= im_scale_y + # joints[joints[..., 0] >= resize_w, :] = 0 + # joints[joints[..., 1] >= resize_h, :] = 0 + # joints[joints[..., 0] < 0, :] = 0 + # joints[joints[..., 1] < 0, :] = 0 + joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) + joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) + return joints + + def apply_area(self, area, scale): + im_scale_x, im_scale_y = scale + return area * im_scale_x * im_scale_y + + def apply_segm(self, segms, im_size, scale): + def _resize_poly(poly, im_scale_x, im_scale_y): + resized_poly = np.array(poly).astype('float32') + resized_poly[0::2] *= im_scale_x + resized_poly[1::2] *= im_scale_y + return resized_poly.tolist() + + def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, im_h, im_w) + + mask = mask_util.decode(rle) + mask = cv2.resize( + mask, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + im_h, im_w = im_size + im_scale_x, im_scale_y = scale + resized_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + resized_segms.append([ + _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm + ]) + else: + # RLE format + import pycocotools.mask as mask_util + resized_segms.append( + _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) + + return resized_segms + + def apply(self, sample, context=None): + target_size = random.choice(self.short_side_sizes) + interp = random.choice( + self.interps) if self.random_interp else self.interp + + return self.resize(sample, target_size, self.max_size, interp) + + +@register_op +class RandomShortSideRangeResize(RandomShortSideResize): + def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False): + """ + Resize the image randomly according to the short side. If max_size is not None, + the long side is scaled according to max_size. The whole process will be keep ratio. + Args: + short_side_sizes (list|tuple): Image target short side size. + interp (int): The interpolation method. + random_interp (bool): Whether random select interpolation method. 
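+ scales (list[tuple]): candidate (size, size) pairs; per sample a long
+ edge is drawn uniformly from the range spanned by the pair maxima and
+ a short edge from the range spanned by the pair minima
+ (see `random_sample`).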
+ """ + super(RandomShortSideRangeResize, self).__init__(scales, None, interp, + random_interp) + + assert isinstance(scales, + Sequence), "short_side_sizes must be List or Tuple" + + self.scales = scales + + def random_sample(self, img_scales): + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale + + def apply(self, sample, context=None): + long_edge, short_edge = self.random_sample(self.short_side_sizes) + # print("target size:{}".format((long_edge, short_edge))) + interp = random.choice( + self.interps) if self.random_interp else self.interp + + return self.resize(sample, short_edge, long_edge, interp) + + +@register_op +class RandomSizeCrop(BaseOperator): + """ + Cut the image randomly according to `min_size` and `max_size` + Args: + min_size (int): Min size for edges of cropped image. + max_size (int): Max size for edges of cropped image. If it + is set to larger than length of the input image, + the output will keep the origin length. + keep_empty (bool): Whether to keep the cropped result with no object. + If it is set to False, the no-object result will not + be returned, replaced by the original input. + """ + + def __init__(self, min_size, max_size, keep_empty=True): + super(RandomSizeCrop, self).__init__() + self.min_size = min_size + self.max_size = max_size + self.keep_empty = keep_empty + + from paddle.vision.transforms.functional import crop as paddle_crop + self.paddle_crop = paddle_crop + + @staticmethod + def get_crop_params(img_shape, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img_shape (list|tuple): Image's height and width. + output_size (list|tuple): Expected output size of the crop. + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + h, w = img_shape + th, tw = output_size + + if h + 1 < th or w + 1 < tw: + raise ValueError( + "Required crop size {} is larger then input image size {}". + format((th, tw), (h, w))) + + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th + 1) + j = random.randint(0, w - tw + 1) + return i, j, th, tw + + def crop(self, sample, region): + keep_index = None + # apply bbox and check whether the cropped result is valid + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + croped_bbox = self.apply_bbox(sample['gt_bbox'], region) + bbox = croped_bbox.reshape([-1, 2, 2]) + area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) + keep_index = np.where(area > 0)[0] + + if not self.keep_empty and len(keep_index) == 0: + # When keep_empty is set to False, cropped with no-object will + # not be used and return the origin content. 
+ return sample + + sample['gt_bbox'] = croped_bbox[keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 4], dtype=np.float32) + sample['gt_class'] = sample['gt_class'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'gt_score' in sample: + sample['gt_score'] = sample['gt_score'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'is_crowd' in sample: + sample['is_crowd'] = sample['is_crowd'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'gt_areas' in sample: + sample['gt_areas'] = np.take( + sample['gt_areas'], keep_index, axis=0) + + image_shape = sample['image'].shape[:2] + sample['image'] = self.paddle_crop(sample['image'], *region) + sample['im_shape'] = np.array( + sample['image'].shape[:2], dtype=np.float32) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, + image_shape) + sample['gt_poly'] = np.array(sample['gt_poly']) + if keep_index is not None and len(keep_index) > 0: + sample['gt_poly'] = sample['gt_poly'][keep_index] + sample['gt_poly'] = sample['gt_poly'].tolist() + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + i, j, h, w = region + sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] + if keep_index is not None and len(keep_index) > 0: + sample['gt_segm'] = sample['gt_segm'][keep_index] + + if 'gt_joints' in sample: + gt_joints = self._crop_joints(sample['gt_joints'], region) + sample['gt_joints'] = gt_joints + if keep_index is not None: + sample['gt_joints'] = sample['gt_joints'][keep_index] + + return sample + + def apply_bbox(self, bbox, region): + i, j, h, w = region + region_size = np.asarray([w, h]) + crop_bbox = bbox - np.asarray([j, i, j, i]) + crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size) + crop_bbox = crop_bbox.clip(min=0) + return crop_bbox.reshape([-1, 4]).astype('float32') + + def _crop_joints(self, joints, region): + y1, x1, h, w = region + x2 = x1 + w + y2 = y1 + h + # x1, y1, x2, y2 = crop + joints[..., 0] -= x1 + joints[..., 1] -= y1 + joints[joints[..., 0] > w, :] = 0 + joints[joints[..., 1] > h, :] = 0 + joints[joints[..., 0] < 0, :] = 0 + joints[joints[..., 1] < 0, :] = 0 + return joints + + def apply_segm(self, segms, region, image_shape): + def _crop_poly(segm, crop): + xmin, ymin, xmax, ymax = crop + crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] + crop_p = np.array(crop_coord).reshape(4, 2) + crop_p = Polygon(crop_p) + + crop_segm = list() + for poly in segm: + poly = np.array(poly).reshape(len(poly) // 2, 2) + polygon = Polygon(poly) + if not polygon.is_valid: + exterior = polygon.exterior + multi_lines = exterior.intersection(exterior) + polygons = shapely.ops.polygonize(multi_lines) + polygon = MultiPolygon(polygons) + multi_polygon = list() + if isinstance(polygon, MultiPolygon): + multi_polygon = copy.deepcopy(polygon) + else: + multi_polygon.append(copy.deepcopy(polygon)) + for per_polygon in multi_polygon: + inter = per_polygon.intersection(crop_p) + if not inter: + continue + if isinstance(inter, (MultiPolygon, GeometryCollection)): + for part in inter: + if not isinstance(part, Polygon): + continue + part = np.squeeze( + np.array(part.exterior.coords[:-1]).reshape(1, + -1)) + part[0::2] -= xmin + part[1::2] -= ymin + crop_segm.append(part.tolist()) + elif isinstance(inter, Polygon): + crop_poly = np.squeeze( + np.array(inter.exterior.coords[:-1]).reshape(1, -1)) + 
crop_poly[0::2] -= xmin + crop_poly[1::2] -= ymin + crop_segm.append(crop_poly.tolist()) + else: + continue + return crop_segm + + def _crop_rle(rle, crop, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[crop[1]:crop[3], crop[0]:crop[2]] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + i, j, h, w = region + crop = [j, i, j + w, i + h] + height, width = image_shape + crop_segms = [] + for segm in segms: + if is_poly(segm): + import copy + import shapely.ops + from shapely.geometry import Polygon, MultiPolygon, GeometryCollection + # Polygon format + crop_segms.append(_crop_poly(segm, crop)) + else: + # RLE format + import pycocotools.mask as mask_util + crop_segms.append(_crop_rle(segm, crop, height, width)) + return crop_segms + + def apply(self, sample, context=None): + h = random.randint(self.min_size, + min(sample['image'].shape[0], self.max_size)) + w = random.randint(self.min_size, + min(sample['image'].shape[1], self.max_size)) + + region = self.get_crop_params(sample['image'].shape[:2], [h, w]) + return self.crop(sample, region) + + +@register_op +class CenterRandColor(BaseOperator): + """Random color for CenterNet series models. + Args: + saturation (float): saturation settings. + contrast (float): contrast settings. + brightness (float): brightness settings. + """ + + def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4): + super(CenterRandColor, self).__init__() + self.saturation = saturation + self.contrast = contrast + self.brightness = brightness + + def apply_saturation(self, img, img_gray): + alpha = 1. + np.random.uniform( + low=-self.saturation, high=self.saturation) + self._blend(alpha, img, img_gray[:, :, None]) + return img + + def apply_contrast(self, img, img_gray): + alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast) + img_mean = img_gray.mean() + self._blend(alpha, img, img_mean) + return img + + def apply_brightness(self, img, img_gray): + alpha = 1 + np.random.uniform( + low=-self.brightness, high=self.brightness) + img *= alpha + return img + + def _blend(self, alpha, img, img_mean): + img *= alpha + img_mean *= (1 - alpha) + img += img_mean + + def apply(self, sample, context=None): + functions = [ + self.apply_brightness, + self.apply_contrast, + self.apply_saturation, + ] + + img = sample['image'] + img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + distortions = np.random.permutation(functions) + for func in distortions: + img = func(img, img_gray) + sample['image'] = img + + if 'pre_image' in sample: + pre_img = sample['pre_image'] + pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY) + pre_distortions = np.random.permutation(functions) + for func in pre_distortions: + pre_img = func(pre_img, pre_img_gray) + sample['pre_image'] = pre_img + + return sample + + +@register_op +class Mosaic(BaseOperator): + """ Mosaic operator for image and gt_bboxes + The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py + + 1. get mosaic coords + 2. clip bbox and get mosaic_labels + 3. random_affine augment + 4. 
Mixup augment as copypaste (optinal), not used in tiny/nano + + Args: + prob (float): probability of using Mosaic, 1.0 as default + input_dim (list[int]): input shape + degrees (list[2]): the rotate range to apply, transform range is [min, max] + translate (list[2]): the translate range to apply, transform range is [min, max] + scale (list[2]): the scale range to apply, transform range is [min, max] + shear (list[2]): the shear range to apply, transform range is [min, max] + enable_mixup (bool): whether to enable Mixup or not + mixup_prob (float): probability of using Mixup, 1.0 as default + mixup_scale (list[int]): scale range of Mixup + remove_outside_box (bool): whether remove outside boxes, False as + default in COCO dataset, True in MOT dataset + """ + + def __init__(self, + prob=1.0, + input_dim=[640, 640], + degrees=[-10, 10], + translate=[-0.1, 0.1], + scale=[0.1, 2], + shear=[-2, 2], + enable_mixup=True, + mixup_prob=1.0, + mixup_scale=[0.5, 1.5], + remove_outside_box=False): + super(Mosaic, self).__init__() + self.prob = prob + if isinstance(input_dim, Integral): + input_dim = [input_dim, input_dim] + self.input_dim = input_dim + self.degrees = degrees + self.translate = translate + self.scale = scale + self.shear = shear + self.enable_mixup = enable_mixup + self.mixup_prob = mixup_prob + self.mixup_scale = mixup_scale + self.remove_outside_box = remove_outside_box + + def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w): + # (x1, y1, x2, y2) means coords in large image, + # small_coords means coords in small image in mosaic aug. + if mosaic_idx == 0: + # top left + x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc + small_coords = w - (x2 - x1), h - (y2 - y1), w, h + elif mosaic_idx == 1: + # top right + x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc + small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h + elif mosaic_idx == 2: + # bottom left + x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) + small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) + elif mosaic_idx == 3: + # bottom right + x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, + yc + h) + small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + + return (x1, y1, x2, y2), small_coords + + def random_affine_augment(self, + img, + labels=[], + input_dim=[640, 640], + degrees=[-10, 10], + scales=[0.1, 2], + shears=[-2, 2], + translates=[-0.1, 0.1]): + # random rotation and scale + degree = random.uniform(degrees[0], degrees[1]) + scale = random.uniform(scales[0], scales[1]) + assert scale > 0, "Argument scale should be positive." 
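+ # R is the 2x3 rotation-and-scale matrix about the origin (its
+ # translation column is zero because center=(0, 0)). The shear terms
+ # below mix R's two rows into M, and the translation (a fraction of
+ # the input size) is written into M's last column.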
+ R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) + M = np.ones([2, 3]) + + # random shear + shear = random.uniform(shears[0], shears[1]) + shear_x = math.tan(shear * math.pi / 180) + shear_y = math.tan(shear * math.pi / 180) + M[0] = R[0] + shear_y * R[1] + M[1] = R[1] + shear_x * R[0] + + # random translation + translate = random.uniform(translates[0], translates[1]) + translation_x = translate * input_dim[0] + translation_y = translate * input_dim[1] + M[0, 2] = translation_x + M[1, 2] = translation_y + + # warpAffine + img = cv2.warpAffine( + img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) + + num_gts = len(labels) + if num_gts > 0: + # warp corner points + corner_points = np.ones((4 * num_gts, 3)) + corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 + # apply affine transform + corner_points = corner_points @M.T + corner_points = corner_points.reshape(num_gts, 8) + + # create new boxes + corner_xs = corner_points[:, 0::2] + corner_ys = corner_points[:, 1::2] + new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), + corner_xs.max(1), corner_ys.max(1))) + new_bboxes = new_bboxes.reshape(4, num_gts).T + + # clip boxes + new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) + new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) + labels[:, :4] = new_bboxes + + return img, labels + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len( + sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." + if np.random.uniform(0., 1.) > self.prob: + return sample[0] + + mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] + input_h, input_w = self.input_dim + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) + + # 1. get mosaic coords + for mosaic_idx, sp in enumerate(sample[:4]): + img = sp['image'] + gt_bbox = sp['gt_bbox'] + h0, w0 = img.shape[:2] + scale = min(1. * input_h / h0, 1. * input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), + interpolation=cv2.INTER_LINEAR) + (h, w, c) = img.shape[:3] + + # suffix l means large image, while s means small image in mosaic aug. + (l_x1, l_y1, l_x2, l_y2), ( + s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( + mosaic_idx, xc, yc, w, h, input_h, input_w) + + mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] + padw, padh = l_x1 - s_x1, l_y1 - s_y1 + + # Normalized xywh to pixel xyxy format + _gt_bbox = gt_bbox.copy() + if len(gt_bbox) > 0: + _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw + _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh + _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw + _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh + + mosaic_gt_bbox.append(_gt_bbox) + mosaic_gt_class.append(sp['gt_class']) + if 'is_crowd' in sp: + mosaic_is_crowd.append(sp['is_crowd']) + if 'difficult' in sp: + mosaic_difficult.append(sp['difficult']) + + # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) + if len(mosaic_gt_bbox): + mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) + mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) + if mosaic_is_crowd: + mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, + mosaic_gt_class.astype(mosaic_gt_bbox.dtype), + mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) + ], 1) + elif mosaic_difficult: + mosaic_difficult = np.concatenate(mosaic_difficult, 0) + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, + mosaic_gt_class.astype(mosaic_gt_bbox.dtype), + mosaic_difficult.astype(mosaic_gt_bbox.dtype) + ], 1) + else: + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) + ], 1) + if self.remove_outside_box: + # for MOT dataset + flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w + flag2 = mosaic_gt_bbox[:, 2] > 0 + flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h + flag4 = mosaic_gt_bbox[:, 3] > 0 + flag_all = flag1 * flag2 * flag3 * flag4 + mosaic_labels = mosaic_labels[flag_all] + else: + mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, + 2 * input_w) + mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, + 2 * input_h) + mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, + 2 * input_w) + mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, + 2 * input_h) + else: + mosaic_labels = np.zeros((1, 6)) + + # 3. random_affine augment + mosaic_img, mosaic_labels = self.random_affine_augment( + mosaic_img, + mosaic_labels, + input_dim=self.input_dim, + degrees=self.degrees, + translates=self.translate, + scales=self.scale, + shears=self.shear) + + # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 + # optinal, not used(enable_mixup=False) in tiny/nano + if (self.enable_mixup and not len(mosaic_labels) == 0 and + random.random() < self.mixup_prob): + sample_mixup = sample[4] + mixup_img = sample_mixup['image'] + if 'is_crowd' in sample_mixup: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype), + sample_mixup['is_crowd'].astype(mosaic_labels.dtype) + ], 1) + elif 'difficult' in sample_mixup: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype), + sample_mixup['difficult'].astype(mosaic_labels.dtype) + ], 1) + else: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype) + ], 1) + mosaic_img, mosaic_labels = self.mixup_augment( + mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) + + sample0 = sample[0] + sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32 + sample0['h'] = float(mosaic_img.shape[0]) + sample0['w'] = float(mosaic_img.shape[1]) + sample0['im_shape'][0] = sample0['h'] + sample0['im_shape'][1] = sample0['w'] + sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) + sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) + if 'is_crowd' in sample[0]: + sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) + if 'difficult' in sample[0]: + sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) + return sample0 + + def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, + img): + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + if len(img.shape) == 3: + cp_img = np.ones( + (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 + else: + cp_img = np.ones(input_dim, dtype=np.uint8) * 114 + + 
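+        # NOTE: the steps below paste the extra mixup sample onto the mosaic
+        # result in YOLOX style:
+        #   1) keep-ratio resize of the source image onto the gray (114) canvas;
+        #   2) a second resize by jit_factor and an optional horizontal flip;
+        #   3) pad to at least the target size, then take a random crop of
+        #      exactly (target_h, target_w);
+        #   4) map the source boxes with the same scale / flip / offset
+        #      (clipped, or filtered out when remove_outside_box is True);
+        #   5) blend the two images with equal weights:
+        #      0.5 * origin_img + 0.5 * padded_cropped_img.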
cp_scale_ratio = min(input_dim[0] / img.shape[0], + input_dim[1] / img.shape[1]) + resized_img = cv2.resize( + img, (int(img.shape[1] * cp_scale_ratio), + int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR) + + cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[ + 1] * cp_scale_ratio)] = resized_img + + cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), + int(cp_img.shape[0] * jit_factor))) + cp_scale_ratio *= jit_factor + + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), + dtype=np.uint8) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: + x_offset + target_w] + + # adjust boxes + cp_bboxes_origin_np = cp_labels[:, :4].copy() + cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * + cp_scale_ratio, 0, origin_w) + cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * + cp_scale_ratio, 0, origin_h) + + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + if self.remove_outside_box: + # for MOT dataset + cp_bboxes_transformed_np[:, 0::2] -= x_offset + cp_bboxes_transformed_np[:, 1::2] -= y_offset + else: + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + if cp_labels.shape[-1] == 6: + crd_labels = cp_labels[:, 5:6].copy() + labels = np.hstack((box_labels, cls_labels, crd_labels)) + else: + labels = np.hstack((box_labels, cls_labels)) + if self.remove_outside_box: + labels = labels[labels[:, 0] < target_w] + labels = labels[labels[:, 2] > 0] + labels = labels[labels[:, 1] < target_h] + labels = labels[labels[:, 3] > 0] + + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( + np.float32) + + return origin_img.astype(np.uint8), origin_labels + + +@register_op +class PadResize(BaseOperator): + """ PadResize for image and gt_bbbox + + Args: + target_size (list[int]): input shape + fill_value (float): pixel value of padded image + """ + + def __init__(self, target_size, fill_value=114): + super(PadResize, self).__init__() + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + self.fill_value = fill_value + + def _resize(self, img, bboxes, labels): + ratio = min(self.target_size[0] / img.shape[0], + self.target_size[1] / img.shape[1]) + w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio) + resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) + + if len(bboxes) > 0: + bboxes *= ratio + mask = np.minimum(bboxes[:, 2] - bboxes[:, 0], + bboxes[:, 3] - bboxes[:, 1]) > 1 + bboxes = bboxes[mask] + labels = labels[mask] + return resized_img, bboxes, labels + + def _pad(self, img): + h, w, _ = img.shape + if h == self.target_size[0] and w == 
self.target_size[1]: + return img + padded_img = np.full( + (self.target_size[0], self.target_size[1], 3), + self.fill_value, + dtype=np.uint8) + padded_img[:h, :w] = img + return padded_img + + def apply(self, sample, context=None): + image = sample['image'] + bboxes = sample['gt_bbox'] + labels = sample['gt_class'] + image, bboxes, labels = self._resize(image, bboxes, labels) + sample['image'] = self._pad(image).astype(np.float32) + sample['gt_bbox'] = bboxes + sample['gt_class'] = labels + if 'is_crowd' in sample: + sample.pop('is_crowd') + if 'difficult' in sample: + sample.pop('difficult') + return sample + + +@register_op +class RandomShift(BaseOperator): + """ + Randomly shift image + + Args: + prob (float): probability to do random shift. + max_shift (int): max shift pixels + filter_thr (int): filter gt bboxes if one side is smaller than this + """ + + def __init__(self, prob=0.5, max_shift=32, filter_thr=1): + super(RandomShift, self).__init__() + self.prob = prob + self.max_shift = max_shift + self.filter_thr = filter_thr + + def calc_shift_coor(self, im_h, im_w, shift_h, shift_w): + return [ + max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w), + min(im_h, im_h + shift_h) + ] + + def apply(self, sample, context=None): + if random.random() > self.prob: + return sample + + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + im_h, im_w = im.shape[:2] + shift_h = random.randint(-self.max_shift, self.max_shift) + shift_w = random.randint(-self.max_shift, self.max_shift) + + gt_bbox[:, 0::2] += shift_w + gt_bbox[:, 1::2] += shift_h + gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w) + gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h) + gt_bbox_h = gt_bbox[:, 2] - gt_bbox[:, 0] + gt_bbox_w = gt_bbox[:, 3] - gt_bbox[:, 1] + keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr) + if not keep.any(): + return sample + + gt_bbox = gt_bbox[keep] + gt_class = gt_class[keep] + + # shift image + coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w) + # shift frame to the opposite direction + coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w) + canvas = np.zeros_like(im) + canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \ + = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]] + + sample['image'] = canvas + sample['gt_bbox'] = gt_bbox + sample['gt_class'] = gt_class + return sample + + +@register_op +class StrongAugImage(BaseOperator): + def __init__(self, transforms): + super(StrongAugImage, self).__init__() + self.transforms = Compose(transforms) + + def apply(self, sample, context=None): + im = sample + im['image'] = sample['image'].astype('uint8') + results = self.transforms(im) + sample['image'] = results['image'].astype('uint8') + return sample + + +@register_op +class RandomColorJitter(BaseOperator): + def __init__(self, + prob=0.8, + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.1): + super(RandomColorJitter, self).__init__() + self.prob = prob + self.brightness = brightness + self.contrast = contrast + self.saturation = saturation + self.hue = hue + + def apply(self, sample, context=None): + if np.random.uniform(0, 1) < self.prob: + from paddle.vision.transforms import ColorJitter + transform = ColorJitter(self.brightness, self.contrast, + self.saturation, self.hue) + sample['image'] = transform(sample['image'].astype(np.uint8)) + sample['image'] = sample['image'].astype(np.float32) + return sample + + +@register_op +class RandomGrayscale(BaseOperator): + def __init__(self, 
prob=0.2): + super(RandomGrayscale, self).__init__() + self.prob = prob + + def apply(self, sample, context=None): + if np.random.uniform(0, 1) < self.prob: + from paddle.vision.transforms import Grayscale + transform = Grayscale(num_output_channels=3) + sample['image'] = transform(sample['image']) + return sample + + +@register_op +class RandomGaussianBlur(BaseOperator): + def __init__(self, prob=0.5, sigma=[0.1, 2.0]): + super(RandomGaussianBlur, self).__init__() + self.prob = prob + self.sigma = sigma + + def apply(self, sample, context=None): + if np.random.uniform(0, 1) < self.prob: + sigma = np.random.uniform(self.sigma[0], self.sigma[1]) + im = cv2.GaussianBlur(sample['image'], (23, 23), sigma) + sample['image'] = im + return sample + + +@register_op +class RandomErasing(BaseOperator): + def __init__(self, + prob=0.5, + scale=(0.02, 0.33), + ratio=(0.3, 3.3), + value=0, + inplace=False): + super(RandomErasing, self).__init__() + assert isinstance(scale, + (tuple, list)), "scale should be a tuple or list" + assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] + ), "scale should be of kind (min, max) and in range [0, 1]" + assert isinstance(ratio, + (tuple, list)), "ratio should be a tuple or list" + assert (ratio[0] >= 0 and + ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" + assert isinstance( + value, (Number, str, tuple, + list)), "value should be a number, tuple, list or str" + if isinstance(value, str) and value != "random": + raise ValueError("value must be 'random' when type is str") + self.prob = prob + self.scale = scale + self.ratio = ratio + self.value = value + self.inplace = inplace + + def _erase(self, img, i, j, h, w, v, inplace=False): + if not inplace: + img = img.copy() + img[i:i + h, j:j + w, ...] = v + return img + + def _get_param(self, img, scale, ratio, value): + shape = np.asarray(img).astype(np.uint8).shape + h, w, c = shape[-3], shape[-2], shape[-1] + img_area = h * w + log_ratio = np.log(ratio) + for _ in range(1): + erase_area = np.random.uniform(*scale) * img_area + aspect_ratio = np.exp(np.random.uniform(*log_ratio)) + erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) + erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) + if erase_h >= h or erase_w >= w: + continue + + if value is None: + v = np.random.normal(size=[erase_h, erase_w, c]) * 255 + else: + v = np.array(value)[None, None, :] + top = np.random.randint(0, h - erase_h + 1) + left = np.random.randint(0, w - erase_w + 1) + return top, left, erase_h, erase_w, v + return 0, 0, h, w, img + + def apply(self, sample, context=None): + if random.random() < self.prob: + if isinstance(self.value, Number): + value = [self.value] + elif isinstance(self.value, str): + value = None + else: + value = self.value + if value is not None and not (len(value) == 1 or len(value) == 3): + raise ValueError( + "Value should be a single number or a sequence with length equals to image's channel." 
+ ) + im = sample['image'] + top, left, erase_h, erase_w, v = self._get_param(im, self.scale, + self.ratio, value) + im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace) + sample['image'] = im + return sample + + +@register_op +class RandomErasingCrop(BaseOperator): + def __init__(self): + super(RandomErasingCrop, self).__init__() + self.transform1 = RandomErasing( + prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random") + self.transform2 = RandomErasing( + prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random") + self.transform3 = RandomErasing( + prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random") + + def apply(self, sample, context=None): + sample = self.transform1(sample) + sample = self.transform2(sample) + sample = self.transform3(sample) + return sample + + +@register_op +class DecodeNormResize(BaseOperator): + def __init__(self, target_size, to_rgb=False, mosaic=True): + super(DecodeNormResize, self).__init__() + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". + format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + self.to_rgb = to_rgb + self.mosaic = mosaic + + def bbox_norm(self, sample): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + height, width = sample['image'].shape[:2] + y = bbox.copy() + y[:, 0] = ((bbox[:, 0] + bbox[:, 2]) / 2) / width # x center + y[:, 1] = ((bbox[:, 1] + bbox[:, 3]) / 2) / height # y center + y[:, 2] = (bbox[:, 2] - bbox[:, 0]) / width # width + y[:, 3] = (bbox[:, 3] - bbox[:, 1]) / height # height + sample['gt_bbox'] = y + return sample + + def load_resized_img(self, sample, target_size): + if 'image' not in sample: + img_file = sample['im_file'] + sample['image'] = cv2.imread(img_file) # BGR + sample.pop('im_file') + im = sample['image'] + sample = self.bbox_norm(sample) + + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + + if 'h' not in sample: + sample['h'] = im.shape[0] + elif sample['h'] != im.shape[0]: + logger.warning( + "The actual image height: {} is not equal to the " + "height: {} in annotation, and update sample['h'] by actual " + "image height.".format(im.shape[0], sample['h'])) + sample['h'] = im.shape[0] + if 'w' not in sample: + sample['w'] = im.shape[1] + elif sample['w'] != im.shape[1]: + logger.warning( + "The actual image width: {} is not equal to the " + "width: {} in annotation, and update sample['w'] by actual " + "image width.".format(im.shape[1], sample['w'])) + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array( + im.shape[:2], dtype=np.float32) # original shape + + # get resized img + r = min(target_size[0] / im.shape[0], target_size[1] / im.shape[1]) + if r != 1: # if sizes are not equal + resized_img = cv2.resize( + im, (int(im.shape[1] * r), int(im.shape[0] * r)), + interpolation=cv2.INTER_LINEAR if (self.mosaic or r > 1) else + cv2.INTER_AREA) ########## .astype(np.uint8) + else: + resized_img = im + + h, w = resized_img.shape[:2] + if self.to_rgb: + resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) + + sample['image'] = resized_img + sample['scale_factor'] = np.array( + [h / im.shape[0], w / im.shape[1]], dtype=np.float32) + return sample + + def apply(self, sample, context=None): + sample = self.load_resized_img(sample, self.target_size) + return sample + + +@register_op +class DecodeNormResizeCache(BaseOperator): + def 
__init__(self, cache_root, target_size, to_rgb=False, mosaic=True): + super(DecodeNormResizeCache, self).__init__() + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". + format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + self.to_rgb = to_rgb + self.mosaic = mosaic + + self.use_cache = False if cache_root is None else True + self.cache_root = cache_root + if cache_root is not None: + _make_dirs(cache_root) + + def bbox_norm(self, sample): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + height, width = sample['image'].shape[:2] + y = bbox.copy() + y[:, 0] = ((bbox[:, 0] + bbox[:, 2]) / 2) / width # x center + y[:, 1] = ((bbox[:, 1] + bbox[:, 3]) / 2) / height # y center + y[:, 2] = (bbox[:, 2] - bbox[:, 0]) / width # width + y[:, 3] = (bbox[:, 3] - bbox[:, 1]) / height # height + sample['gt_bbox'] = y + return sample + + def load_resized_img(self, sample, target_size): + if self.use_cache and os.path.exists( + self.cache_path(self.cache_root, sample['im_file'])): + path = self.cache_path(self.cache_root, sample['im_file']) + im = self.load(path) + # im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + sample['image'] = im + else: + if 'image' not in sample: + img_file = sample['im_file'] + sample['image'] = cv2.imread(img_file) # BGR + sample.pop('im_file') + im = sample['image'] + + sample = self.bbox_norm(sample) + + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + + if 'h' not in sample: + sample['h'] = im.shape[0] + elif sample['h'] != im.shape[0]: + logger.warning( + "The actual image height: {} is not equal to the " + "height: {} in annotation, and update sample['h'] by actual " + "image height.".format(im.shape[0], sample['h'])) + sample['h'] = im.shape[0] + if 'w' not in sample: + sample['w'] = im.shape[1] + elif sample['w'] != im.shape[1]: + logger.warning( + "The actual image width: {} is not equal to the " + "width: {} in annotation, and update sample['w'] by actual " + "image width.".format(im.shape[1], sample['w'])) + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array( + im.shape[:2], dtype=np.float32) # original shape + + # get resized img + r = min(target_size[0] / im.shape[0], target_size[1] / im.shape[1]) + if r != 1: # if sizes are not equal + resized_img = cv2.resize( + im, (int(im.shape[1] * r), int(im.shape[0] * r)), + interpolation=cv2.INTER_LINEAR if (self.mosaic or r > 1) else + cv2.INTER_AREA) ########## .astype(np.uint8) + else: + resized_img = im + + h, w = resized_img.shape[:2] + if self.to_rgb: + resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) + + sample['image'] = resized_img + sample['scale_factor'] = np.array( + [h / im.shape[0], w / im.shape[1]], dtype=np.float32) + return sample + + def apply(self, sample, context=None): + sample = self.load_resized_img(sample, self.target_size) + return sample + + @staticmethod + def cache_path(dir_oot, im_file): + return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') + + @staticmethod + def load(path): + with open(path, 'rb') as f: + im = pickle.load(f) + return im + + @staticmethod + def dump(obj, path): + MUTEX.acquire() + try: + with open(path, 'wb') as f: + pickle.dump(obj, f) + + except Exception as e: + logger.warning('dump {} occurs exception {}'.format(path, str(e))) + + finally: + MUTEX.release() + + +@register_op +class 
YOLOv5KeepRatioResize(BaseOperator): + # only used for yolov5 rect eval to get higher mAP + # only apply to image + + def __init__(self, + target_size, + keep_ratio=True, + batch_shapes=True, + size_divisor=32, + extra_pad_ratio=0.5): + super(YOLOv5KeepRatioResize, self).__init__() + assert keep_ratio == True + self.keep_ratio = keep_ratio + self.batch_shapes = batch_shapes + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". + format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + + self.size_divisor = size_divisor + self.extra_pad_ratio = extra_pad_ratio + + def _get_rescale_ratio(self, old_size, scale): + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, (tuple, list)): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError('Scale must be a number or tuple of int, ' + f'but got {type(scale)}') + return scale_factor + + def apply_image(self, image): + original_h, original_w = image.shape[:2] + ratio = self._get_rescale_ratio( + (original_h, original_w), self.target_size) + if ratio != 1: + # resize image according to the shape + # NOTE: We are currently testing on COCO that modifying + # this code will not affect the results. + # If you find that it has an effect on your results, + # please feel free to contact us. + image = cv2.resize( + image, (int(original_w * ratio), int(original_h * ratio)), + interpolation=cv2.INTER_AREA if ratio < 1 else cv2.INTER_LINEAR) + + resized_h, resized_w = image.shape[:2] + scale_ratio_h = resized_h / original_h + scale_ratio_w = resized_w / original_w + return image, (resized_h, resized_w), (scale_ratio_h, scale_ratio_w) + + def apply(self, sample, context=None): + """ Resize the image numpy. 
+ """ + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + im, (resize_h, resize_w), ( + scale_ratio_h, scale_ratio_w) = self.apply_image(sample['image']) + # (427, 640) (480, 640) + sample['image'] = im.astype(np.float32) + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + sample['scale_factor'] = np.asarray( + [scale_ratio_h, scale_ratio_w], dtype=np.float32) + + shapes = [[1, 1]] + aspect_ratio = resize_h / resize_w + shapes = [aspect_ratio, 1 + ] if aspect_ratio < 1 else [1, 1 / aspect_ratio] + batch_shapes = np.ceil( + np.array(shapes) * 640 / self.size_divisor + + self.extra_pad_ratio).astype(np.int64) * self.size_divisor + sample['batch_shape'] = batch_shapes + return sample + + +@register_op +class LetterResize(BaseOperator): + # only used for yolov5 rect eval to get higher mAP + # only apply to image + + def __init__(self, + scale=[640, 640], + pad_val=144, + use_mini_pad=False, + stretch_only=False, + allow_scale_up=False, + half_pad_param=False): + super(LetterResize, self).__init__() + self.scale = scale + self.pad_val = pad_val + if isinstance(pad_val, (int, float)): + pad_val = dict(img=pad_val, seg=255) + assert isinstance( + pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}' + + self.use_mini_pad = use_mini_pad + self.stretch_only = stretch_only + self.allow_scale_up = allow_scale_up + self.half_pad_param = half_pad_param + + def _resize_img(self, results): + image = results['image'] + # Use batch_shape if a batch_shape policy is configured + if 'batch_shape' in results: + scale = tuple(results['batch_shape']) # hw + else: + scale = self.scale[::-1] # wh -> hw + + image_shape = image.shape[:2] # height, width + + # Scale ratio (new / old) + ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1]) + # (448, 672) / (427, 640) = 1.0491803278688525 + # (512, 672) / (480, 640) = 1.05 + + # only scale down, do not scale up (for better test mAP) + if not self.allow_scale_up: + ratio = min(ratio, 1.0) + + ratio = [ratio, ratio] # float -> (float, float) for (height, width) + + # compute the best size of the image + no_pad_shape = (int(round(image_shape[0] * ratio[0])), + int(round(image_shape[1] * ratio[1]))) + # [427, 640] [480, 640] + # padding height & width + padding_h, padding_w = [ + scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1] + ] # [21, 32] 32, 32 + if self.use_mini_pad: + # minimum rectangle padding + padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32) + elif self.stretch_only: + # stretch to the specified size directly + padding_h, padding_w = 0.0, 0.0 + no_pad_shape = (scale[0], scale[1]) + ratio = [scale[0] / image_shape[0], + scale[1] / image_shape[1]] # height, width ratios + + if image_shape != no_pad_shape: + # compare with no resize and padding size + image = cv2.resize( + image, (no_pad_shape[1], no_pad_shape[0]), + interpolation='bilinear') + + scale_factor = (no_pad_shape[1] / image_shape[1], + no_pad_shape[0] / image_shape[0]) + + if 'scale_factor' in results: + results['scale_factor_origin'] = results['scale_factor'] + results['scale_factor'] = scale_factor + + # padding + top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int( + round(padding_w // 2 - 0.1)) + bottom_padding = padding_h - top_padding + right_padding = padding_w - left_padding + + padding_list = [ + top_padding, bottom_padding, left_padding, 
right_padding + ] # [10, 11, 16, 16] [16, 16, 16, 16] + if top_padding != 0 or bottom_padding != 0 or \ + left_padding != 0 or right_padding != 0: + if isinstance(self.pad_val, int) and image.ndim == 3: + self.pad_val = tuple( + self.pad_val for _ in range(image.shape[2])) + # image = cv2.impad( + # img=image, + # padding=(padding_list[2], padding_list[0], padding_list[3], + # padding_list[1]), + # pad_val=pad_val, + # padding_mode='constant') + top, bottom, left, right = padding_list + image = cv2.copyMakeBorder( + image, + top, + bottom, + left, + right, + cv2.BORDER_CONSTANT, + value=self.pad_val) + # (448, 672, 3) + + results['image'] = image.astype(np.float32) + results['im0_shape'] = np.asarray(image_shape, dtype=np.float32) + results['im_shape'] = np.asarray([image.shape[:2]], dtype=np.float32) + + if 'pad_param' in results: + results['pad_param_origin'] = results['pad_param'] * \ + np.repeat(ratio, 2) + + if self.half_pad_param: + results['pad_param'] = np.array( + [padding_h / 2, padding_h / 2, padding_w / 2, padding_w / 2], + dtype=np.float32) + else: + # We found in object detection, using padding list with + # int type can get higher mAP. + results['pad_param'] = np.array(padding_list, dtype=np.float32) + + return results + + def apply(self, sample, context=None): + sample = self._resize_img(sample) + # if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + # sample = self._resize_bboxes(sample) + if 'scale_factor_origin' in sample: + scale_factor_origin = sample.pop('scale_factor_origin') + scale_ratio_h, scale_ratio_w = ( + sample['scale_factor'][0] * scale_factor_origin[0], + sample['scale_factor'][1] * scale_factor_origin[1]) + sample['scale_factor'] = np.asarray( + [scale_ratio_h, scale_ratio_w], dtype=np.float32) + + if 'pad_param_origin' in sample: + pad_param_origin = sample.pop('pad_param_origin') + sample['pad_param'] += pad_param_origin + + return sample diff --git a/ppdet/data/utils.py b/ppdet/data/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..02573e61484bc5ef07353dbef124c8afa54ccc64 --- /dev/null +++ b/ppdet/data/utils.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numbers +import numpy as np + +try: + from collections.abc import Sequence, Mapping +except: + from collections import Sequence, Mapping + + +def default_collate_fn(batch): + """ + Default batch collating function for :code:`paddle.io.DataLoader`, + get input data as a list of sample datas, each element in list + if the data of a sample, and sample data should composed of list, + dictionary, string, number, numpy array, this + function will parse input data recursively and stack number, + numpy array and paddle.Tensor datas as batch datas. e.g. 
for + following input data: + [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, + {'image': np.array(shape=[3, 224, 224]), 'label': 3}, + {'image': np.array(shape=[3, 224, 224]), 'label': 4}, + {'image': np.array(shape=[3, 224, 224]), 'label': 5},] + + + This default collate function zipped each number and numpy array + field together and stack each field as the batch field as follows: + {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} + Args: + batch(list of sample data): batch should be a list of sample data. + + Returns: + Batched data: batched each number, numpy array and paddle.Tensor + in input data. + """ + sample = batch[0] + if isinstance(sample, np.ndarray): + batch = np.stack(batch, axis=0) + return batch + elif isinstance(sample, numbers.Number): + batch = np.array(batch) + return batch + elif isinstance(sample, (str, bytes)): + return batch + elif isinstance(sample, Mapping): + return { + key: default_collate_fn([d[key] for d in batch]) + for key in sample + } + elif isinstance(sample, Sequence): + sample_fields_num = len(sample) + if not all(len(sample) == sample_fields_num for sample in iter(batch)): + raise RuntimeError( + "fileds number not same among samples in a batch") + return [default_collate_fn(fields) for fields in zip(*batch)] + + raise TypeError("batch data con only contains: tensor, numpy.ndarray, " + "dict, list, number, but got {}".format(type(sample))) diff --git a/ppdet/engine/__init__.py b/ppdet/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..729d8c3ee85d3047f0bd023dfd989acb1488c719 --- /dev/null +++ b/ppdet/engine/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import trainer +from .trainer import * + +from . import callbacks +from .callbacks import * + +from . import env +from .env import * + +__all__ = trainer.__all__ \ + + callbacks.__all__ \ + + env.__all__ + +from . import trainer_ssod +from .trainer_ssod import * +__all__ = __all__ + trainer_ssod.__all__ diff --git a/ppdet/engine/callbacks.py b/ppdet/engine/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..82f6e5fa9607e244743af0e0836739468b9703d0 --- /dev/null +++ b/ppdet/engine/callbacks.py @@ -0,0 +1,490 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
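+
+# The callbacks in this module follow a simple hook protocol: the Trainer
+# keeps a list of Callback objects (wrapped in ComposeCallback below) and
+# invokes on_train_begin/end, on_epoch_begin/end and on_step_begin/end with a
+# shared `status` dict whose keys (e.g. 'mode', 'epoch_id', 'step_id') are
+# filled in by the Trainer. A custom callback only needs to subclass Callback
+# and override the hooks it cares about; an illustrative sketch (not used
+# anywhere in this patch) could look like:
+#
+#   import time
+#
+#   class EpochTimer(Callback):
+#       def on_epoch_begin(self, status):
+#           self._tic = time.time()
+#
+#       def on_epoch_end(self, status):
+#           if status['mode'] == 'train':
+#               logger.info('epoch {} finished in {:.1f}s'.format(
+#                   status['epoch_id'], time.time() - self._tic))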
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import datetime +import six +import copy +import json + +import paddle +import paddle.distributed as dist + +from ppdet.utils.checkpoint import save_model +from ppdet.metrics import get_infer_results + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +__all__ = [ + 'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer', + 'VisualDLWriter' +] + + +class Callback(object): + def __init__(self, model): + self.model = model + + def on_step_begin(self, status): + pass + + def on_step_end(self, status): + pass + + def on_epoch_begin(self, status): + pass + + def on_epoch_end(self, status): + pass + + def on_train_begin(self, status): + pass + + def on_train_end(self, status): + pass + + +class ComposeCallback(object): + def __init__(self, callbacks): + callbacks = [c for c in list(callbacks) if c is not None] + for c in callbacks: + assert isinstance( + c, Callback), "callback should be subclass of Callback" + self._callbacks = callbacks + + def on_step_begin(self, status): + for c in self._callbacks: + c.on_step_begin(status) + + def on_step_end(self, status): + for c in self._callbacks: + c.on_step_end(status) + + def on_epoch_begin(self, status): + for c in self._callbacks: + c.on_epoch_begin(status) + + def on_epoch_end(self, status): + for c in self._callbacks: + c.on_epoch_end(status) + + def on_train_begin(self, status): + for c in self._callbacks: + c.on_train_begin(status) + + def on_train_end(self, status): + for c in self._callbacks: + c.on_train_end(status) + + +class LogPrinter(Callback): + def __init__(self, model): + super(LogPrinter, self).__init__(model) + + def on_step_end(self, status): + if dist.get_world_size() < 2 or dist.get_rank() == 0: + mode = status['mode'] + if mode == 'train': + epoch_id = status['epoch_id'] + step_id = status['step_id'] + steps_per_epoch = status['steps_per_epoch'] + training_staus = status['training_staus'] + batch_time = status['batch_time'] + data_time = status['data_time'] + + epoches = self.model.cfg.epoch + batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( + ))]['batch_size'] + + logs = training_staus.log() + space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' + if step_id % self.model.cfg.log_iter == 0: + eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id + eta_sec = eta_steps * batch_time.global_avg + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + ips = float(batch_size) / batch_time.avg + max_mem_reserved_str = "" + max_mem_allocated_str = "" + if paddle.device.is_compiled_with_cuda(): + max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB d" + max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB" + fmt = ' '.join([ + 'Epoch: [{}]', + '[{' + space_fmt + '}/{}]', + 'eta: {eta}', + 'lr: {lr:.6f}', + '{meters}', + 'batch_cost: {btime}', + 'data_cost: {dtime}', + 'ips: {ips:.4f} images/s', + '{max_mem_reserved_str}', + '{max_mem_allocated_str}' + ]) + fmt = fmt.format( + epoch_id, + step_id, + steps_per_epoch, + eta=eta_str, + lr=status['learning_rate'], + meters=logs, + btime=str(batch_time), + dtime=str(data_time), + ips=ips, + max_mem_reserved_str=max_mem_reserved_str, + max_mem_allocated_str=max_mem_allocated_str) + logger.info(fmt) + if mode == 'eval': + step_id = status['step_id'] + if step_id % 100 == 0: + 
logger.info("Eval iter: {}".format(step_id)) + + def on_epoch_end(self, status): + if dist.get_world_size() < 2 or dist.get_rank() == 0: + mode = status['mode'] + if mode == 'eval': + sample_num = status['sample_num'] + cost_time = status['cost_time'] + logger.info('Total sample number: {}, average FPS: {}'.format( + sample_num, sample_num / cost_time)) + + +class Checkpointer(Callback): + def __init__(self, model): + super(Checkpointer, self).__init__(model) + self.best_ap = -1000. + self.save_dir = os.path.join(self.model.cfg.save_dir, + self.model.cfg.filename) + if hasattr(self.model.model, 'student_model'): + self.weight = self.model.model.student_model + else: + self.weight = self.model.model + + def on_epoch_end(self, status): + # Checkpointer only performed during training + mode = status['mode'] + epoch_id = status['epoch_id'] + weight = None + save_name = None + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + end_epoch = self.model.cfg.epoch + if ( + epoch_id + 1 + ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: + save_name = str( + epoch_id) if epoch_id != end_epoch - 1 else "model_final" + weight = self.weight.state_dict() + elif mode == 'eval': + if 'save_best_model' in status and status['save_best_model']: + for metric in self.model._metrics: + map_res = metric.get_results() + eval_func = "ap" + if 'bbox' in map_res: + key = 'bbox' + elif 'keypoint' in map_res: + key = 'keypoint' + else: + key = 'mask' + if key not in map_res: + logger.warning("Evaluation results empty, this may be due to " \ + "training iterations being too few or not " \ + "loading the correct weights.") + return + if map_res[key][0] >= self.best_ap: + self.best_ap = map_res[key][0] + save_name = 'best_model' + weight = self.weight.state_dict() + logger.info("Best test {} {} is {:0.3f}.".format( + key, eval_func, abs(self.best_ap))) + if weight: + if self.model.use_ema: + exchange_save_model = status.get('exchange_save_model', + False) + if not exchange_save_model: + # save model and ema_model + save_model( + status['weight'], + self.model.optimizer, + self.save_dir, + save_name, + epoch_id + 1, + ema_model=weight) + else: + # save model(student model) and ema_model(teacher model) + # in DenseTeacher SSOD, the teacher model will be higher, + # so exchange when saving pdparams + student_model = status['weight'] # model + teacher_model = weight # ema_model + save_model( + teacher_model, + self.model.optimizer, + self.save_dir, + save_name, + epoch_id + 1, + ema_model=student_model) + del teacher_model + del student_model + else: + save_model(weight, self.model.optimizer, self.save_dir, + save_name, epoch_id + 1) + + +class VisualDLWriter(Callback): + """ + Use VisualDL to log data or image + """ + + def __init__(self, model): + super(VisualDLWriter, self).__init__(model) + + assert six.PY3, "VisualDL requires Python >= 3.5" + try: + from visualdl import LogWriter + except Exception as e: + logger.error('visualdl not found, plaese install visualdl. 
' + 'for example: `pip install visualdl`.') + raise e + self.vdl_writer = LogWriter( + model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')) + self.vdl_loss_step = 0 + self.vdl_mAP_step = 0 + self.vdl_image_step = 0 + self.vdl_image_frame = 0 + + def on_step_end(self, status): + mode = status['mode'] + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + training_staus = status['training_staus'] + for loss_name, loss_value in training_staus.get().items(): + self.vdl_writer.add_scalar(loss_name, loss_value, + self.vdl_loss_step) + self.vdl_loss_step += 1 + elif mode == 'test': + ori_image = status['original_image'] + result_image = status['result_image'] + self.vdl_writer.add_image( + "original/frame_{}".format(self.vdl_image_frame), ori_image, + self.vdl_image_step) + self.vdl_writer.add_image( + "result/frame_{}".format(self.vdl_image_frame), + result_image, self.vdl_image_step) + self.vdl_image_step += 1 + # each frame can display ten pictures at most. + if self.vdl_image_step % 10 == 0: + self.vdl_image_step = 0 + self.vdl_image_frame += 1 + + def on_epoch_end(self, status): + mode = status['mode'] + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'eval': + for metric in self.model._metrics: + for key, map_value in metric.get_results().items(): + self.vdl_writer.add_scalar("{}-mAP".format(key), + map_value[0], + self.vdl_mAP_step) + self.vdl_mAP_step += 1 + + +class WandbCallback(Callback): + def __init__(self, model): + super(WandbCallback, self).__init__(model) + + try: + import wandb + self.wandb = wandb + except Exception as e: + logger.error('wandb not found, please install wandb. ' + 'Use: `pip install wandb`.') + raise e + + self.wandb_params = model.cfg.get('wandb', None) + self.save_dir = os.path.join(self.model.cfg.save_dir, + self.model.cfg.filename) + if self.wandb_params is None: + self.wandb_params = {} + for k, v in model.cfg.items(): + if k.startswith("wandb_"): + self.wandb_params.update({k.lstrip("wandb_"): v}) + + self._run = None + if dist.get_world_size() < 2 or dist.get_rank() == 0: + _ = self.run + self.run.config.update(self.model.cfg) + self.run.define_metric("epoch") + self.run.define_metric("eval/*", step_metric="epoch") + + self.best_ap = -1000. + self.fps = [] + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + logger.info( + "There is an ongoing wandb run which will be used" + "for logging. 
Please use `wandb.finish()` to end that" + "if the behaviour is not intended") + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self.wandb_params) + return self._run + + def save_model(self, + optimizer, + save_dir, + save_name, + last_epoch, + ema_model=None, + ap=None, + fps=None, + tags=None): + if dist.get_world_size() < 2 or dist.get_rank() == 0: + model_path = os.path.join(save_dir, save_name) + metadata = {} + metadata["last_epoch"] = last_epoch + if ap: + metadata["ap"] = ap + + if fps: + metadata["fps"] = fps + + if ema_model is None: + ema_artifact = self.wandb.Artifact( + name="ema_model-{}".format(self.run.id), + type="model", + metadata=metadata) + model_artifact = self.wandb.Artifact( + name="model-{}".format(self.run.id), + type="model", + metadata=metadata) + + ema_artifact.add_file(model_path + ".pdema", name="model_ema") + model_artifact.add_file(model_path + ".pdparams", name="model") + + self.run.log_artifact(ema_artifact, aliases=tags) + self.run.log_artfact(model_artifact, aliases=tags) + else: + model_artifact = self.wandb.Artifact( + name="model-{}".format(self.run.id), + type="model", + metadata=metadata) + model_artifact.add_file(model_path + ".pdparams", name="model") + self.run.log_artifact(model_artifact, aliases=tags) + + def on_step_end(self, status): + + mode = status['mode'] + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + training_status = status['training_staus'].get() + for k, v in training_status.items(): + training_status[k] = float(v) + + # calculate ips, data_cost, batch_cost + batch_time = status['batch_time'] + data_time = status['data_time'] + batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( + ))]['batch_size'] + + ips = float(batch_size) / float(batch_time.avg) + data_cost = float(data_time.avg) + batch_cost = float(batch_time.avg) + + metrics = {"train/" + k: v for k, v in training_status.items()} + + metrics["train/ips"] = ips + metrics["train/data_cost"] = data_cost + metrics["train/batch_cost"] = batch_cost + + self.fps.append(ips) + self.run.log(metrics) + + def on_epoch_end(self, status): + mode = status['mode'] + epoch_id = status['epoch_id'] + save_name = None + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + fps = sum(self.fps) / len(self.fps) + self.fps = [] + + end_epoch = self.model.cfg.epoch + if ( + epoch_id + 1 + ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: + save_name = str( + epoch_id) if epoch_id != end_epoch - 1 else "model_final" + tags = ["latest", "epoch_{}".format(epoch_id)] + self.save_model( + self.model.optimizer, + self.save_dir, + save_name, + epoch_id + 1, + self.model.use_ema, + fps=fps, + tags=tags) + if mode == 'eval': + sample_num = status['sample_num'] + cost_time = status['cost_time'] + + fps = sample_num / cost_time + + merged_dict = {} + for metric in self.model._metrics: + for key, map_value in metric.get_results().items(): + merged_dict["eval/{}-mAP".format(key)] = map_value[0] + merged_dict["epoch"] = status["epoch_id"] + merged_dict["eval/fps"] = sample_num / cost_time + + self.run.log(merged_dict) + + if 'save_best_model' in status and status['save_best_model']: + for metric in self.model._metrics: + map_res = metric.get_results() + if 'bbox' in map_res: + key = 'bbox' + elif 'keypoint' in map_res: + key = 'keypoint' + else: + key = 'mask' + if key not in map_res: + logger.warning("Evaluation results empty, this may be due to " \ + "training iterations being too few or not " \ + "loading 
the correct weights.") + return + if map_res[key][0] >= self.best_ap: + self.best_ap = map_res[key][0] + save_name = 'best_model' + tags = ["best", "epoch_{}".format(epoch_id)] + + self.save_model( + self.model.optimizer, + self.save_dir, + save_name, + last_epoch=epoch_id + 1, + ema_model=self.model.use_ema, + ap=abs(self.best_ap), + fps=fps, + tags=tags) + + def on_train_end(self, status): + self.run.finish() diff --git a/ppdet/engine/env.py b/ppdet/engine/env.py new file mode 100644 index 0000000000000000000000000000000000000000..0a896571db8bee03f3fdb172443af88622a912bd --- /dev/null +++ b/ppdet/engine/env.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import numpy as np + +import paddle +from paddle.distributed import fleet + +__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] + + +def init_fleet_env(find_unused_parameters=False): + strategy = fleet.DistributedStrategy() + strategy.find_unused_parameters = find_unused_parameters + fleet.init(is_collective=True, strategy=strategy) + + +def init_parallel_env(): + env = os.environ + dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env + if dist: + trainer_id = int(env['PADDLE_TRAINER_ID']) + local_seed = (99 + trainer_id) + random.seed(local_seed) + np.random.seed(local_seed) + + paddle.distributed.init_parallel_env() + + +def set_random_seed(seed): + paddle.seed(seed) + random.seed(seed) + np.random.seed(seed) diff --git a/ppdet/engine/export_utils.py b/ppdet/engine/export_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..59d4399a5eb03d28efbf91900b688722365eb352 --- /dev/null +++ b/ppdet/engine/export_utils.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
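+
+# This module gathers the helpers used when exporting a trained model for
+# deployment: TRT_MIN_SUBGRAPH maps an architecture name to its TensorRT
+# min_subgraph_size, TO_STATIC_SPEC holds optional paddle.jit.to_static input
+# specs, and _dump_infer_config() writes the deploy-time YAML consumed by the
+# inference tools. The emitted file looks roughly like the sketch below
+# (exact keys and values depend on the architecture and reader config being
+# exported):
+#
+#   mode: paddle
+#   draw_threshold: 0.5
+#   metric: COCO
+#   use_dynamic_shape: false
+#   arch: YOLO
+#   min_subgraph_size: 3
+#   Preprocess:
+#   - type: Resize
+#     target_size: [640, 640]
+#     interp: 1
+#   - type: NormalizeImage
+#     ...
+#   label_list:
+#   - person
+#   - ...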
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import yaml +from collections import OrderedDict + +import paddle +from ppdet.data.source.category import get_categories + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +# Global dictionary +TRT_MIN_SUBGRAPH = { + 'YOLO': 3, + 'PPYOLOE': 10, + 'YOLOX': 20, + 'YOLOF': 40, + 'YOLOv5': 20, + 'RTMDet': 20, + 'YOLOv6': 10, + 'YOLOv7': 10, + 'YOLOv8': 10, + 'DETR': 3, +} + +TO_STATIC_SPEC = { + 'yolov5_l_300e_coco': None, + 'yolov7_l_300e_coco': None, + 'yolov3_darknet53_270e_coco': [{ + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'is_crowd': paddle.static.InputSpec( + name='is_crowd', shape=[-1, 50], dtype='float32'), + 'gt_bbox': paddle.static.InputSpec( + name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), + 'curr_iter': paddle.static.InputSpec( + name='curr_iter', shape=[-1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, -1, -1], dtype='float32'), + 'im_shape': paddle.static.InputSpec( + name='im_shape', shape=[-1, 2], dtype='float32'), + 'scale_factor': paddle.static.InputSpec( + name='scale_factor', shape=[-1, 2], dtype='float32'), + 'target0': paddle.static.InputSpec( + name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), + 'target1': paddle.static.InputSpec( + name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), + 'target2': paddle.static.InputSpec( + name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), + }], +} + + +def apply_to_static(config, model): + filename = config.get('filename', None) + spec = TO_STATIC_SPEC.get(filename, None) + model = paddle.jit.to_static(model, input_spec=spec) + logger.info("Successfully to apply @to_static with specs: {}".format(spec)) + return model + + +def _prune_input_spec(input_spec, program, targets): + # try to prune static program to figure out pruned input spec + # so we perform following operations in static mode + device = paddle.get_device() + paddle.enable_static() + paddle.set_device(device) + pruned_input_spec = [{}] + program = program.clone() + program = program._prune(targets=targets) + global_block = program.global_block() + for name, spec in input_spec[0].items(): + try: + v = global_block.var(name) + pruned_input_spec[0][name] = spec + except Exception: + pass + paddle.disable_static(place=device) + return pruned_input_spec + + +def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): + preprocess_list = [] + + anno_file = dataset_cfg.get_anno() + + clsid2catid, catid2name = get_categories(metric, anno_file, arch) + + label_list = [str(cat) for cat in catid2name.values()] + + fuse_normalize = reader_cfg.get('fuse_normalize', False) + sample_transforms = reader_cfg['sample_transforms'] + for st in sample_transforms[1:]: + for key, value in st.items(): + p = {'type': key} + if key == 'Resize': + if int(image_shape[1]) != -1: + value['target_size'] = image_shape[1:] + value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR + if fuse_normalize and key == 'NormalizeImage': + continue + p.update(value) + preprocess_list.append(p) + batch_transforms = reader_cfg.get('batch_transforms', None) + if batch_transforms: + for bt in batch_transforms: + for key, value in bt.items(): + # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride) + if key == 'PadBatch': + preprocess_list.append({ + 'type': 'PadStride', + 'stride': value['pad_to_stride'] + 
}) + break + + return preprocess_list, label_list + + +def _parse_tracker(tracker_cfg): + tracker_params = {} + for k, v in tracker_cfg.items(): + tracker_params.update({k: v}) + return tracker_params + + +def _dump_infer_config(config, path, image_shape, model): + arch_state = False + from ppdet.core.config.yaml_helpers import setup_orderdict + setup_orderdict() + use_dynamic_shape = True if image_shape[2] == -1 else False + infer_cfg = OrderedDict({ + 'mode': 'paddle', + 'draw_threshold': 0.5, + 'metric': config['metric'], + 'use_dynamic_shape': use_dynamic_shape + }) + export_onnx = config.get('export_onnx', False) + export_eb = config.get('export_eb', False) + + infer_arch = config['architecture'] + if 'RCNN' in infer_arch and export_onnx: + logger.warning( + "Exporting RCNN model to ONNX only support batch_size = 1") + infer_cfg['export_onnx'] = True + infer_cfg['export_eb'] = export_eb + + for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items(): + if arch in infer_arch: + infer_cfg['arch'] = arch + infer_cfg['min_subgraph_size'] = min_subgraph_size + arch_state = True + break + + if infer_arch == 'PPYOLOEWithAuxHead': + infer_arch = 'PPYOLOE' + + if infer_arch in [ + 'YOLOX', 'YOLOF', 'PPYOLOE', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8' + ]: + infer_cfg['arch'] = infer_arch + infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] + arch_state = True + + if not arch_state: + logger.error( + 'Architecture: {} is not supported for exporting model now.\n'. + format(infer_arch) + + 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') + os._exit(0) + if 'mask_head' in config[config['architecture']] and config[config[ + 'architecture']]['mask_head']: + infer_cfg['mask'] = True + label_arch = 'detection_arch' + + reader_cfg = config['TestReader'] + dataset_cfg = config['TestDataset'] + + infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader( + reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) + + yaml.dump(infer_cfg, open(path, 'w')) + logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/ppdet/engine/trainer.py b/ppdet/engine/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..c159f22eaa7e2662a0933ae21fd7f363b53f30d3 --- /dev/null +++ b/ppdet/engine/trainer.py @@ -0,0 +1,1134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
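+
+# Trainer is the single entry point used by the tools/*.py scripts: its
+# constructor builds the dataset and reader for the requested mode ('train',
+# 'eval' or 'test'), creates the model from cfg.architecture, and in train
+# mode additionally sets up the LR schedule, optimizer, optional AMP / EMA
+# wrappers, callbacks and metrics. A typical driver looks roughly like the
+# commented sketch below; load_weights() and train() follow the usual
+# PaddleDetection trainer interface and are not shown in this excerpt:
+#
+#   from ppdet.core.workspace import load_config
+#   from ppdet.engine import Trainer
+#
+#   cfg = load_config('configs/yolov3/yolov3_darknet53_270e_coco.yml')
+#   trainer = Trainer(cfg, mode='train')
+#   trainer.load_weights(cfg.pretrain_weights)  # optional warm start
+#   trainer.train(validate=True)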
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import copy +import time +from tqdm import tqdm + +import numpy as np +import typing +from PIL import Image, ImageOps, ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +import paddle +import paddle.nn as nn +import paddle.distributed as dist +from paddle.distributed import fleet +from paddle.static import InputSpec +from ppdet.optimizer import ModelEMA + +from ppdet.core.workspace import create +from ppdet.utils.checkpoint import load_weight, load_pretrain_weight +from ppdet.utils.visualizer import visualize_results, save_result +from ppdet.metrics import Metric, COCOMetric, VOCMetric, get_infer_results +from ppdet.data.source.category import get_categories +import ppdet.utils.stats as stats +from ppdet.utils.fuse_utils import fuse_conv_bn +from ppdet.utils import profiler +from ppdet.modeling.initializer import reset_initialized_parameter +from ppdet.modeling.post_process import multiclass_nms + +from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, VisualDLWriter, WandbCallback +from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static + +from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +__all__ = ['Trainer'] + + +class Trainer(object): + def __init__(self, cfg, mode='train'): + self.cfg = cfg.copy() + assert mode.lower() in ['train', 'eval', 'test'], \ + "mode should be 'train', 'eval' or 'test'" + self.mode = mode.lower() + self.optimizer = None + self.is_loaded_weights = False + self.use_amp = self.cfg.get('amp', False) + self.amp_level = self.cfg.get('amp_level', 'O1') + self.custom_white_list = self.cfg.get('custom_white_list', None) + self.custom_black_list = self.cfg.get('custom_black_list', None) + + if self.cfg.architecture in ['RTMDet', 'YOLOv6' + ] and self.mode == 'train': + raise NotImplementedError('{} training not supported yet.'.format( + self.cfg.architecture)) + if 'slim' in cfg and cfg['slim_type'] == 'PTQ': + self.cfg['TestDataset'] = create('TestDataset')() + + # build data loader + capital_mode = self.mode.capitalize() + if self.mode in ['train', 'eval', 'test']: + self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( + '{}Dataset'.format(capital_mode))() + + if self.mode == 'train': + self.loader = create('{}Reader'.format(capital_mode))( + self.dataset, cfg.worker_num) + + # build model + if 'model' not in self.cfg: + self.model = create(cfg.architecture) + else: + self.model = self.cfg.model + self.is_loaded_weights = True + + if self.cfg.architecture in ['YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8']: + reset_initialized_parameter(self.model) + self.model.yolo_head._initialize_biases() # Note: must added + + if cfg.architecture in [ + 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8' + ]: + for k, m in self.model.named_sublayers(): + if isinstance(m, nn.BatchNorm2D): + m._epsilon = 1e-3 # for amp(fp16) + m._momentum = 0.97 # 0.03 in pytorch + + #normalize params for deploy + if 'slim' in cfg and cfg['slim_type'] == 'OFA': + self.model.model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + elif 'slim' in cfg and cfg['slim_type'] == 'Distill': + self.model.student_model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + elif 'slim' in cfg and cfg[ + 'slim_type'] == 'DistillPrune' and self.mode == 'train': + 
self.model.student_model.load_meanstd(cfg['TestReader'][ + 'sample_transforms']) + else: + self.model.load_meanstd(cfg['TestReader']['sample_transforms']) + + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + reader_name = '{}Reader'.format(self.mode.capitalize()) + # If metric is VOC, need to be set collate_batch=False. + if cfg.metric == 'VOC': + self.cfg[reader_name]['collate_batch'] = False + self.loader = create(reader_name)(self.dataset, cfg.worker_num, + self._eval_batch_sampler) + # TestDataset build after user set images, skip loader creation here + + # get Params + print_params = self.cfg.get('print_params', False) + if print_params: + params = sum([ + p.numel() for n, p in self.model.named_parameters() + if all([x not in n for x in ['_mean', '_variance', 'aux_']]) + ]) # exclude BatchNorm running status + logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ + 0])) + + # build optimizer in train mode + if self.mode == 'train': + steps_per_epoch = len(self.loader) + if steps_per_epoch < 1: + logger.warning( + "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." + ) + self.lr = create('LearningRate')(steps_per_epoch) + self.optimizer = create('OptimizerBuilder')(self.lr, self.model) + + # Unstructured pruner is only enabled in the train mode. + if self.cfg.get('unstructured_prune'): + self.pruner = create('UnstructuredPruner')(self.model, + steps_per_epoch) + if self.use_amp and self.amp_level == 'O2': + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=self.amp_level) + self.use_ema = ('use_ema' in cfg and cfg['use_ema']) + if self.use_ema: + ema_decay = self.cfg.get('ema_decay', 0.9998) + ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') + cycle_epoch = self.cfg.get('cycle_epoch', -1) + ema_black_list = self.cfg.get('ema_black_list', None) + ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) + self.ema = ModelEMA( + self.model, + decay=ema_decay, + ema_decay_type=ema_decay_type, + cycle_epoch=cycle_epoch, + ema_black_list=ema_black_list, + ema_filter_no_grad=ema_filter_no_grad) + + self._nranks = dist.get_world_size() + self._local_rank = dist.get_rank() + + self.status = {} + + self.start_epoch = 0 + self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch + + # initial default callbacks + self._init_callbacks() + + # initial default metrics + self._init_metrics() + self._reset_metrics() + + def _init_callbacks(self): + if self.mode == 'train': + self._callbacks = [LogPrinter(self), Checkpointer(self)] + if self.cfg.get('use_vdl', False): + self._callbacks.append(VisualDLWriter(self)) + if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: + self._callbacks.append(WandbCallback(self)) + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'eval': + self._callbacks = [LogPrinter(self)] + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'test' and self.cfg.get('use_vdl', False): + self._callbacks = [VisualDLWriter(self)] + self._compose_callback = ComposeCallback(self._callbacks) + else: + self._callbacks = [] + self._compose_callback = None + + def _init_metrics(self, validate=False): + if self.mode == 'test' or (self.mode == 'train' and not validate): + self._metrics = [] + return + classwise = 
self.cfg['classwise'] if 'classwise' in self.cfg else False + if self.cfg.metric == 'COCO': + # TODO: bias should be unified + bias = 1 if self.cfg.get('bias', False) else 0 + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg.get('save_prediction_only', False) + + # pass clsid2catid info to metric instance to avoid multiple loading + # annotation file + clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ + if self.mode == 'eval' else None + + # when do validation in train, annotation file should be get from + # EvalReader instead of self.dataset(which is TrainReader) + if self.mode == 'train' and validate: + eval_dataset = self.cfg['EvalDataset'] + eval_dataset.check_or_download_dataset() + anno_file = eval_dataset.get_anno() + dataset = eval_dataset + else: + dataset = self.dataset + anno_file = dataset.get_anno() + + IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' + self._metrics = [ + COCOMetric( + anno_file=anno_file, + clsid2catid=clsid2catid, + classwise=classwise, + output_eval=output_eval, + bias=bias, + IouType=IouType, + save_prediction_only=save_prediction_only) + ] + elif self.cfg.metric == 'VOC': + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg.get('save_prediction_only', False) + + self._metrics = [ + VOCMetric( + label_list=self.dataset.get_label_list(), + class_num=self.cfg.num_classes, + map_type=self.cfg.map_type, + classwise=classwise, + output_eval=output_eval, + save_prediction_only=save_prediction_only) + ] + else: + logger.warning("Metric not support for metric type {}".format( + self.cfg.metric)) + self._metrics = [] + + def _reset_metrics(self): + for metric in self._metrics: + metric.reset() + + def register_callbacks(self, callbacks): + callbacks = [c for c in list(callbacks) if c is not None] + for c in callbacks: + assert isinstance(c, Callback), \ + "metrics shoule be instances of subclass of Metric" + self._callbacks.extend(callbacks) + self._compose_callback = ComposeCallback(self._callbacks) + + def register_metrics(self, metrics): + metrics = [m for m in list(metrics) if m is not None] + for m in metrics: + assert isinstance(m, Metric), \ + "metrics shoule be instances of subclass of Metric" + self._metrics.extend(metrics) + + def load_weights(self, weights): + if self.is_loaded_weights: + return + self.start_epoch = 0 + load_pretrain_weight(self.model, weights) + logger.debug("Load weights {} to start training".format(weights)) + + if self.mode in ['eval', 'test'] and self.cfg.architecture == 'YOLOv7': + self.model.yolo_head.fuse() + + def resume_weights(self, weights): + # support Distill resume weights + if hasattr(self.model, 'student_model'): + self.start_epoch = load_weight(self.model.student_model, weights, + self.optimizer) + else: + self.start_epoch = load_weight(self.model, weights, self.optimizer, + self.ema if self.use_ema else None) + logger.debug("Resume weights of epoch {}".format(self.start_epoch)) + + def train(self, validate=False): + assert self.mode == 'train', "Model not in 'train' mode" + Init_mark = False + if validate: + self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( + "EvalDataset")() + + model = self.model + if self.cfg.get('to_static', False): + model = apply_to_static(self.cfg, model) + if self.cfg.architecture == 'YOLOv5': + model.yolo_head.loss.to_static = True + sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and + (self.cfg.use_gpu or 
self.cfg.use_mlu) and self._nranks > 1) + if sync_bn: + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + # enabel auto mixed precision mode + if self.use_amp: + scaler = paddle.amp.GradScaler( + enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, + init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) + + # get distributed model + if self.cfg.get('fleet', False): + model = fleet.distributed_model(model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + model = paddle.DataParallel( + model, find_unused_parameters=find_unused_parameters) + + self.status.update({ + 'epoch_id': self.start_epoch, + 'step_id': 0, + 'steps_per_epoch': len(self.loader) + }) + + self.status['batch_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['data_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) + + if self.cfg.get('print_flops', False): + flops_loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, self.cfg.worker_num) + self._flops(flops_loader) + profiler_options = self.cfg.get('profiler_options', None) + + self._compose_callback.on_train_begin(self.status) + + use_fused_allreduce_gradients = self.cfg[ + 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False + + for epoch_id in range(self.start_epoch, self.cfg.epoch): + self.status['mode'] = 'train' + self.status['epoch_id'] = epoch_id + self._compose_callback.on_epoch_begin(self.status) + self.loader.dataset.set_epoch(epoch_id) + model.train() + iter_tic = time.time() + for step_id, data in enumerate(self.loader): + if self.cfg.architecture in [ + 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8' + ]: + # TODO: YOLOv5 Warmup, always 3 epoch + nw = 3 * len(self.loader) + ni = len(self.loader) * epoch_id + step_id + # yolov5 Warmup + if ni <= nw: + xi = [0, nw] + self.optimizer._momentum = np.interp(ni, xi, + [0.8, 0.937]) + self.optimizer._default_dict['momentum'] = np.interp( + ni, xi, [0.8, 0.937]) + + self.status['data_time'].update(time.time() - iter_tic) + self.status['step_id'] = step_id + profiler.add_profiler_step(profiler_options) + self._compose_callback.on_step_begin(self.status) + data['epoch_id'] = epoch_id + data['num_gpus'] = self._nranks + if self.cfg.get('to_static', + False) and 'image_file' in data.keys(): + data.pop('image_file') + + if self.use_amp: + if isinstance( + model, paddle. 
+ DataParallel) and use_fused_allreduce_gradients: + with model.no_sync(): + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or + self.cfg.use_npu or self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + fused_allreduce_gradients( + list(model.parameters()), None) + else: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + # in dygraph mode, optimizer.minimize is equal to optimizer.step + scaler.minimize(self.optimizer, scaled_loss) + else: + # model forward + outputs = model(data) + loss = outputs['loss'] + + # avoid some all_reduce timeout due to computation progress differs between xpu cards + if self._nranks > 1 and self.cfg.use_xpu: + tensor_for_all_reduce = paddle.to_tensor(1.0) + paddle.distributed.all_reduce(tensor_for_all_reduce) + + # model backward + loss.backward() + self.optimizer.step() + curr_lr = self.optimizer.get_lr() + self.lr.step() + if self.cfg.get('unstructured_prune'): + self.pruner.step() + self.optimizer.clear_grad() + self.status['learning_rate'] = curr_lr + + if self._nranks < 2 or self._local_rank == 0: + self.status['training_staus'].update(outputs) + + self.status['batch_time'].update(time.time() - iter_tic) + self._compose_callback.on_step_end(self.status) + if self.use_ema: + self.ema.update() + iter_tic = time.time() + + if self.cfg.get('unstructured_prune'): + self.pruner.update_params() + + is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ + and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) + if is_snapshot and self.use_ema: + # apply ema weight on model + weight = copy.deepcopy(self.model.state_dict()) + self.model.set_dict(self.ema.apply()) + self.status['weight'] = weight + + self._compose_callback.on_epoch_end(self.status) + + if validate and is_snapshot: + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + # If metric is VOC, need to be set collate_batch=False. 
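# --- Sketch: the AMP training step used above, in isolation ---
# The branch above wraps the forward pass in paddle.amp.auto_cast and runs the
# backward pass through a GradScaler (scale -> backward -> minimize). A minimal
# self-contained version of that pattern with a toy model and optimizer (both
# placeholders, not part of this diff):
import paddle
import paddle.nn as nn

use_gpu = paddle.is_compiled_with_cuda()
toy_model = nn.Linear(4, 2)
toy_opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=toy_model.parameters())
toy_scaler = paddle.amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

x = paddle.rand([8, 4])
with paddle.amp.auto_cast(enable=use_gpu, level='O1'):
    loss = toy_model(x).mean()              # model forward in mixed precision
scaled_loss = toy_scaler.scale(loss)        # scale to avoid fp16 gradient underflow
scaled_loss.backward()                      # model backward on the scaled loss
toy_scaler.minimize(toy_opt, scaled_loss)   # unscale, then optimizer step
toy_opt.clear_grad()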
+ if self.cfg.metric == 'VOC': + self.cfg['EvalReader']['collate_batch'] = False + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + # if validation in training is enabled, metrics should be re-init + # Init_mark makes sure this code will only execute once + if validate and Init_mark == False: + Init_mark = True + self._init_metrics(validate=validate) + self._reset_metrics() + + with paddle.no_grad(): + self.status['save_best_model'] = True + self._eval_with_loader(self._eval_loader) + + if is_snapshot and self.use_ema: + # reset original weight + self.model.set_dict(weight) + self.status.pop('weight') + + self._compose_callback.on_train_end(self.status) + + def _eval_with_loader(self, loader): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, self.cfg.worker_num, self._eval_batch_sampler) + self._flops(flops_loader) + eval_batch_size = self.cfg.EvalReader['batch_size'] + logger.info("Eval loader length is {}, eval batch_size is {}.".format( + len(loader), eval_batch_size)) + logger.info("Starting evaluation ......\n") + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) + + # update metrics + for metric in self._metrics: + metric.update(data, outs) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def evaluate(self): + # get distributed model + if self.cfg.get('fleet', False): + self.model = fleet.distributed_model(self.model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + self.model = paddle.DataParallel( + self.model, find_unused_parameters=find_unused_parameters) + with paddle.no_grad(): + self._eval_with_loader(self.loader) + + def _eval_with_loader_slice(self, + loader, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, self.cfg.worker_num, self._eval_batch_sampler) + self._flops(flops_loader) + + merged_bboxs = [] + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + 
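# --- Sketch: mapping slice detections back to the original image ---
# In the slice evaluation below, every prediction row is
# [class_id, score, x1, y1, x2, y2] and data['st_pix'] holds the (x, y) offset
# of the slice inside the original image, so the box coordinates are shifted
# before the per-slice results are concatenated and fused with multiclass_nms.
# A pure-numpy illustration (the sample values are made up):
import numpy as np

def shift_to_global(slice_bbox, st_pix):
    # slice_bbox: [N, 6] slice-local detections, st_pix: [x_offset, y_offset]
    out = slice_bbox.copy()
    out[:, 2:4] += st_pix   # shift (x1, y1)
    out[:, 4:6] += st_pix   # shift (x2, y2)
    return out

dets = np.array([[0., 0.9, 10., 20., 50., 60.]])
print(shift_to_global(dets, np.array([640., 0.])))
# [[  0.    0.9 650.   20.  690.   60. ]]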
self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) + + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." + ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + # update metrics + for metric in self._metrics: + metric.update(data, merged_results) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def evaluate_slice(self, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + with paddle.no_grad(): + self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, + combine_method, match_threshold, + match_metric) + + def slice_predict(self, + images, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou', + draw_threshold=0.5, + output_dir='output', + save_results=False, + visualize=True): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + self.dataset.set_slice_images(images, slice_size, overlap_ratio) + loader = create('TestReader')(self.dataset, 0) + imid2path = self.dataset.get_imid2path() + + def setup_metrics_for_loader(): + # mem + metrics = copy.deepcopy(self._metrics) + mode = self.mode + save_prediction_only = self.cfg[ + 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None + output_eval = self.cfg[ + 'output_eval'] if 'output_eval' in self.cfg else None + + # modify + self.mode = '_test' + self.cfg['save_prediction_only'] = True + self.cfg['output_eval'] = output_dir + self.cfg['imid2path'] = imid2path + self._init_metrics() + + # restore + self.mode = mode + self.cfg.pop('save_prediction_only') + if save_prediction_only is not None: + self.cfg['save_prediction_only'] = save_prediction_only + + self.cfg.pop('output_eval') + if output_eval is not None: + self.cfg['output_eval'] = output_eval + + self.cfg.pop('imid2path') + + _metrics = copy.deepcopy(self._metrics) + self._metrics = metrics + + return _metrics + + if save_results: + metrics = 
setup_metrics_for_loader() + else: + metrics = [] + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories( + self.cfg.metric, anno_file=anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('TestReader')(self.dataset, 0) + self._flops(flops_loader) + + results = [] # all images + merged_bboxs = [] # single image + for step_id, data in enumerate(tqdm(loader)): + self.status['step_id'] = step_id + # forward + outs = self.model(data) + + outs['bbox'] = outs['bbox'].numpy() # only in test mode + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." + ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + + for _m in metrics: + _m.update(data, merged_results) + + for key in ['im_shape', 'scale_factor', 'im_id']: + if isinstance(data, typing.Sequence): + merged_results[key] = data[0][key] + else: + merged_results[key] = data[key] + for key, value in merged_results.items(): + if hasattr(value, 'numpy'): + merged_results[key] = value.numpy() + results.append(merged_results) + + for _m in metrics: + _m.accumulate() + _m.reset() + + if visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + keypoint_res = batch_res['keypoint'][start:end] \ + if 'keypoint' in batch_res else None + pose3d_res = batch_res['pose3d'][start:end] \ + if 'pose3d' in batch_res else None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + pose3d_res, int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + + start = end + + def predict(self, + images, + draw_threshold=0.5, + output_dir='output', + save_results=False, + visualize=True): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + self.dataset.set_images(images) + loader = create('TestReader')(self.dataset, 0) + + imid2path = self.dataset.get_imid2path() + + def setup_metrics_for_loader(): + # 
mem + metrics = copy.deepcopy(self._metrics) + mode = self.mode + save_prediction_only = self.cfg[ + 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None + output_eval = self.cfg[ + 'output_eval'] if 'output_eval' in self.cfg else None + + # modify + self.mode = '_test' + self.cfg['save_prediction_only'] = True + self.cfg['output_eval'] = output_dir + self.cfg['imid2path'] = imid2path + self._init_metrics() + + # restore + self.mode = mode + self.cfg.pop('save_prediction_only') + if save_prediction_only is not None: + self.cfg['save_prediction_only'] = save_prediction_only + + self.cfg.pop('output_eval') + if output_eval is not None: + self.cfg['output_eval'] = output_eval + + self.cfg.pop('imid2path') + + _metrics = copy.deepcopy(self._metrics) + self._metrics = metrics + + return _metrics + + if save_results: + metrics = setup_metrics_for_loader() + else: + metrics = [] + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories( + self.cfg.metric, anno_file=anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('TestReader')(self.dataset, 0) + self._flops(flops_loader) + results = [] + test_batch_size = self.cfg.TestReader['batch_size'] + logger.info("Test loader length is {}, test batch_size is {}.".format( + len(loader), test_batch_size)) + logger.info("Starting predicting ......\n") + for step_id, data in enumerate(tqdm(loader)): + self.status['step_id'] = step_id + # forward + outs = self.model(data) + + for _m in metrics: + _m.update(data, outs) + + for key in ['im_shape', 'scale_factor', 'im_id']: + if isinstance(data, typing.Sequence): + outs[key] = data[0][key] + else: + outs[key] = data[key] + for key, value in outs.items(): + if hasattr(value, 'numpy'): + outs[key] = value.numpy() + results.append(outs) + + for _m in metrics: + _m.accumulate() + _m.reset() + + if visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + keypoint_res = batch_res['keypoint'][start:end] \ + if 'keypoint' in batch_res else None + pose3d_res = batch_res['pose3d'][start:end] \ + if 'pose3d' in batch_res else None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + pose3d_res, int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + + start = end + return results + + def _get_save_image_name(self, output_dir, image_path): + """ + Get save image name from source image path. 
+ """ + image_name = os.path.split(image_path)[-1] + name, ext = os.path.splitext(image_name) + return os.path.join(output_dir, "{}".format(name)) + ext + + def _get_infer_cfg_and_input_spec(self, + save_dir, + prune_input=True, + kl_quant=False): + image_shape = None + im_shape = [None, 2] + scale_factor = [None, 2] + test_reader_name = 'TestReader' + if 'inputs_def' in self.cfg[test_reader_name]: + inputs_def = self.cfg[test_reader_name]['inputs_def'] + image_shape = inputs_def.get('image_shape', None) + # set image_shape=[None, 3, -1, -1] as default + if image_shape is None: + image_shape = [None, 3, -1, -1] + + if len(image_shape) == 3: + image_shape = [None] + image_shape + else: + im_shape = [image_shape[0], 2] + scale_factor = [image_shape[0], 2] + + if hasattr(self.model, 'deploy'): + self.model.deploy = True + + if 'slim' not in self.cfg: + for layer in self.model.sublayers(): + if hasattr(layer, 'convert_to_deploy'): + layer.convert_to_deploy() + + if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ + 'export'] and self.cfg['export']['fuse_conv_bn']: + self.model = fuse_conv_bn(self.model) + + export_post_process = self.cfg['export'].get( + 'post_process', False) if hasattr(self.cfg, 'export') else True + export_nms = self.cfg['export'].get('nms', False) if hasattr( + self.cfg, 'export') else True + export_benchmark = self.cfg['export'].get( + 'benchmark', False) if hasattr(self.cfg, 'export') else False + if hasattr(self.model, 'fuse_norm'): + self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', + False) + if hasattr(self.model, 'export_post_process'): + self.model.export_post_process = export_post_process if not export_benchmark else False + if hasattr(self.model, 'export_nms'): + self.model.export_nms = export_nms if not export_benchmark else False + if export_post_process and not export_benchmark: + image_shape = [None] + image_shape[1:] + + # Save infer cfg + _dump_infer_config(self.cfg, + os.path.join(save_dir, 'infer_cfg.yml'), image_shape, + self.model) + + input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image'), + "im_shape": InputSpec( + shape=im_shape, name='im_shape'), + "scale_factor": InputSpec( + shape=scale_factor, name='scale_factor') + }] + + if prune_input: + static_model = paddle.jit.to_static( + self.model, input_spec=input_spec, full_graph=True) + # NOTE: dy2st do not pruned program, but jit.save will prune program + # input spec, prune input spec here and save with pruned input spec + pruned_input_spec = _prune_input_spec( + input_spec, static_model.forward.main_program, + static_model.forward.outputs) + else: + static_model = None + pruned_input_spec = input_spec + + # TODO: Hard code, delete it when support prune input_spec. 
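# --- Sketch: the dict-style InputSpec used for dynamic-to-static export ---
# The exporter above converts the detector with paddle.jit.to_static using a
# one-element list that maps input names to InputSpec objects, then prunes the
# spec to the inputs the traced program actually consumes (hence the PicoDet /
# benchmark special cases below). A toy version of that spec; the model here is
# a placeholder, not part of this diff:
import paddle
from paddle.static import InputSpec

class ToyDet(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.conv = paddle.nn.Conv2D(3, 8, 3, padding=1)

    def forward(self, inputs):
        # only 'image' is used here; after tracing, im_shape / scale_factor
        # would be pruned from the spec, as in the benchmark export path
        return self.conv(inputs['image'])

input_spec = [{
    'image': InputSpec(shape=[None, 3, -1, -1], name='image'),
    'im_shape': InputSpec(shape=[None, 2], name='im_shape'),
    'scale_factor': InputSpec(shape=[None, 2], name='scale_factor'),
}]
static_toy = paddle.jit.to_static(ToyDet(), input_spec=input_spec)
# paddle.jit.save(static_toy, 'output_inference_toy/model') would then write
# the static graph next to the infer_cfg.yml produced by _dump_infer_config.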
+ if self.cfg.architecture == 'PicoDet' and not export_post_process: + pruned_input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image') + }] + if kl_quant: + if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: + pruned_input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image'), + "scale_factor": InputSpec( + shape=scale_factor, name='scale_factor') + }] + elif 'tinypose' in self.cfg.weights: + pruned_input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image') + }] + + return static_model, pruned_input_spec + + def export(self, output_dir='output_inference'): + if hasattr(self.model, 'aux_neck'): + self.model.__delattr__('aux_neck') + if hasattr(self.model, 'aux_head'): + self.model.__delattr__('aux_head') + self.model.eval() + + if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ + 'export'] and self.cfg['export']['fuse_conv_bn']: + self.model = fuse_conv_bn(self.model) + + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] + save_dir = os.path.join(output_dir, model_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( + save_dir) + + # dy2st and save model + if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: + paddle.jit.save( + static_model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + else: + self.cfg.slim.save_quantized_model( + self.model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + logger.info("Export model and saved in {}".format(save_dir)) + + def post_quant(self, output_dir='output_inference'): + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] + save_dir = os.path.join(output_dir, model_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + for idx, data in enumerate(self.loader): + self.model(data) + if idx == int(self.cfg.get('quant_batch_num', 10)): + break + + # TODO: support prune input_spec + kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False + _, pruned_input_spec = self._get_infer_cfg_and_input_spec( + save_dir, prune_input=False, kl_quant=kl_quant) + + self.cfg.slim.save_quantized_model( + self.model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + logger.info("Export Post-Quant model and saved in {}".format(save_dir)) + + def _flops(self, loader): + if hasattr(self.model, 'aux_neck'): + self.model.__delattr__('aux_neck') + if hasattr(self.model, 'aux_head'): + self.model.__delattr__('aux_head') + self.model.eval() + try: + import paddleslim + except Exception as e: + logger.warning( + 'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`' + ) + return + + from paddleslim.analysis import dygraph_flops as flops + input_data = None + for data in loader: + input_data = data + break + + input_spec = [{ + "image": input_data['image'][0].unsqueeze(0), + "im_shape": input_data['im_shape'][0].unsqueeze(0), + "scale_factor": input_data['scale_factor'][0].unsqueeze(0) + }] + flops = flops(self.model, input_spec) / (1000**3) + logger.info(" Model FLOPs : {:.6f}G. (image shape is {})".format( + flops, input_data['image'][0].unsqueeze(0).shape)) diff --git a/ppdet/engine/trainer_ssod.py b/ppdet/engine/trainer_ssod.py new file mode 100644 index 0000000000000000000000000000000000000000..7cd8b7aaa45b1578d2dd3392da7254d34220e58d --- /dev/null +++ b/ppdet/engine/trainer_ssod.py @@ -0,0 +1,848 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import copy +import time +import typing +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.distributed as dist +from paddle.distributed import fleet +from ppdet.optimizer import ModelEMA, SimpleModelEMA +from ppdet.core.workspace import create +from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model +import ppdet.utils.stats as stats +from ppdet.utils import profiler +from ppdet.modeling.ssod.utils import align_weak_strong_shape +from .trainer import Trainer +from ppdet.utils.logger import setup_logger +from paddle.static import InputSpec +from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec +logger = setup_logger('ppdet.engine') + +__all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL'] + + +class Trainer_DenseTeacher(Trainer): + def __init__(self, cfg, mode='train'): + self.cfg = cfg + assert mode.lower() in ['train', 'eval', 'test'], \ + "mode should be 'train', 'eval' or 'test'" + self.mode = mode.lower() + self.optimizer = None + self.is_loaded_weights = False + self.use_amp = self.cfg.get('amp', False) + self.amp_level = self.cfg.get('amp_level', 'O1') + self.custom_white_list = self.cfg.get('custom_white_list', None) + self.custom_black_list = self.cfg.get('custom_black_list', None) + + # build data loader + capital_mode = self.mode.capitalize() + self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( + '{}Dataset'.format(capital_mode))() + + if self.mode == 'train': + self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( + 'UnsupTrainDataset') + self.loader = create('SemiTrainReader')( + self.dataset, self.dataset_unlabel, cfg.worker_num) + + # build model + if 'model' not in self.cfg: + self.model = create(cfg.architecture) + else: + self.model = self.cfg.model + self.is_loaded_weights = True + + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + # If metric is VOC, need to be set collate_batch=False. + if cfg.metric == 'VOC': + cfg['EvalReader']['collate_batch'] = False + self.loader = create('EvalReader')(self.dataset, cfg.worker_num, + self._eval_batch_sampler) + # TestDataset build after user set images, skip loader creation here + + # build optimizer in train mode + if self.mode == 'train': + steps_per_epoch = len(self.loader) + if steps_per_epoch < 1: + logger.warning( + "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." + ) + self.lr = create('LearningRate')(steps_per_epoch) + self.optimizer = create('OptimizerBuilder')(self.lr, self.model) + + # Unstructured pruner is only enabled in the train mode. 
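# --- Sketch: the EMA rule behind the teacher model ---
# DenseTeacher keeps the teacher as an exponential moving average of the
# student. The concrete ModelEMA / SimpleModelEMA classes live in
# ppdet.optimizer; the snippet below only shows the standard update rule such
# helpers are built around (an assumption for illustration, not code copied
# from this diff):
import paddle

def ema_update(teacher_state, student_state, decay=0.9996):
    # decay=0 copies the student into the teacher, which is how the EMA is
    # bootstrapped at ema_start_iters in the training loop further down
    for name, param in student_state.items():
        if paddle.is_floating_point(param):
            teacher_state[name] = decay * teacher_state[name] + (1.0 - decay) * param
    return teacher_state

student = paddle.nn.Linear(4, 4)
teacher = paddle.nn.Linear(4, 4)
teacher.set_state_dict(ema_update(teacher.state_dict(), student.state_dict(), decay=0))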
+ if self.cfg.get('unstructured_prune'): + self.pruner = create('UnstructuredPruner')(self.model, + steps_per_epoch) + if self.use_amp and self.amp_level == 'O2': + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=self.amp_level) + + self.use_ema = ('use_ema' in cfg and cfg['use_ema']) + if self.use_ema: + ema_decay = self.cfg.get('ema_decay', 0.9998) + ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') + cycle_epoch = self.cfg.get('cycle_epoch', -1) + ema_black_list = self.cfg.get('ema_black_list', None) + self.ema = ModelEMA( + self.model, + decay=ema_decay, + ema_decay_type=ema_decay_type, + cycle_epoch=cycle_epoch, + ema_black_list=ema_black_list) + self.ema_start_iters = self.cfg.get('ema_start_iters', 0) + + # simple_ema for SSOD + self.use_simple_ema = ('use_simple_ema' in cfg and + cfg['use_simple_ema']) + if self.use_simple_ema: + self.use_ema = True + ema_decay = self.cfg.get('ema_decay', 0.9996) + self.ema = SimpleModelEMA(self.model, decay=ema_decay) + self.ema_start_iters = self.cfg.get('ema_start_iters', 0) + + self._nranks = dist.get_world_size() + self._local_rank = dist.get_rank() + + self.status = {} + + self.start_epoch = 0 + self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch + + # initial default callbacks + self._init_callbacks() + + # initial default metrics + self._init_metrics() + self._reset_metrics() + + def load_weights(self, weights): + if self.is_loaded_weights: + return + self.start_epoch = 0 + load_pretrain_weight(self.model, weights) + load_pretrain_weight(self.ema.model, weights) + logger.info("Load weights {} to start training for teacher and student". + format(weights)) + + def resume_weights(self, weights, exchange=True): + # support Distill resume weights + if hasattr(self.model, 'student_model'): + self.start_epoch = load_weight(self.model.student_model, weights, + self.optimizer, exchange) + else: + self.start_epoch = load_weight(self.model, weights, self.optimizer, + self.ema + if self.use_ema else None, exchange) + logger.debug("Resume weights of epoch {}".format(self.start_epoch)) + + def train(self, validate=False): + self.semi_start_iters = self.cfg.get('semi_start_iters', 5000) + Init_mark = False + if validate: + self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( + "EvalDataset")() + + sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and + self.cfg.use_gpu and self._nranks > 1) + if sync_bn: + self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + self.model) + + if self.cfg.get('fleet', False): + self.model = fleet.distributed_model(self.model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + self.model = paddle.DataParallel( + self.model, find_unused_parameters=find_unused_parameters) + self.ema.model = paddle.DataParallel( + self.ema.model, find_unused_parameters=find_unused_parameters) + + self.status.update({ + 'epoch_id': self.start_epoch, + 'step_id': 0, + 'steps_per_epoch': len(self.loader), + 'exchange_save_model': True, + }) + # Note: exchange_save_model + # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams + + self.status['batch_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['data_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['training_staus'] = 
stats.TrainingStats(self.cfg.log_iter) + profiler_options = self.cfg.get('profiler_options', None) + self._compose_callback.on_train_begin(self.status) + + train_cfg = self.cfg.DenseTeacher['train_cfg'] + concat_sup_data = train_cfg.get('concat_sup_data', True) + + for param in self.ema.model.parameters(): + param.stop_gradient = True + + for epoch_id in range(self.start_epoch, self.cfg.epoch): + self.status['mode'] = 'train' + self.status['epoch_id'] = epoch_id + self._compose_callback.on_epoch_begin(self.status) + self.loader.dataset_label.set_epoch(epoch_id) + self.loader.dataset_unlabel.set_epoch(epoch_id) + iter_tic = time.time() + loss_dict = { + 'loss': paddle.to_tensor([0]), + 'loss_sup_sum': paddle.to_tensor([0]), + 'loss_unsup_sum': paddle.to_tensor([0]), + 'fg_sum': paddle.to_tensor([0]), + } + if self._nranks > 1: + for k in self.model._layers.get_loss_keys(): + loss_dict.update({k: paddle.to_tensor([0.])}) + for k in self.model._layers.get_loss_keys(): + loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) + else: + for k in self.model.get_loss_keys(): + loss_dict.update({k: paddle.to_tensor([0.])}) + for k in self.model.get_loss_keys(): + loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) + + # Note: for step_id, data in enumerate(self.loader): # enumerate bug + for step_id in range(len(self.loader)): + data = next(self.loader) + + self.model.train() + self.ema.model.eval() + data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data + + self.status['data_time'].update(time.time() - iter_tic) + self.status['step_id'] = step_id + profiler.add_profiler_step(profiler_options) + self._compose_callback.on_step_begin(self.status) + + if data_sup_w['image'].shape != data_sup_s['image'].shape: + data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, + data_sup_s) + + data_sup_w['epoch_id'] = epoch_id + data_sup_s['epoch_id'] = epoch_id + if concat_sup_data: + for k, v in data_sup_s.items(): + if k in ['epoch_id']: + continue + data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) + loss_dict_sup = self.model(data_sup_s) + else: + loss_dict_sup_w = self.model(data_sup_w) + loss_dict_sup = self.model(data_sup_s) + for k, v in loss_dict_sup_w.items(): + loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5 + + losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight'] + losses_sup.backward() + + losses = losses_sup.detach() + loss_dict.update(loss_dict_sup) + loss_dict.update({'loss_sup_sum': loss_dict['loss']}) + + curr_iter = len(self.loader) * epoch_id + step_id + st_iter = self.semi_start_iters + if curr_iter == st_iter: + logger.info("***" * 30) + logger.info('Semi starting ...') + logger.info("***" * 30) + if curr_iter > st_iter: + unsup_weight = train_cfg['unsup_weight'] + if train_cfg['suppress'] == 'linear': + tar_iter = st_iter * 2 + if curr_iter <= tar_iter: + unsup_weight *= (curr_iter - st_iter) / st_iter + elif train_cfg['suppress'] == 'exp': + tar_iter = st_iter + 2000 + if curr_iter <= tar_iter: + scale = np.exp((curr_iter - tar_iter) / 1000) + unsup_weight *= scale + elif train_cfg['suppress'] == 'step': + tar_iter = st_iter * 2 + if curr_iter <= tar_iter: + unsup_weight *= 0.25 + else: + raise ValueError + + if data_unsup_w['image'].shape != data_unsup_s[ + 'image'].shape: + data_unsup_w, data_unsup_s = align_weak_strong_shape( + data_unsup_w, data_unsup_s) + + data_unsup_w['epoch_id'] = epoch_id + data_unsup_s['epoch_id'] = epoch_id + + data_unsup_s['get_data'] = True + student_preds = self.model(data_unsup_s) + + with paddle.no_grad(): + 
data_unsup_w['is_teacher'] = True + teacher_preds = self.ema.model(data_unsup_w) + + train_cfg['curr_iter'] = curr_iter + train_cfg['st_iter'] = st_iter + if self._nranks > 1: + loss_dict_unsup = self.model._layers.get_ssod_loss( + student_preds, teacher_preds, train_cfg) + else: + loss_dict_unsup = self.model.get_ssod_loss( + student_preds, teacher_preds, train_cfg) + + fg_num = loss_dict_unsup["fg_sum"] + del loss_dict_unsup["fg_sum"] + distill_weights = train_cfg['loss_weight'] + loss_dict_unsup = { + k: v * distill_weights[k] + for k, v in loss_dict_unsup.items() + } + + losses_unsup = sum([ + metrics_value + for metrics_value in loss_dict_unsup.values() + ]) * unsup_weight + losses_unsup.backward() + + loss_dict.update(loss_dict_unsup) + loss_dict.update({'loss_unsup_sum': losses_unsup}) + losses += losses_unsup.detach() + loss_dict.update({"fg_sum": fg_num}) + loss_dict['loss'] = losses + + self.optimizer.step() + curr_lr = self.optimizer.get_lr() + self.lr.step() + self.optimizer.clear_grad() + self.status['learning_rate'] = curr_lr + if self._nranks < 2 or self._local_rank == 0: + self.status['training_staus'].update(loss_dict) + + self.status['batch_time'].update(time.time() - iter_tic) + self._compose_callback.on_step_end(self.status) + # Note: ema_start_iters + if self.use_ema and curr_iter == self.ema_start_iters: + logger.info("***" * 30) + logger.info('EMA starting ...') + logger.info("***" * 30) + self.ema.update(self.model, decay=0) + elif self.use_ema and curr_iter > self.ema_start_iters: + self.ema.update(self.model) + iter_tic = time.time() + + is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ + and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) + if is_snapshot and self.use_ema: + # apply ema weight on model + weight = copy.deepcopy(self.ema.model.state_dict()) + for k, v in weight.items(): + if paddle.is_floating_point(v): + weight[k].stop_gradient = True + self.status['weight'] = weight + + self._compose_callback.on_epoch_end(self.status) + + if validate and is_snapshot: + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + # If metric is VOC, need to be set collate_batch=False. 
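# --- Sketch: the unsupervised-loss warm-up schedule used above ---
# After semi_start_iters, the unsupervised weight is ramped according to
# train_cfg['suppress']. The same three branches as above, pulled out into a
# standalone function (the function name is illustrative):
import numpy as np

def ramp_unsup_weight(base_weight, curr_iter, st_iter, suppress='linear'):
    weight = base_weight
    if suppress == 'linear':        # linear ramp over [st_iter, 2 * st_iter]
        if curr_iter <= st_iter * 2:
            weight *= (curr_iter - st_iter) / st_iter
    elif suppress == 'exp':         # exponential ramp up to st_iter + 2000
        tar_iter = st_iter + 2000
        if curr_iter <= tar_iter:
            weight *= np.exp((curr_iter - tar_iter) / 1000)
    elif suppress == 'step':        # constant 0.25 factor until 2 * st_iter
        if curr_iter <= st_iter * 2:
            weight *= 0.25
    else:
        raise ValueError('unknown suppress mode: {}'.format(suppress))
    return weight

assert ramp_unsup_weight(4.0, curr_iter=7500, st_iter=5000) == 2.0  # halfway through the linear ramp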
+ if self.cfg.metric == 'VOC': + self.cfg['EvalReader']['collate_batch'] = False + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + # if validation in training is enabled, metrics should be re-init + # Init_mark makes sure this code will only execute once + if validate and Init_mark == False: + Init_mark = True + self._init_metrics(validate=validate) + self._reset_metrics() + + with paddle.no_grad(): + self.status['save_best_model'] = True + self._eval_with_loader(self._eval_loader) + + if is_snapshot and self.use_ema: + self.status.pop('weight') + + self._compose_callback.on_train_end(self.status) + + def evaluate(self): + # get distributed model + if self.cfg.get('fleet', False): + self.model = fleet.distributed_model(self.model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + self.model = paddle.DataParallel( + self.model, find_unused_parameters=find_unused_parameters) + with paddle.no_grad(): + self._eval_with_loader(self.loader) + + def _eval_with_loader(self, loader): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + + test_cfg = self.cfg.DenseTeacher['test_cfg'] + if test_cfg['inference_on'] == 'teacher': + logger.info("***** teacher model evaluating *****") + eval_model = self.ema.model + else: + logger.info("***** student model evaluating *****") + eval_model = self.model + + eval_model.eval() + if self.cfg.get('print_flops', False): + flops_loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, self.cfg.worker_num, self._eval_batch_sampler) + self._flops(flops_loader) + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = eval_model(data) + else: + outs = eval_model(data) + + # update metrics + for metric in self._metrics: + metric.update(data, outs) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + self._reset_metrics() + + +class Trainer_ARSL(Trainer): + def __init__(self, cfg, mode='train'): + self.cfg = cfg + assert mode.lower() in ['train', 'eval', 'test'], \ + "mode should be 'train', 'eval' or 'test'" + self.mode = mode.lower() + self.optimizer = None + self.is_loaded_weights = False + capital_mode = self.mode.capitalize() + self.use_ema = False + self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( + '{}Dataset'.format(capital_mode))() + if self.mode == 'train': + self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( + 'UnsupTrainDataset') + self.loader = create('SemiTrainReader')( + self.dataset, self.dataset_unlabel, cfg.worker_num) + + # build model + if 'model' not in self.cfg: + 
self.student_model = create(cfg.architecture) + self.teacher_model = create(cfg.architecture) + self.model = EnsembleTSModel(self.teacher_model, self.student_model) + else: + self.model = self.cfg.model + self.is_loaded_weights = True + # save path for burn-in model + self.base_path = cfg.get('weights') + self.base_path = os.path.dirname(self.base_path) + + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + self.loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, cfg.worker_num, self._eval_batch_sampler) + # TestDataset build after user set images, skip loader creation here + + self.start_epoch = 0 + self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch + self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint + + # build optimizer in train mode + if self.mode == 'train': + steps_per_epoch = self.epoch_iter + self.lr = create('LearningRate')(steps_per_epoch) + self.optimizer = create('OptimizerBuilder')(self.lr, + self.model.modelStudent) + + self._nranks = dist.get_world_size() + self._local_rank = dist.get_rank() + + self.status = {} + + # initial default callbacks + self._init_callbacks() + + # initial default metrics + self._init_metrics() + self._reset_metrics() + self.iter = 0 + + def resume_weights(self, weights): + # support Distill resume weights + if hasattr(self.model, 'student_model'): + self.start_epoch = load_weight(self.model.student_model, weights, + self.optimizer) + else: + self.start_epoch = load_weight(self.model, weights, self.optimizer) + logger.debug("Resume weights of epoch {}".format(self.start_epoch)) + + def train(self, validate=False): + assert self.mode == 'train', "Model not in 'train' mode" + Init_mark = False + + # if validation in training is enabled, metrics should be re-init + if validate: + self._init_metrics(validate=validate) + self._reset_metrics() + + if self.cfg.get('fleet', False): + self.model.modelStudent = fleet.distributed_model( + self.model.modelStudent) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + self.model.modelStudent = paddle.DataParallel( + self.model.modelStudent, + find_unused_parameters=find_unused_parameters) + + # set fixed iter in each epoch to control checkpoint + self.status.update({ + 'epoch_id': self.start_epoch, + 'step_id': 0, + 'steps_per_epoch': self.epoch_iter + }) + print('338 Len of DataLoader: {}'.format(len(self.loader))) + + self.status['batch_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['data_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) + + self._compose_callback.on_train_begin(self.status) + + epoch_id = self.start_epoch + self.iter = self.start_epoch * self.epoch_iter + # use iter rather than epoch to control training schedule + while self.iter < self.cfg.max_iter: + # epoch loop + self.status['mode'] = 'train' + self.status['epoch_id'] = epoch_id + self._compose_callback.on_epoch_begin(self.status) + self.loader.dataset_label.set_epoch(epoch_id) + self.loader.dataset_unlabel.set_epoch(epoch_id) + paddle.device.cuda.empty_cache() # clear GPU memory + # set model status + 
self.model.modelStudent.train() + self.model.modelTeacher.eval() + iter_tic = time.time() + + # iter loop in each eopch + for step_id in range(self.epoch_iter): + data = next(self.loader) + self.status['data_time'].update(time.time() - iter_tic) + self.status['step_id'] = step_id + # profiler.add_profiler_step(profiler_options) + self._compose_callback.on_step_begin(self.status) + + # model forward and calculate loss + loss_dict = self.run_step_full_semisup(data) + + if (step_id + 1) % self.cfg.optimize_rate == 0: + self.optimizer.step() + self.optimizer.clear_grad() + curr_lr = self.optimizer.get_lr() + self.lr.step() + + # update log status + self.status['learning_rate'] = curr_lr + if self._nranks < 2 or self._local_rank == 0: + self.status['training_staus'].update(loss_dict) + self.status['batch_time'].update(time.time() - iter_tic) + self._compose_callback.on_step_end(self.status) + self.iter += 1 + iter_tic = time.time() + + self._compose_callback.on_epoch_end(self.status) + + if validate and (self._nranks < 2 or self._local_rank == 0) \ + and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ + or epoch_id == self.end_epoch - 1): + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + if validate and Init_mark == False: + Init_mark = True + self._init_metrics(validate=validate) + self._reset_metrics() + with paddle.no_grad(): + self.status['save_best_model'] = True + # before burn-in stage, eval student. after burn-in stage, eval teacher + if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']: + print("start eval student model") + self._eval_with_loader( + self._eval_loader, mode="student") + else: + print("start eval teacher model") + self._eval_with_loader( + self._eval_loader, mode="teacher") + + epoch_id += 1 + + self._compose_callback.on_train_end(self.status) + + def merge_data(self, data1, data2): + data = copy.deepcopy(data1) + for k, v in data1.items(): + if type(v) is paddle.Tensor: + data[k] = paddle.concat(x=[data[k], data2[k]], axis=0) + elif type(v) is list: + data[k].extend(data2[k]) + return data + + def run_step_full_semisup(self, data): + label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data + data_merge = self.merge_data(label_data_k, label_data_q) + loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised") + loss_dict = {} + for key in loss_sup_dict.keys(): + if key[:4] == "loss": + loss_dict[key] = loss_sup_dict[key] * 1 + losses_sup = paddle.add_n(list(loss_dict.values())) + # norm loss when using gradient accumulation + losses_sup = losses_sup / self.cfg.optimize_rate + losses_sup.backward() + + for key in loss_sup_dict.keys(): + loss_dict[key + "_pseudo"] = paddle.to_tensor([0]) + loss_dict["loss_tot"] = losses_sup + """ + semi-supervised training after burn-in stage + """ + if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']: + # init teacher model with burn-up weight + if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']: + print( + 'Starting semi-supervised learning and load the teacher model.' 
+ ) + self._update_teacher_model(keep_rate=0.00) + # save burn-in model + if dist.get_world_size() < 2 or dist.get_rank() == 0: + print('saving burn-in model.') + save_name = 'burnIn' + epoch_id = self.iter // self.epoch_iter + save_model(self.model, self.optimizer, self.base_path, + save_name, epoch_id) + # Update teacher model with EMA + elif (self.iter + 1) % self.cfg.optimize_rate == 0: + self._update_teacher_model( + keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE']) + + #warm-up weight for pseudo loss + pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT'] + pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS'] + temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP'] + if temp <= pseudo_warmup_iter: + pseudo_weight *= (temp / pseudo_warmup_iter) + + # get teacher predictions on weak-augmented unlabeled data + with paddle.no_grad(): + teacher_pred = self.model.modelTeacher( + unlabel_data_k, branch='semi_supervised') + + # calculate unsupervised loss on strong-augmented unlabeled data + loss_unsup_dict = self.model.modelStudent( + unlabel_data_q, + branch="semi_supervised", + teacher_prediction=teacher_pred, ) + + for key in loss_unsup_dict.keys(): + if key[-6:] == "pseudo": + loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight + losses_unsup = paddle.add_n(list(loss_unsup_dict.values())) + # norm loss when using gradient accumulation + losses_unsup = losses_unsup / self.cfg.optimize_rate + losses_unsup.backward() + + loss_dict.update(loss_unsup_dict) + loss_dict["loss_tot"] += losses_unsup + return loss_dict + + def export(self, output_dir='output_inference'): + self.model.eval() + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] + save_dir = os.path.join(output_dir, model_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + image_shape = None + test_reader_name = 'TestReader' + if 'inputs_def' in self.cfg[test_reader_name]: + inputs_def = self.cfg[test_reader_name]['inputs_def'] + image_shape = inputs_def.get('image_shape', None) + # set image_shape=[3, -1, -1] as default + if image_shape is None: + image_shape = [3, -1, -1] + + self.model.modelTeacher.eval() + if hasattr(self.model.modelTeacher, 'deploy'): + self.model.modelTeacher.deploy = True + + # Save infer cfg + _dump_infer_config(self.cfg, + os.path.join(save_dir, 'infer_cfg.yml'), image_shape, + self.model.modelTeacher) + + input_spec = [{ + "image": InputSpec( + shape=[None] + image_shape, name='image'), + "im_shape": InputSpec( + shape=[None, 2], name='im_shape'), + "scale_factor": InputSpec( + shape=[None, 2], name='scale_factor') + }] + + static_model = paddle.jit.to_static( + self.model.modelTeacher, input_spec=input_spec) + # NOTE: dy2st do not pruned program, but jit.save will prune program + # input spec, prune input spec here and save with pruned input spec + pruned_input_spec = _prune_input_spec(input_spec, + static_model.forward.main_program, + static_model.forward.outputs) + + # dy2st and save model + if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': + paddle.jit.save( + static_model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + else: + self.cfg.slim.save_quantized_model( + self.model.modelTeacher, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + logger.info("Export model and saved in {}".format(save_dir)) + + def _eval_with_loader(self, loader, mode="teacher"): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + # self.model.eval() + 
self.model.modelTeacher.eval() + self.model.modelStudent.eval() + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + if mode == "teacher": + outs = self.model.modelTeacher(data) + else: + outs = self.model.modelStudent(data) + + # update metrics + for metric in self._metrics: + metric.update(data, outs) + + sample_num += data['im_id'].numpy().shape[0] + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def evaluate(self): + with paddle.no_grad(): + self._eval_with_loader(self.loader) + + @paddle.no_grad() + def _update_teacher_model(self, keep_rate=0.996): + student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict()) + new_teacher_dict = dict() + for key, value in self.model.modelTeacher.state_dict().items(): + if key in student_model_dict.keys(): + v = student_model_dict[key] * (1 - keep_rate + ) + value * keep_rate + v.stop_gradient = True + new_teacher_dict[key] = v + else: + raise Exception("{} is not found in student model".format(key)) + + self.model.modelTeacher.set_dict(new_teacher_dict) + + +class EnsembleTSModel(nn.Layer): + def __init__(self, modelTeacher, modelStudent): + super(EnsembleTSModel, self).__init__() + self.modelTeacher = modelTeacher + self.modelStudent = modelStudent diff --git a/ppdet/metrics/__init__.py b/ppdet/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6065af49433cc4c9d49ca8debb9dc9b136232f87 --- /dev/null +++ b/ppdet/metrics/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import metrics + +from .metrics import * diff --git a/ppdet/metrics/coco_utils.py b/ppdet/metrics/coco_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..47b92bc628be1d80aaf45a4133f81ccafc723da8 --- /dev/null +++ b/ppdet/metrics/coco_utils.py @@ -0,0 +1,184 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
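Before the metrics utilities below, a framework-agnostic sketch (plain numpy, illustrative names) of the exponential-moving-average rule that `_update_teacher_model` above applies key-by-key to the teacher's state dict; `keep_rate=0.0` at the burn-in step copies the student outright, while a large keep rate afterwards moves the teacher slowly.

import numpy as np

def ema_update(teacher_state, student_state, keep_rate=0.996):
    """Blend student weights into the teacher, key by key (EMA)."""
    new_state = {}
    for key, t_val in teacher_state.items():
        if key not in student_state:
            raise KeyError("{} is not found in student model".format(key))
        new_state[key] = student_state[key] * (1.0 - keep_rate) + t_val * keep_rate
    return new_state

# toy usage: keep_rate=0.0 reproduces the burn-in initialisation above
teacher = {"w": np.zeros(3)}
student = {"w": np.ones(3)}
print(ema_update(teacher, student, keep_rate=0.0)["w"])    # -> [1. 1. 1.]
print(ema_update(teacher, student, keep_rate=0.996)["w"])  # -> [0.004 0.004 0.004]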
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import numpy as np +import itertools + +from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res +from ppdet.metrics.map_utils import draw_pr_curve + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +def get_infer_results(outs, catid, bias=0): + """ + Get result at the stage of inference. + The output format is dictionary containing bbox or mask result. + + For example, bbox result is a list and each element contains + image_id, category_id, bbox and score. + """ + if outs is None or len(outs) == 0: + raise ValueError( + 'The number of valid detection result if zero. Please use reasonable model and check input data.' + ) + + im_id = outs['im_id'] + + infer_res = {} + if 'bbox' in outs: + if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6: + infer_res['bbox'] = get_det_poly_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + else: + infer_res['bbox'] = get_det_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + + if 'mask' in outs: + # mask post process + infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'], + outs['bbox_num'], im_id, catid) + + if 'segm' in outs: + infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid) + + if 'keypoint' in outs: + infer_res['keypoint'] = get_keypoint_res(outs, im_id) + outs['bbox_num'] = [len(infer_res['keypoint'])] + + return infer_res + + +def cocoapi_eval(jsonfile, + style, + coco_gt=None, + anno_file=None, + max_dets=(100, 300, 1000), + classwise=False, + sigmas=None, + use_area=True): + """ + Args: + jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. + style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. + coco_gt (str): Whether to load COCOAPI through anno_file, + eg: coco_gt = COCO(anno_file) + anno_file (str): COCO annotations file. + max_dets (tuple): COCO evaluation maxDets. + classwise (bool): Whether per-category AP and draw P-R Curve or not. + sigmas (nparray): keypoint labelling sigmas. + use_area (bool): If gt annotations (eg. CrowdPose, AIC) + do not have 'area', please set use_area=False. + """ + assert coco_gt != None or anno_file != None + if style == 'keypoints_crowd': + #please install xtcocotools==1.6 + from xtcocotools.coco import COCO + from xtcocotools.cocoeval import COCOeval + else: + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + if coco_gt == None: + coco_gt = COCO(anno_file) + logger.info("Start evaluate...") + coco_dt = coco_gt.loadRes(jsonfile) + if style == 'proposal': + coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') + coco_eval.params.useCats = 0 + coco_eval.params.maxDets = list(max_dets) + elif style == 'keypoints_crowd': + coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area) + else: + coco_eval = COCOeval(coco_gt, coco_dt, style) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if classwise: + # Compute per-category AP and PR curve + try: + from terminaltables import AsciiTable + except Exception as e: + logger.error( + 'terminaltables not found, plaese install terminaltables. 
' + 'for example: `pip install terminaltables`.') + raise e + precisions = coco_eval.eval['precision'] + cat_ids = coco_gt.getCatIds() + # precision: (iou, recall, cls, area range, max dets) + assert len(cat_ids) == precisions.shape[2] + results_per_category = [] + for idx, catId in enumerate(cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = coco_gt.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (str(nm["name"]), '{:0.3f}'.format(float(ap)))) + pr_array = precisions[0, :, idx, 0, 2] + recall_array = np.arange(0.0, 1.01, 0.01) + draw_pr_curve( + pr_array, + recall_array, + out_dir=style + '_pr_curve', + file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest( + *[results_flatten[i::num_columns] for i in range(num_columns)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) + logger.info("per-category PR curve has output to {} folder.".format( + style + '_pr_curve')) + # flush coco evaluation result + sys.stdout.flush() + return coco_eval.stats + + +def json_eval_results(metric, json_directory, dataset): + """ + cocoapi eval with already exists proposal.json, bbox.json or mask.json + """ + assert metric == 'COCO' + anno_file = dataset.get_anno() + json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] + if json_directory: + assert os.path.exists( + json_directory), "The json directory:{} does not exist".format( + json_directory) + for k, v in enumerate(json_file_list): + json_file_list[k] = os.path.join(str(json_directory), v) + + coco_eval_style = ['proposal', 'bbox', 'segm'] + for i, v_json in enumerate(json_file_list): + if os.path.exists(v_json): + cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) + else: + logger.info("{} not exists!".format(v_json)) diff --git a/ppdet/metrics/json_results.py b/ppdet/metrics/json_results.py new file mode 100644 index 0000000000000000000000000000000000000000..93354ec1fc592b1567b5f0a3e2044a215d231a30 --- /dev/null +++ b/ppdet/metrics/json_results.py @@ -0,0 +1,159 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
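The json_results helpers that follow all perform variations of one conversion: a model's `[class_id, score, x1, y1, x2, y2]` rows become COCO-style `{image_id, category_id, bbox(x, y, w, h), score}` records. A minimal, self-contained sketch of that conversion under illustrative names (only the call in `get_det_res` is the module's own):

import numpy as np

def xyxy_rows_to_coco(bboxes, image_id, label_to_cat_id_map, bias=0):
    """Convert [cls, score, xmin, ymin, xmax, ymax] rows to COCO dicts."""
    records = []
    for row in np.asarray(bboxes, dtype=float):
        num_id, score, xmin, ymin, xmax, ymax = row.tolist()
        if int(num_id) < 0:  # padded / invalid detections are skipped
            continue
        records.append({
            'image_id': int(image_id),
            'category_id': label_to_cat_id_map[int(num_id)],
            'bbox': [xmin, ymin, xmax - xmin + bias, ymax - ymin + bias],
            'score': score,
        })
    return records

# toy usage: one detection of class 0, mapped to COCO category id 1
print(xyxy_rows_to_coco([[0, 0.9, 10, 20, 50, 80]], image_id=42,
                        label_to_cat_id_map={0: 1}))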
+import six +import numpy as np + + +def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, xmin, ymin, xmax, ymax = dt.tolist() + if int(num_id) < 0: + continue + category_id = label_to_cat_id_map[int(num_id)] + w = xmax - xmin + bias + h = ymax - ymin + bias + bbox = [xmin, ymin, w, h] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': bbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() + if int(num_id) < 0: + continue + category_id = label_to_cat_id_map[int(num_id)] + rbox = [x1, y1, x2, y2, x3, y3, x4, y4] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': rbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def strip_mask(mask): + row = mask[0, 0, :] + col = mask[0, :, 0] + im_h = len(col) - np.count_nonzero(col == -1) + im_w = len(row) - np.count_nonzero(row == -1) + return mask[:, :im_h, :im_w] + + +def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): + import pycocotools.mask as mask_util + seg_res = [] + k = 0 + for i in range(len(mask_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = mask_nums[i] + mask_i = masks[k:k + det_nums] + mask_i = strip_mask(mask_i) + for j in range(det_nums): + mask = mask_i[j].astype(np.uint8) + score = float(bboxes[k][1]) + label = int(bboxes[k][0]) + k = k + 1 + if label == -1: + continue + cat_id = label_to_cat_id_map[label] + rle = mask_util.encode( + np.array( + mask[:, :, None], order="F", dtype="uint8"))[0] + if six.PY3: + if 'counts' in rle: + rle['counts'] = rle['counts'].decode("utf8") + sg_res = { + 'image_id': cur_image_id, + 'category_id': cat_id, + 'segmentation': rle, + 'score': score + } + seg_res.append(sg_res) + return seg_res + + +def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): + import pycocotools.mask as mask_util + segm_res = [] + # for each batch + segms = results['segm'].astype(np.uint8) + clsid_labels = results['cate_label'] + clsid_scores = results['cate_score'] + lengths = segms.shape[0] + im_id = int(image_id[0][0]) + if lengths == 0 or segms is None: + return None + # for each sample + for i in range(lengths - 1): + clsid = int(clsid_labels[i]) + catid = num_id_to_cat_id_map[clsid] + score = float(clsid_scores[i]) + mask = segms[i] + segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] + segm['counts'] = segm['counts'].decode('utf8') + coco_res = { + 'image_id': im_id, + 'category_id': catid, + 'segmentation': segm, + 'score': score + } + segm_res.append(coco_res) + return segm_res + + +def get_keypoint_res(results, im_id): + anns = [] + preds = results['keypoint'] + for idx in range(im_id.shape[0]): + image_id = im_id[idx].item() + kpts, scores = preds[idx] + for kpt, score in zip(kpts, scores): + kpt = kpt.flatten() + ann = { + 'image_id': image_id, + 'category_id': 1, # XXX hard code + 'keypoints': kpt.tolist(), + 'score': float(score) + } + x = kpt[0::3] + y = kpt[1::3] + x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), 
np.min(y).item( + ), np.max(y).item() + ann['area'] = (x1 - x0) * (y1 - y0) + ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] + anns.append(ann) + return anns diff --git a/ppdet/metrics/map_utils.py b/ppdet/metrics/map_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9eef48a8d15cb857da55ed2069bd4e9bebdf5c9c --- /dev/null +++ b/ppdet/metrics/map_utils.py @@ -0,0 +1,311 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import numpy as np +import itertools + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'draw_pr_curve', + 'bbox_area', + 'jaccard_overlap', + 'prune_zero_padding', + 'DetectionMAP', +] + + +def draw_pr_curve(precision, + recall, + iou=0.5, + out_dir='pr_curve', + file_name='precision_recall_curve.jpg'): + if not os.path.exists(out_dir): + os.makedirs(out_dir) + output_path = os.path.join(out_dir, file_name) + try: + import matplotlib.pyplot as plt + except Exception as e: + logger.error('Matplotlib not found, plaese install matplotlib.' + 'for example: `pip install matplotlib`.') + raise e + plt.cla() + plt.figure('P-R Curve') + plt.title('Precision/Recall Curve(IoU={})'.format(iou)) + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.grid(True) + plt.plot(recall, precision) + plt.savefig(output_path) + + +def bbox_area(bbox, is_bbox_normalized): + """ + Calculate area of a bounding box + """ + norm = 1. - float(is_bbox_normalized) + width = bbox[2] - bbox[0] + norm + height = bbox[3] - bbox[1] + norm + return width * height + + +def jaccard_overlap(pred, gt, is_bbox_normalized=False): + """ + Calculate jaccard overlap ratio between two bounding box + """ + if pred[0] >= gt[2] or pred[2] <= gt[0] or \ + pred[1] >= gt[3] or pred[3] <= gt[1]: + return 0. + inter_xmin = max(pred[0], gt[0]) + inter_ymin = max(pred[1], gt[1]) + inter_xmax = min(pred[2], gt[2]) + inter_ymax = min(pred[3], gt[3]) + inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], + is_bbox_normalized) + pred_size = bbox_area(pred, is_bbox_normalized) + gt_size = bbox_area(gt, is_bbox_normalized) + overlap = float(inter_size) / (pred_size + gt_size - inter_size) + return overlap + + +def prune_zero_padding(gt_box, gt_label, difficult=None): + valid_cnt = 0 + for i in range(len(gt_box)): + if (gt_box[i] == 0).all(): + break + valid_cnt += 1 + return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] + if difficult is not None else None) + + +class DetectionMAP(object): + """ + Calculate detection mean average precision. + Currently support two types: 11point and integral + + Args: + class_num (int): The class number. + overlap_thresh (float): The threshold of overlap + ratio between prediction bounding box and + ground truth bounding box for deciding + true/false positive. 
Default 0.5. + map_type (str): Calculation method of mean average + precision, currently support '11point' and + 'integral'. Default '11point'. + is_bbox_normalized (bool): Whether bounding boxes + is normalized to range[0, 1]. Default False. + evaluate_difficult (bool): Whether to evaluate + difficult bounding boxes. Default False. + catid2name (dict): Mapping between category id and category name. + classwise (bool): Whether per-category AP and draw + P-R Curve or not. + """ + + def __init__(self, + class_num, + overlap_thresh=0.5, + map_type='11point', + is_bbox_normalized=False, + evaluate_difficult=False, + catid2name=None, + classwise=False): + self.class_num = class_num + self.overlap_thresh = overlap_thresh + assert map_type in ['11point', 'integral'], \ + "map_type currently only support '11point' "\ + "and 'integral'" + self.map_type = map_type + self.is_bbox_normalized = is_bbox_normalized + self.evaluate_difficult = evaluate_difficult + self.classwise = classwise + self.classes = [] + for cname in catid2name.values(): + self.classes.append(cname) + self.reset() + + def update(self, bbox, score, label, gt_box, gt_label, difficult=None): + """ + Update metric statics from given prediction and ground + truth infomations. + """ + if difficult is None: + difficult = np.zeros_like(gt_label) + + # record class gt count + for gtl, diff in zip(gt_label, difficult): + if self.evaluate_difficult or int(diff) == 0: + self.class_gt_counts[int(np.array(gtl))] += 1 + + # record class score positive + visited = [False] * len(gt_label) + for b, s, l in zip(bbox, score, label): + pred = b.tolist() if isinstance(b, np.ndarray) else b + max_idx = -1 + max_overlap = -1.0 + for i, gl in enumerate(gt_label): + if int(gl) == int(l): + if len(gt_box[i]) == 8: + overlap = calc_rbox_iou(pred, gt_box[i]) + else: + overlap = jaccard_overlap(pred, gt_box[i], + self.is_bbox_normalized) + if overlap > max_overlap: + max_overlap = overlap + max_idx = i + + if max_overlap > self.overlap_thresh: + if self.evaluate_difficult or \ + int(np.array(difficult[max_idx])) == 0: + if not visited[max_idx]: + self.class_score_poss[int(l)].append([s, 1.0]) + visited[max_idx] = True + else: + self.class_score_poss[int(l)].append([s, 0.0]) + else: + self.class_score_poss[int(l)].append([s, 0.0]) + + def reset(self): + """ + Reset metric statics + """ + self.class_score_poss = [[] for _ in range(self.class_num)] + self.class_gt_counts = [0] * self.class_num + self.mAP = 0.0 + + def accumulate(self): + """ + Accumulate metric results and calculate mAP + """ + mAP = 0. + valid_cnt = 0 + eval_results = [] + for score_pos, count in zip(self.class_score_poss, + self.class_gt_counts): + if count == 0: continue + if len(score_pos) == 0: + valid_cnt += 1 + continue + + accum_tp_list, accum_fp_list = \ + self._get_tp_fp_accum(score_pos) + precision = [] + recall = [] + for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): + precision.append(float(ac_tp) / (ac_tp + ac_fp)) + recall.append(float(ac_tp) / count) + + one_class_ap = 0.0 + if self.map_type == '11point': + max_precisions = [0.] * 11 + start_idx = len(precision) - 1 + for j in range(10, -1, -1): + for i in range(start_idx, -1, -1): + if recall[i] < float(j) / 10.: + start_idx = i + if j > 0: + max_precisions[j - 1] = max_precisions[j] + break + else: + if max_precisions[j] < precision[i]: + max_precisions[j] = precision[i] + one_class_ap = sum(max_precisions) / 11. + mAP += one_class_ap + valid_cnt += 1 + elif self.map_type == 'integral': + import math + prev_recall = 0. 
+ for i in range(len(precision)): + recall_gap = math.fabs(recall[i] - prev_recall) + if recall_gap > 1e-6: + one_class_ap += precision[i] * recall_gap + prev_recall = recall[i] + mAP += one_class_ap + valid_cnt += 1 + else: + logger.error("Unspported mAP type {}".format(self.map_type)) + sys.exit(1) + eval_results.append({ + 'class': self.classes[valid_cnt - 1], + 'ap': one_class_ap, + 'precision': precision, + 'recall': recall, + }) + self.eval_results = eval_results + self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP + + def get_map(self): + """ + Get mAP result + """ + if self.mAP is None: + logger.error("mAP is not calculated.") + if self.classwise: + # Compute per-category AP and PR curve + try: + from terminaltables import AsciiTable + except Exception as e: + logger.error( + 'terminaltables not found, plaese install terminaltables. ' + 'for example: `pip install terminaltables`.') + raise e + results_per_category = [] + for eval_result in self.eval_results: + results_per_category.append( + (str(eval_result['class']), + '{:0.3f}'.format(float(eval_result['ap'])))) + draw_pr_curve( + eval_result['precision'], + eval_result['recall'], + out_dir='voc_pr_curve', + file_name='{}_precision_recall_curve.jpg'.format( + eval_result['class'])) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(* [ + results_flatten[i::num_columns] for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('Per-category of VOC AP: \n{}'.format(table.table)) + logger.info( + "per-category PR curve has output to voc_pr_curve folder.") + return self.mAP + + def _get_tp_fp_accum(self, score_pos_list): + """ + Calculate accumulating true/false positive results from + [score, pos] records + """ + sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) + accum_tp = 0 + accum_fp = 0 + accum_tp_list = [] + accum_fp_list = [] + for (score, pos) in sorted_list: + accum_tp += int(pos) + accum_tp_list.append(accum_tp) + accum_fp += 1 - int(pos) + accum_fp_list.append(accum_fp) + return accum_tp_list, accum_fp_list diff --git a/ppdet/metrics/metrics.py b/ppdet/metrics/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..20af0f7bcdde4439ea19d3b4c80c9abddcf7fe10 --- /dev/null +++ b/ppdet/metrics/metrics.py @@ -0,0 +1,287 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
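A small standalone sketch of the '11point' average precision computed by `DetectionMAP.accumulate` above: for each recall threshold r in {0, 0.1, ..., 1.0}, take the maximum precision among points whose recall reaches r, then average the eleven values. The reverse-index loop in the class computes the same quantity incrementally; this version just states the definition directly.

def eleven_point_ap(precision, recall):
    """11-point interpolated AP from paired precision/recall lists."""
    ap = 0.0
    for j in range(11):
        r = j / 10.0
        # max precision over all points whose recall reaches the threshold
        candidates = [p for p, rc in zip(precision, recall) if rc >= r]
        ap += max(candidates) if candidates else 0.0
    return ap / 11.0

# toy usage: precision decays as recall grows
precision = [1.0, 1.0, 0.67, 0.75, 0.6]
recall = [0.2, 0.4, 0.4, 0.6, 0.6]
print('AP(11point) = {:.3f}'.format(eleven_point_ap(precision, recall)))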
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import json +import paddle +import numpy as np +import typing +from collections import defaultdict +from pathlib import Path + +from .map_utils import prune_zero_padding, DetectionMAP +from .coco_utils import get_infer_results, cocoapi_eval +from ppdet.data.source.category import get_categories + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['Metric', 'COCOMetric', 'VOCMetric', 'get_infer_results'] + + +class Metric(paddle.metric.Metric): + def name(self): + return self.__class__.__name__ + + def reset(self): + pass + + def accumulate(self): + pass + + # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` + # :metch:`reset`, in ppdet, we also need following 2 methods: + + # abstract method for logging metric results + def log(self): + pass + + # abstract method for getting metric results + def get_results(self): + pass + + +class COCOMetric(Metric): + def __init__(self, anno_file, **kwargs): + self.anno_file = anno_file + self.clsid2catid = kwargs.get('clsid2catid', None) + if self.clsid2catid is None: + self.clsid2catid, _ = get_categories('COCO', anno_file) + self.classwise = kwargs.get('classwise', False) + self.output_eval = kwargs.get('output_eval', None) + # TODO: bias should be unified + self.bias = kwargs.get('bias', 0) + self.save_prediction_only = kwargs.get('save_prediction_only', False) + self.iou_type = kwargs.get('IouType', 'bbox') + + if not self.save_prediction_only: + assert os.path.isfile(anno_file), \ + "anno_file {} not a file".format(anno_file) + + if self.output_eval is not None: + Path(self.output_eval).mkdir(exist_ok=True) + + self.reset() + + def reset(self): + # only bbox and mask evaluation support currently + self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} + self.eval_results = {} + + def update(self, inputs, outputs): + outs = {} + # outputs Tensor -> numpy.ndarray + for k, v in outputs.items(): + outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v + + # multi-scale inputs: all inputs have same im_id + if isinstance(inputs, typing.Sequence): + im_id = inputs[0]['im_id'] + else: + im_id = inputs['im_id'] + outs['im_id'] = im_id.numpy() if isinstance(im_id, + paddle.Tensor) else im_id + + infer_results = get_infer_results( + outs, self.clsid2catid, bias=self.bias) + self.results['bbox'] += infer_results[ + 'bbox'] if 'bbox' in infer_results else [] + self.results['mask'] += infer_results[ + 'mask'] if 'mask' in infer_results else [] + self.results['segm'] += infer_results[ + 'segm'] if 'segm' in infer_results else [] + self.results['keypoint'] += infer_results[ + 'keypoint'] if 'keypoint' in infer_results else [] + + def accumulate(self): + if len(self.results['bbox']) > 0: + output = "bbox.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results['bbox'], f) + logger.info('The bbox result is saved to bbox.json.') + + if self.save_prediction_only: + logger.info('The bbox result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + bbox_stats = cocoapi_eval( + output, + 'bbox', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['bbox'] = bbox_stats + sys.stdout.flush() + + if len(self.results['mask']) > 0: + output = "mask.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 
'w') as f: + json.dump(self.results['mask'], f) + logger.info('The mask result is saved to mask.json.') + + if self.save_prediction_only: + logger.info('The mask result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + seg_stats = cocoapi_eval( + output, + 'segm', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['mask'] = seg_stats + sys.stdout.flush() + + if len(self.results['segm']) > 0: + output = "segm.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results['segm'], f) + logger.info('The segm result is saved to segm.json.') + + if self.save_prediction_only: + logger.info('The segm result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + seg_stats = cocoapi_eval( + output, + 'segm', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['mask'] = seg_stats + sys.stdout.flush() + + def log(self): + pass + + def get_results(self): + return self.eval_results + + +class VOCMetric(Metric): + def __init__(self, + label_list, + class_num=20, + overlap_thresh=0.5, + map_type='11point', + is_bbox_normalized=False, + evaluate_difficult=False, + classwise=False, + output_eval=None, + save_prediction_only=False): + assert os.path.isfile(label_list), \ + "label_list {} not a file".format(label_list) + self.clsid2catid, self.catid2name = get_categories('VOC', label_list) + + self.overlap_thresh = overlap_thresh + self.map_type = map_type + self.evaluate_difficult = evaluate_difficult + self.output_eval = output_eval + self.save_prediction_only = save_prediction_only + self.detection_map = DetectionMAP( + class_num=class_num, + overlap_thresh=overlap_thresh, + map_type=map_type, + is_bbox_normalized=is_bbox_normalized, + evaluate_difficult=evaluate_difficult, + catid2name=self.catid2name, + classwise=classwise) + + self.reset() + + def reset(self): + self.results = {'bbox': [], 'score': [], 'label': []} + self.detection_map.reset() + + def update(self, inputs, outputs): + bbox_np = outputs['bbox'].numpy() if isinstance( + outputs['bbox'], paddle.Tensor) else outputs['bbox'] + bboxes = bbox_np[:, 2:] + scores = bbox_np[:, 1] + labels = bbox_np[:, 0] + bbox_lengths = outputs['bbox_num'].numpy() if isinstance( + outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num'] + + self.results['bbox'].append(bboxes.tolist()) + self.results['score'].append(scores.tolist()) + self.results['label'].append(labels.tolist()) + + if bboxes.shape == (1, 1) or bboxes is None: + return + if self.save_prediction_only: + return + + gt_boxes = inputs['gt_bbox'] + gt_labels = inputs['gt_class'] + difficults = inputs['difficult'] if not self.evaluate_difficult \ + else None + + if 'scale_factor' in inputs: + scale_factor = inputs['scale_factor'].numpy() if isinstance( + inputs['scale_factor'], + paddle.Tensor) else inputs['scale_factor'] + else: + scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') + + bbox_idx = 0 + for i in range(len(gt_boxes)): + gt_box = gt_boxes[i].numpy() if isinstance( + gt_boxes[i], paddle.Tensor) else gt_boxes[i] + h, w = scale_factor[i] + gt_box = gt_box / np.array([w, h, w, h]) + gt_label = gt_labels[i].numpy() if isinstance( + gt_labels[i], paddle.Tensor) else gt_labels[i] + if difficults is not None: + difficult = difficults[i].numpy() if isinstance( + difficults[i], paddle.Tensor) else difficults[i] + else: + difficult = None + bbox_num = bbox_lengths[i] + bbox = bboxes[bbox_idx:bbox_idx + bbox_num] + 
score = scores[bbox_idx:bbox_idx + bbox_num] + label = labels[bbox_idx:bbox_idx + bbox_num] + gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, + difficult) + self.detection_map.update(bbox, score, label, gt_box, gt_label, + difficult) + bbox_idx += bbox_num + + def accumulate(self): + output = "bbox.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results, f) + logger.info('The bbox result is saved to bbox.json.') + if self.save_prediction_only: + return + + logger.info("Accumulating evaluatation results...") + self.detection_map.accumulate() + + def log(self): + map_stat = 100. * self.detection_map.get_map() + logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, + self.map_type, map_stat)) + + def get_results(self): + return {'bbox': [self.detection_map.get_map()]} diff --git a/ppdet/model_zoo/.gitignore b/ppdet/model_zoo/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f296851d6dae0aa69eb0954ad59c095850b135ba --- /dev/null +++ b/ppdet/model_zoo/.gitignore @@ -0,0 +1 @@ +MODEL_ZOO diff --git a/ppdet/model_zoo/__init__.py b/ppdet/model_zoo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6db6eb6c6da542405cd3c61ee991b04530c7b3a9 --- /dev/null +++ b/ppdet/model_zoo/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import model_zoo +from .model_zoo import * + +__all__ = model_zoo.__all__ diff --git a/ppdet/model_zoo/model_zoo.py b/ppdet/model_zoo/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..27581ef793dee60e0661f3b2fb69d9b4421ec1a5 --- /dev/null +++ b/ppdet/model_zoo/model_zoo.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
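The metric classes above are all driven through the same reset/update/accumulate/log cycle that `_eval_with_loader` uses. A minimal sketch of that contract with a stub metric; the stub class itself is illustrative, only the call pattern mirrors how COCOMetric and VOCMetric are exercised.

class CountingMetric:
    """Stub exposing the same interface as ppdet's Metric subclasses."""

    def reset(self):
        self.samples = 0

    def update(self, inputs, outputs):
        # real metrics match outputs['bbox'] against ground truth in `inputs` here
        self.samples += len(outputs.get('bbox', []))

    def accumulate(self):
        self.accumulated = self.samples

    def log(self):
        print('saw {} predicted boxes'.format(self.accumulated))

    def get_results(self):
        return {'bbox': [self.accumulated]}

# evaluation loop shape: update once per batch, then accumulate and log once
metric = CountingMetric()
metric.reset()
for outputs in ({'bbox': [[0, 0.9, 1, 2, 3, 4]]}, {'bbox': []}):
    metric.update(inputs={}, outputs=outputs)
metric.accumulate()
metric.log()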
+ +import os.path as osp +import pkg_resources + +try: + from collections.abc import Sequence +except: + from collections import Sequence + +from ppdet.core.workspace import load_config, create +from ppdet.utils.checkpoint import load_weight +from ppdet.utils.download import get_config_path + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'list_model', 'get_config_file', 'get_weights_url', 'get_model', + 'MODEL_ZOO_FILENAME' +] + +MODEL_ZOO_FILENAME = 'MODEL_ZOO' + + +def list_model(filters=[]): + model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo', + MODEL_ZOO_FILENAME) + with open(model_zoo_file) as f: + model_names = f.read().splitlines() + + # filter model_name + def filt(name): + for f in filters: + if name.find(f) < 0: + return False + return True + + if isinstance(filters, str) or not isinstance(filters, Sequence): + filters = [filters] + model_names = [name for name in model_names if filt(name)] + if len(model_names) == 0 and len(filters) > 0: + raise ValueError("no model found, please check filters seeting, " + "filters can be set as following kinds:\n" + "\tDataset: coco, voc ...\n" + "\tArchitecture: yolo, rcnn, ssd ...\n" + "\tBackbone: resnet, vgg, darknet ...\n") + + model_str = "Available Models:\n" + for model_name in model_names: + model_str += "\t{}\n".format(model_name) + logger.info(model_str) + + +# models and configs save on bcebos under dygraph directory +def get_config_file(model_name): + return get_config_path("ppdet://configs/{}.yml".format(model_name)) + + +def get_weights_url(model_name): + return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1]) + + +def get_model(model_name, pretrained=True): + cfg_file = get_config_file(model_name) + cfg = load_config(cfg_file) + model = create(cfg.architecture) + + if pretrained: + load_weight(model, get_weights_url(model_name)) + + return model diff --git a/ppdet/model_zoo/tests/__init__.py b/ppdet/model_zoo/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f0ea85344b7e0c679730356928c8749cf71cd66 --- /dev/null +++ b/ppdet/model_zoo/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/model_zoo/tests/test_get_model.py b/ppdet/model_zoo/tests/test_get_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8887185e0ca2f6c8edc020be2b92b47c9933d604 --- /dev/null +++ b/ppdet/model_zoo/tests/test_get_model.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import paddle +import ppdet +import unittest + +# NOTE: weights downloading costs time, we choose +# a small model for unittesting +MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco' + + +class TestGetConfigFile(unittest.TestCase): + def test_main(self): + try: + cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME) + assert os.path.isfile(cfg_file) + except: + self.assertTrue(False) + + +class TestGetModel(unittest.TestCase): + def test_main(self): + try: + model = ppdet.model_zoo.get_model(MODEL_NAME) + assert isinstance(model, paddle.nn.Layer) + except: + self.assertTrue(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/model_zoo/tests/test_list_model.py b/ppdet/model_zoo/tests/test_list_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8f91afe0058ff32fae7e1006bb8b4c4de9500fef --- /dev/null +++ b/ppdet/model_zoo/tests/test_list_model.py @@ -0,0 +1,68 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import ppdet + + +class TestListModel(unittest.TestCase): + def setUp(self): + self._filter = [] + + def test_main(self): + try: + ppdet.model_zoo.list_model(self._filter) + self.assertTrue(True) + except: + self.assertTrue(False) + + +class TestListModelYOLO(TestListModel): + def setUp(self): + self._filter = ['yolo'] + + +class TestListModelRCNN(TestListModel): + def setUp(self): + self._filter = ['rcnn'] + + +class TestListModelSSD(TestListModel): + def setUp(self): + self._filter = ['ssd'] + + +class TestListModelMultiFilter(TestListModel): + def setUp(self): + self._filter = ['yolo', 'darknet'] + + +class TestListModelError(unittest.TestCase): + def setUp(self): + self._filter = ['xxx'] + + def test_main(self): + try: + ppdet.model_zoo.list_model(self._filter) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d803fa83bc144fb3624591d2646cc31729be403d --- /dev/null +++ b/ppdet/modeling/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +warnings.filterwarnings( + action='ignore', category=DeprecationWarning, module='ops') + +from . import ops +from . import backbones +from . import necks +from . import proposal_generator +from . import heads +from . import losses +from . import architectures +from . import post_process +from . import layers +from . import assigners +from . import transformers + +from .ops import * +from .backbones import * +from .necks import * +from .proposal_generator import * +from .heads import * +from .losses import * +from .architectures import * +from .post_process import * +from .layers import * +from .assigners import * +from .transformers import * diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f89258d1a94e7e4f7368a9e3a3e090f910406386 --- /dev/null +++ b/ppdet/modeling/architectures/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import meta_arch +from . import yolo +from . import ppyoloe +from . import yolox +from . import yolof +from . import yolov5 +from . import yolov6 +from . import yolov7 +from . import yolov8 +from . import rtmdet +from . import detr + +from .meta_arch import * +from .yolo import * +from .ppyoloe import * +from .yolox import * +from .yolof import * +from .yolov5 import * +from .yolov6 import * +from .yolov7 import * +from .yolov8 import * +from .rtmdet import * +from .detr import * diff --git a/ppdet/modeling/architectures/detr.py b/ppdet/modeling/architectures/detr.py new file mode 100644 index 0000000000000000000000000000000000000000..7839a1263ffc02a97edf231ff44395c8960a2ec9 --- /dev/null +++ b/ppdet/modeling/architectures/detr.py @@ -0,0 +1,116 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
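The architectures registered in this package (DETR, PPYOLOE, the YOLO variants, ...) are instantiated through the same workspace pattern already used by `model_zoo.get_model` above: load a YAML config, then `create()` the class named by `cfg.architecture`, which builds its backbone, neck and head via `from_config`. A hedged usage sketch; the config path and weights URL are illustrative placeholders, not guaranteed to exist in every checkout.

from ppdet.core.workspace import load_config, create
from ppdet.utils.checkpoint import load_weight

# illustrative path; substitute a config shipped with your checkout
cfg = load_config('configs/detr/detr_r50_1x_coco.yml')

# create() looks up the registered architecture (e.g. 'DETR') and calls its
# from_config() to compose backbone, neck, transformer and head
model = create(cfg.architecture)

# optionally load pretrained weights, the same way get_model() does
# load_weight(model, 'ppdet://models/detr_r50_1x_coco.pdparams')
model.eval()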
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from .meta_arch import BaseArch +from ppdet.core.workspace import register, create + +__all__ = ['DETR'] +# Deformable DETR, DINO use the same architecture as DETR + + +@register +class DETR(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + __shared__ = ['with_mask', 'exclude_post_process'] + + def __init__(self, + backbone, + transformer='DETRTransformer', + detr_head='DETRHead', + neck=None, + post_process='DETRPostProcess', + with_mask=False, + exclude_post_process=False): + super(DETR, self).__init__() + self.backbone = backbone + self.transformer = transformer + self.detr_head = detr_head + self.neck = neck + self.post_process = post_process + self.with_mask = with_mask + self.exclude_post_process = exclude_post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + # neck + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None + + # transformer + if neck is not None: + kwargs = {'input_shape': neck.out_shape} + transformer = create(cfg['transformer'], **kwargs) + # head + kwargs = { + 'hidden_dim': transformer.hidden_dim, + 'nhead': transformer.nhead, + 'input_shape': backbone.out_shape + } + detr_head = create(cfg['detr_head'], **kwargs) + + return { + 'backbone': backbone, + 'transformer': transformer, + "detr_head": detr_head, + "neck": neck + } + + def _forward(self): + # Backbone + body_feats = self.backbone(self.inputs) + + # Neck + if self.neck is not None: + body_feats = self.neck(body_feats) + + # Transformer + pad_mask = self.inputs.get('pad_mask', None) + out_transformer = self.transformer(body_feats, pad_mask, self.inputs) + + # DETR Head + if self.training: + detr_losses = self.detr_head(out_transformer, body_feats, + self.inputs) + detr_losses.update({ + 'loss': paddle.add_n( + [v for k, v in detr_losses.items() if 'log' not in k]) + }) + return detr_losses + else: + preds = self.detr_head(out_transformer, body_feats) + if self.exclude_post_process: + bbox, bbox_num, mask = preds + else: + bbox, bbox_num, mask = self.post_process( + preds, self.inputs['im_shape'], self.inputs['scale_factor'], + paddle.shape(self.inputs['image'])[2:]) + + output = {'bbox': bbox, 'bbox_num': bbox_num} + if self.with_mask: + output['mask'] = mask + return output + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/meta_arch.py b/ppdet/modeling/architectures/meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..370b2b124bfc1f5477a942f972731f2857e1641c --- /dev/null +++ b/ppdet/modeling/architectures/meta_arch.py @@ -0,0 +1,132 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +import typing + +from ppdet.core.workspace import register +from ppdet.modeling.post_process import nms + +__all__ = ['BaseArch'] + + +@register +class BaseArch(nn.Layer): + def __init__(self, data_format='NCHW', use_extra_data=False): + super(BaseArch, self).__init__() + self.data_format = data_format + self.inputs = {} + self.fuse_norm = False + self.use_extra_data = use_extra_data + + def load_meanstd(self, cfg_transform): + scale = 1. 
+ mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + std = np.array([0.229, 0.224, 0.225], dtype=np.float32) + for item in cfg_transform: + if 'NormalizeImage' in item: + mean = np.array( + item['NormalizeImage']['mean'], dtype=np.float32) + std = np.array(item['NormalizeImage']['std'], dtype=np.float32) + if item['NormalizeImage'].get('is_scale', True): + scale = 1. / 255. + break + if self.data_format == 'NHWC': + self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) + self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) + else: + self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) + self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) + + def forward(self, inputs): + if self.data_format == 'NHWC': + image = inputs['image'] + inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) + + if self.fuse_norm: + image = inputs['image'] + self.inputs['image'] = image * self.scale + self.bias + self.inputs['im_shape'] = inputs['im_shape'] + self.inputs['scale_factor'] = inputs['scale_factor'] + else: + self.inputs = inputs + + self.model_arch() + + if self.training: + out = self.get_loss() + else: + inputs_list = [] + # multi-scale input + if not isinstance(inputs, typing.Sequence): + inputs_list.append(inputs) + else: + inputs_list.extend(inputs) + outs = [] + for inp in inputs_list: + if self.fuse_norm: + self.inputs['image'] = inp['image'] * self.scale + self.bias + self.inputs['im_shape'] = inp['im_shape'] + self.inputs['scale_factor'] = inp['scale_factor'] + else: + self.inputs = inp + outs.append(self.get_pred()) + + # multi-scale test + if len(outs) > 1: + out = self.merge_multi_scale_predictions(outs) + else: + out = outs[0] + return out + + def merge_multi_scale_predictions(self, outs): + # default values for architectures not included in following list + num_classes = 80 + nms_threshold = 0.5 + keep_top_k = 100 + + if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'): + num_classes = self.bbox_head.num_classes + keep_top_k = self.bbox_post_process.nms.keep_top_k + nms_threshold = self.bbox_post_process.nms.nms_threshold + else: + raise Exception( + "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now" + ) + + final_boxes = [] + all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy() + for c in range(num_classes): + idxs = all_scale_outs[:, 0] == c + if np.count_nonzero(idxs) == 0: + continue + r = nms(all_scale_outs[idxs, 1:], nms_threshold) + final_boxes.append( + np.concatenate([np.full((r.shape[0], 1), c), r], 1)) + out = np.concatenate(final_boxes) + out = np.concatenate(sorted( + out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6)) + out = { + 'bbox': paddle.to_tensor(out), + 'bbox_num': paddle.to_tensor(np.array([out.shape[0], ])) + } + + return out + + def build_inputs(self, data, input_def): + inputs = {} + for i, k in enumerate(input_def): + inputs[k] = data[i] + return inputs + + def model_arch(self, ): + pass + + def get_loss(self, ): + raise NotImplementedError("Should implement get_loss method!") + + def get_pred(self, ): + raise NotImplementedError("Should implement get_pred method!") diff --git a/ppdet/modeling/architectures/ppyoloe.py b/ppdet/modeling/architectures/ppyoloe.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d9c9d3276c5c0942648bef002a83262071e93f --- /dev/null +++ b/ppdet/modeling/architectures/ppyoloe.py @@ -0,0 +1,246 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead'] +# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head +# PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head + + +@register +class PPYOLOE(BaseArch): + """ + PPYOLOE network, see https://arxiv.org/abs/2203.16250 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + post_process (object): `BBoxPostProcess` instance + ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod) + for_distill (bool): whether for distillation + feat_distill_place (str): distill which feature for distillation + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. + """ + + __category__ = 'architecture' + __shared__ = ['for_distill'] + __inject__ = ['post_process', 'ssod_loss'] + + def __init__(self, + backbone='CSPResNet', + neck='CustomCSPPAN', + yolo_head='PPYOLOEHead', + post_process='BBoxPostProcess', + ssod_loss='SSODPPYOLOELoss', + for_distill=False, + feat_distill_place='neck_feats', + for_mot=False): + super(PPYOLOE, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + # for ssod, semi-det + self.is_teacher = False + self.ssod_loss = ssod_loss + + # distill + self.for_distill = for_distill + self.feat_distill_place = feat_distill_place + if for_distill: + assert feat_distill_place in ['backbone_feats', 'neck_feats'] + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det + if self.training or self.is_teacher: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + + if self.for_distill: + if self.feat_distill_place == 'backbone_feats': + self.yolo_head.distill_pairs['backbone_feats'] = body_feats + elif self.feat_distill_place == 'neck_feats': + self.yolo_head.distill_pairs['neck_feats'] = neck_feats + else: + raise ValueError + return yolo_losses + else: + + yolo_head_outs = self.yolo_head(neck_feats) + + if self.post_process is not None: + bbox, bbox_num, nms_keep_idx = self.post_process( + yolo_head_outs, 
self.yolo_head.mask_anchors, + self.inputs['im_shape'], self.inputs['scale_factor']) + + else: + bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( + yolo_head_outs, self.inputs['scale_factor']) + + if self.use_extra_data: + extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx + """extra_data:{ + 'scores': predict scores, + 'nms_keep_idx': bbox index before nms, + } + """ + extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) + extra_data['nms_keep_idx'] = nms_keep_idx + output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} + else: + output = {'bbox': bbox, 'bbox_num': bbox_num} + + return output + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() + + def get_loss_keys(self): + return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast'] + + def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): + ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, + train_cfg) + return ssod_losses + + +@register +class PPYOLOEWithAuxHead(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='CSPResNet', + neck='CustomCSPPAN', + yolo_head='PPYOLOEHead', + aux_head='SimpleConvHead', + post_process='BBoxPostProcess', + for_mot=False, + detach_epoch=5): + """ + PPYOLOE network, see https://arxiv.org/abs/2203.16250 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + post_process (object): `BBoxPostProcess` instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. + """ + super(PPYOLOEWithAuxHead, self).__init__() + self.backbone = backbone + self.neck = neck + self.aux_neck = copy.deepcopy(self.neck) + + self.yolo_head = yolo_head + self.aux_head = aux_head + self.post_process = post_process + self.for_mot = for_mot + self.detach_epoch = detach_epoch + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + aux_neck = copy.deepcopy(neck) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + aux_head = create(cfg['aux_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + 'aux_head': aux_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + if self.inputs['epoch_id'] >= self.detach_epoch: + aux_neck_feats = self.aux_neck([f.detach() for f in body_feats]) + dual_neck_feats = (paddle.concat( + [f.detach(), aux_f], axis=1) for f, aux_f in + zip(neck_feats, aux_neck_feats)) + else: + aux_neck_feats = self.aux_neck(body_feats) + dual_neck_feats = (paddle.concat( + [f, aux_f], axis=1) for f, aux_f in + zip(neck_feats, aux_neck_feats)) + aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats) + loss = self.yolo_head( + neck_feats, + self.inputs, + aux_pred=[aux_cls_scores, aux_bbox_preds]) + return loss + else: + yolo_head_outs = self.yolo_head(neck_feats) + if self.post_process is not None: + bbox, bbox_num = self.post_process( + yolo_head_outs, self.yolo_head.mask_anchors, + self.inputs['im_shape'], self.inputs['scale_factor']) + else: + bbox, bbox_num = self.yolo_head.post_process( + yolo_head_outs, 
self.inputs['scale_factor']) + output = {'bbox': bbox, 'bbox_num': bbox_num} + + return output + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/rtmdet.py b/ppdet/modeling/architectures/rtmdet.py new file mode 100644 index 0000000000000000000000000000000000000000..f4f7dc82f392fb5ad625aa0778784dc26c07c542 --- /dev/null +++ b/ppdet/modeling/architectures/rtmdet.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['RTMDet'] + + +@register +class RTMDet(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='CSPNeXt', + neck='CSPNeXtPAFPN', + head='RTMDetHead', + post_process='BBoxPostProcess', + for_mot=False): + """ + RTMDet see https://arxiv.org/abs/ + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + head (nn.Layer): head instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(RTMDet, self).__init__() + self.backbone = backbone + self.neck = neck + self.head = head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + head = create(cfg['head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "head": head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.head(neck_feats) + post_outs = self.head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolo.py b/ppdet/modeling/architectures/yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..fd030852f5ac6f9b7eb911e3ad62e453b9d14a7f --- /dev/null +++ b/ppdet/modeling/architectures/yolo.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv3'] +# YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3 +# PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py + + +@register +class YOLOv3(BaseArch): + __category__ = 'architecture' + __shared__ = ['data_format'] + __inject__ = ['post_process'] + + def __init__(self, + backbone='DarkNet', + neck='YOLOv3FPN', + yolo_head='YOLOv3Head', + post_process='BBoxPostProcess', + data_format='NCHW', + for_mot=False): + """ + YOLOv3 network, see https://arxiv.org/abs/1804.02767 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + post_process (object): `BBoxPostProcess` instance + data_format (str): data format, NCHW or NHWC + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOv3, self).__init__(data_format=data_format) + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.yolo_head(neck_feats) + if self.post_process is not None: + # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors + bbox, bbox_num = self.post_process( + yolo_head_outs, self.yolo_head.mask_anchors, + self.inputs['im_shape'], self.inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + else: + # anchor free YOLOs: PP-YOLOE + post_outs = self.yolo_head.post_process( + yolo_head_outs, self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolof.py b/ppdet/modeling/architectures/yolof.py new file mode 100644 index 0000000000000000000000000000000000000000..b6a2920529e7203f88ae9150f6f3e014cc36cab0 --- /dev/null +++ b/ppdet/modeling/architectures/yolof.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOF'] + + +@register +class YOLOF(BaseArch): + __category__ = 'architecture' + + def __init__(self, + backbone='ResNet', + neck='DilatedEncoder', + head='YOLOFHead', + for_mot=False): + """ + YOLOF network, see https://arxiv.org/abs/2103.09460 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): DilatedEncoder instance + head (nn.Layer): YOLOFHead instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOF, self).__init__() + self.backbone = backbone + self.neck = neck + self.head = head + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + head = create(cfg['head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "head": head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.head(neck_feats) + bbox, bbox_num = self.head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + output = {'bbox': bbox, 'bbox_num': bbox_num} + return output + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolov5.py b/ppdet/modeling/architectures/yolov5.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d6909980c9eafd232d95c05c5e5b1268105336 --- /dev/null +++ b/ppdet/modeling/architectures/yolov5.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv5'] + + +@register +class YOLOv5(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='CSPDarkNet', + neck='YOLOCSPPAN', + yolo_head='YOLOv5Head', + post_process='BBoxPostProcess', + for_mot=False): + """ + YOLOv5, YOLOv6(https://arxiv.org/abs/2209.02976) and YOLOv7(https://arxiv.org/abs/2207.02696) + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOv5, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.yolo_head(neck_feats) + post_outs = self.yolo_head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolov6.py b/ppdet/modeling/architectures/yolov6.py new file mode 100644 index 0000000000000000000000000000000000000000..5df5e2e436951310a15378ae21b55bbe06c7c8a3 --- /dev/null +++ b/ppdet/modeling/architectures/yolov6.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv6'] + + +@register +class YOLOv6(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='EfficientRep', + neck='RepBiFPAN', + yolo_head='EffiDeHead', + post_process='BBoxPostProcess', + for_mot=False): + """ + YOLOv6(https://arxiv.org/abs/2209.02976, https://arxiv.org/abs/2301.05586) + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): head instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOv6, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.yolo_head(neck_feats) + post_outs = self.yolo_head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolov7.py b/ppdet/modeling/architectures/yolov7.py new file mode 100644 index 0000000000000000000000000000000000000000..1a34b1b08aadc5280b1b1370c58b1b6ba015897d --- /dev/null +++ b/ppdet/modeling/architectures/yolov7.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv7'] + + +@register +class YOLOv7(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='ELANNet', + neck='ELANFPN', + yolo_head='YOLOv7Head', + post_process='BBoxPostProcess', + for_mot=False): + """ + YOLOv7(https://arxiv.org/abs/2207.02696) + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOv7, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.yolo_head(neck_feats) + post_outs = self.yolo_head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolov8.py b/ppdet/modeling/architectures/yolov8.py new file mode 100644 index 0000000000000000000000000000000000000000..19c65362581de6ee6b18588d007ddbe3617b9462 --- /dev/null +++ b/ppdet/modeling/architectures/yolov8.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv8'] + + +@register +class YOLOv8(BaseArch): + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='YOLOv8CSPDarkNet', + neck='YOLOCSPPAN', + yolo_head='YOLOv8Head', + post_process='BBoxPostProcess', + for_mot=False): + """ + YOLOv8 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + for_mot (bool): whether return other features for multi-object tracking + models, default False in pure object detection models. 
+ """ + super(YOLOv8, self).__init__() + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + self.for_mot = for_mot + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolo_losses = self.yolo_head(neck_feats, self.inputs) + return yolo_losses + else: + yolo_head_outs = self.yolo_head(neck_feats) + post_outs = self.yolo_head.post_process(yolo_head_outs, + self.inputs['im_shape'], + self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolox.py b/ppdet/modeling/architectures/yolox.py new file mode 100644 index 0000000000000000000000000000000000000000..cedb7ace76080cdcf2045852d5fd37a282047779 --- /dev/null +++ b/ppdet/modeling/architectures/yolox.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +import random +import paddle +import paddle.nn.functional as F + +__all__ = ['YOLOX'] + + +@register +class YOLOX(BaseArch): + """ + YOLOX network, see https://arxiv.org/abs/2107.08430 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + head (nn.Layer): head instance + for_mot (bool): whether used for MOT or not + input_size (list[int]): initial scale, will be reset by self._preprocess() + size_stride (int): stride of the size range + size_range (list[int]): multi-scale range for training + random_interval (int): interval of iter to change self._input_size + """ + __category__ = 'architecture' + + def __init__(self, + backbone='CSPDarkNet', + neck='YOLOCSPPAN', + head='YOLOXHead', + for_mot=False, + input_size=[640, 640], + size_stride=32, + size_range=[15, 25], + random_interval=10): + super(YOLOX, self).__init__() + self.backbone = backbone + self.neck = neck + self.head = head + self.for_mot = for_mot + + self.input_size = input_size + self._input_size = paddle.to_tensor(input_size) + self.size_stride = size_stride + self.size_range = size_range + self.random_interval = random_interval + self._step = 0 + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + head = create(cfg['head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "head": head, + } + + def _forward(self): + if self.training: + self._preprocess() + body_feats = self.backbone(self.inputs) + neck_feats = self.neck(body_feats, self.for_mot) + + if self.training: + yolox_losses = self.head(neck_feats, self.inputs) + yolox_losses.update({'size': self._input_size[0]}) + return yolox_losses + else: + head_outs = self.head(neck_feats) + post_outs = self.head.post_process( + head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) + + if not isinstance(post_outs, (tuple, list)): + # if set exclude_post_process, concat([pred_bboxes, pred_scores]) not scaled to origin + # export onnx as torch yolo models + return post_outs + else: + # if set exclude_nms, [pred_bboxes, pred_scores] scaled to origin + bbox, bbox_num = post_outs # default for end-to-end eval/infer + return {'bbox': bbox, 'bbox_num': bbox_num} + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() + + def _preprocess(self): + # YOLOX multi-scale training, interpolate resize before inputs of the network. 
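+        # self._get_size() refreshes self._input_size every `random_interval`
+        # steps; the batch images are then bilinearly resized to that size and
+        # the ground-truth boxes are rescaled by the same x/y factors so the
+        # targets stay aligned with the resized images.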
+ self._get_size() + scale_y = self._input_size[0] / self.input_size[0] + scale_x = self._input_size[1] / self.input_size[1] + if scale_x != 1 or scale_y != 1: + self.inputs['image'] = F.interpolate( + self.inputs['image'], + size=self._input_size, + mode='bilinear', + align_corners=False) + gt_bboxes = self.inputs['gt_bbox'] + for i in range(len(gt_bboxes)): + if len(gt_bboxes[i]) > 0: + gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x + gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y + self.inputs['gt_bbox'] = gt_bboxes + + def _get_size(self): + # random_interval = 10 as default, every 10 iters to change self._input_size + image_ratio = self.input_size[1] * 1.0 / self.input_size[0] + if self._step % self.random_interval == 0: + size_factor = random.randint(*self.size_range) + size = [ + self.size_stride * size_factor, + self.size_stride * int(size_factor * image_ratio) + ] + self._input_size = paddle.to_tensor(size) + self._step += 1 diff --git a/ppdet/modeling/assigners/__init__.py b/ppdet/modeling/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7a86c9e5cb4f707cd3e39adfa2639df6b13fee36 --- /dev/null +++ b/ppdet/modeling/assigners/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import utils +from . import task_aligned_assigner +from . import atss_assigner +from . import simota_assigner +from . import task_aligned_assigner_cr +from . import uniform_assigner + +from .utils import * +from .task_aligned_assigner import * +from .atss_assigner import * +from .simota_assigner import * +from .task_aligned_assigner_cr import * +from .uniform_assigner import * diff --git a/ppdet/modeling/assigners/atss_assigner.py b/ppdet/modeling/assigners/atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..f1aae2bebc8ec3c4b1c53c64eb3bb657f25505d5 --- /dev/null +++ b/ppdet/modeling/assigners/atss_assigner.py @@ -0,0 +1,225 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
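+# Illustrative call pattern, with placeholder tensor names (shapes follow the
+# forward() docstring below; bg_index is usually set to num_classes):
+#   assigner = ATSSAssigner(topk=9, num_classes=80)
+#   assigned_labels, assigned_bboxes, assigned_scores = assigner(
+#       anchor_bboxes, num_anchors_list, gt_labels, gt_bboxes,
+#       pad_gt_mask, bg_index)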
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ..bbox_utils import iou_similarity, batch_iou_similarity +from ..bbox_utils import bbox_center +from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, + compute_max_iou_gt) + +__all__ = ['ATSSAssigner'] + + +@register +class ATSSAssigner(nn.Layer): + """Bridging the Gap Between Anchor-based and Anchor-free Detection + via Adaptive Training Sample Selection + """ + __shared__ = ['num_classes'] + + def __init__(self, + topk=9, + num_classes=80, + force_gt_matching=False, + eps=1e-9, + sm_use=False): + super(ATSSAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.force_gt_matching = force_gt_matching + self.eps = eps + self.sm_use = sm_use + + def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, + pad_gt_mask): + gt2anchor_distances_list = paddle.split( + gt2anchor_distances, num_anchors_list, axis=-1) + num_anchors_index = np.cumsum(num_anchors_list).tolist() + num_anchors_index = [0, ] + num_anchors_index[:-1] + is_in_topk_list = [] + topk_idxs_list = [] + for distances, anchors_index in zip(gt2anchor_distances_list, + num_anchors_index): + num_anchors = distances.shape[-1] + _, topk_idxs = paddle.topk( + distances, self.topk, axis=-1, largest=False) + topk_idxs_list.append(topk_idxs + anchors_index) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( + axis=-2).astype(gt2anchor_distances.dtype) + is_in_topk_list.append(is_in_topk * pad_gt_mask) + is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) + topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) + return is_in_topk_list, topk_idxs_list + + @paddle.no_grad() + def forward(self, + anchor_bboxes, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index, + gt_scores=None, + pred_bboxes=None): + r"""This code is based on + https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py + + The assignment is done in following steps + 1. compute iou between all bbox (bbox of all pyramid levels) and gt + 2. compute center distance between all bbox and gt + 3. on each pyramid level, for each gt, select k bbox whose center + are closest to the gt center, so we total select k*l bbox as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. limit the positive sample's center in gt + 7. if an anchor box is assigned to multiple gts, the one with the + highest iou will be selected. 
+ Args: + anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), + "xmin, xmax, ymin, ymax" format + num_anchors_list (List): num of anchors in each level + gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) + gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) + pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) + bg_index (int): background index + gt_scores (Tensor|None, float32) Score of gt_bboxes, + shape(B, n, 1), if None, then it will initialize with one_hot label + pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) + Returns: + assigned_labels (Tensor): (B, L) + assigned_bboxes (Tensor): (B, L, 4) + assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious + """ + assert gt_labels.ndim == gt_bboxes.ndim and \ + gt_bboxes.ndim == 3 + + num_anchors, _ = anchor_bboxes.shape + batch_size, num_max_boxes, _ = gt_bboxes.shape + + # negative batch + if num_max_boxes == 0: + assigned_labels = paddle.full( + [batch_size, num_anchors], bg_index, dtype='int32') + assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) + assigned_scores = paddle.zeros( + [batch_size, num_anchors, self.num_classes]) + return assigned_labels, assigned_bboxes, assigned_scores + + # 1. compute iou between gt and anchor bbox, [B, n, L] + ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes) + ious = ious.reshape([batch_size, -1, num_anchors]) + + # 2. compute center distance between all anchors and gt, [B, n, L] + gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1) + anchor_centers = bbox_center(anchor_bboxes) + gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \ + .norm(2, axis=-1).reshape([batch_size, -1, num_anchors]) + + # 3. on each pyramid level, selecting topk closest candidates + # based on the center distance, [B, n, L] + is_in_topk, topk_idxs = self._gather_topk_pyramid( + gt2anchor_distances, num_anchors_list, pad_gt_mask) + + # 4. get corresponding iou for the these candidates, and compute the + # mean and std, 5. set mean + std as the iou threshold + iou_candidates = ious * is_in_topk + iou_threshold = paddle.index_sample( + iou_candidates.flatten(stop_axis=-2), + topk_idxs.flatten(stop_axis=-2)) + iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) + iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ + iou_threshold.std(axis=-1, keepdim=True) + is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, + paddle.zeros_like(is_in_topk)) + + # 6. check the positive sample's center in gt, [B, n, L] + if self.sm_use: + is_in_gts = check_points_inside_bboxes( + anchor_centers, gt_bboxes, sm_use=True) + else: + is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) + + # select positive sample, [B, n, L] + mask_positive = is_in_topk * is_in_gts * pad_gt_mask + + # 7. if an anchor box is assigned to multiple gts, + # the one with the highest iou will be selected. + mask_positive_sum = mask_positive.sum(axis=-2) + if mask_positive_sum.max() > 1: + mask_multiple_gts = ( + mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile( + [1, num_max_boxes, 1]).astype('bool') + if self.sm_use: + is_max_iou = compute_max_iou_anchor(ious * mask_positive) + else: + is_max_iou = compute_max_iou_anchor(ious) + mask_positive = paddle.where(mask_multiple_gts, is_max_iou, + mask_positive) + mask_positive_sum = mask_positive.sum(axis=-2) + # 8. 
make sure every gt_bbox matches the anchor + if self.force_gt_matching: + is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask + mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile( + [1, num_max_boxes, 1]) + mask_positive = paddle.where(mask_max_iou, is_max_iou, + mask_positive) + mask_positive_sum = mask_positive.sum(axis=-2) + assigned_gt_index = mask_positive.argmax(axis=-2) + + # assigned target + batch_ind = paddle.arange( + end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) + assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes + assigned_labels = paddle.gather( + gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) + assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) + assigned_labels = paddle.where( + mask_positive_sum > 0, assigned_labels, + paddle.full_like(assigned_labels, bg_index)) + + assigned_bboxes = paddle.gather( + gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) + + assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) + ind = list(range(self.num_classes + 1)) + ind.remove(bg_index) + assigned_scores = paddle.index_select( + assigned_scores, paddle.to_tensor(ind), axis=-1) + if pred_bboxes is not None: + # assigned iou + ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive + ious = ious.max(axis=-2).unsqueeze(-1) + assigned_scores *= ious + elif gt_scores is not None: + gather_scores = paddle.gather( + gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) + gather_scores = gather_scores.reshape([batch_size, num_anchors]) + gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, + paddle.zeros_like(gather_scores)) + assigned_scores *= gather_scores.unsqueeze(-1) + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/ppdet/modeling/assigners/simota_assigner.py b/ppdet/modeling/assigners/simota_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..3b996b173caac276d45a29ec3218e86426c24940 --- /dev/null +++ b/ppdet/modeling/assigners/simota_assigner.py @@ -0,0 +1,272 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py + +import paddle +import numpy as np +import paddle.nn.functional as F + +from ppdet.modeling.losses.varifocal_loss import varifocal_loss +from ppdet.modeling.bbox_utils import batch_bbox_overlaps +from ppdet.core.workspace import register + + +@register +class SimOTAAssigner(object): + """Computes matching between predictions and ground truth. + Args: + center_radius (int | float, optional): Ground truth center size + to judge whether a prior is in center. Default 2.5. + candidate_topk (int, optional): The candidate top-k which used to + get top-k ious to calculate dynamic-k. Default 10. 
+ iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 3.0. + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + num_classes (int): The num_classes of dataset. + use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix. + """ + __shared__ = ['num_classes'] + + def __init__(self, + center_radius=2.5, + candidate_topk=10, + iou_weight=3.0, + cls_weight=1.0, + num_classes=80, + use_vfl=True): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.num_classes = num_classes + self.use_vfl = use_vfl + + def get_in_gt_and_in_center_info(self, flatten_center_and_stride, + gt_bboxes): + num_gt = gt_bboxes.shape[0] + + flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile( + [1, num_gt]) + flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile( + [1, num_gt]) + flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile( + [1, num_gt]) + flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile( + [1, num_gt]) + + # is prior centers in gt bboxes, shape: [n_center, n_gt] + l_ = flatten_x - gt_bboxes[:, 0] + t_ = flatten_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - flatten_x + b_ = gt_bboxes[:, 3] - flatten_y + + deltas = paddle.stack([l_, t_, r_, b_], axis=1) + is_in_gts = deltas.min(axis=1) > 0 + is_in_gts_all = is_in_gts.sum(axis=1) > 0 + + # is prior centers in gt centers + gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x + ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y + ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x + ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y + + cl_ = flatten_x - ct_bound_l + ct_ = flatten_y - ct_bound_t + cr_ = ct_bound_r - flatten_x + cb_ = ct_bound_b - flatten_y + + ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1) + is_in_cts = ct_deltas.min(axis=1) > 0 + is_in_cts_all = is_in_cts.sum(axis=1) > 0 + + # in any of gts or gt centers, shape: [n_center] + is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all, + is_in_cts_all) + + is_in_gts_or_centers_all_inds = paddle.nonzero( + is_in_gts_or_centers_all).squeeze(1) + + # both in gts and gt centers, shape: [num_fg, num_gt] + is_in_gts_and_centers = paddle.logical_and( + paddle.gather( + is_in_gts.cast('int'), is_in_gts_or_centers_all_inds, + axis=0).cast('bool'), + paddle.gather( + is_in_cts.cast('int'), is_in_gts_or_centers_all_inds, + axis=0).cast('bool')) + return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers + + def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): + match_matrix = np.zeros_like(cost_matrix.numpy()) + # select candidate topk ious for dynamic-k calculation + topk_ious, _ = paddle.topk( + pairwise_ious, + min(self.candidate_topk, pairwise_ious.shape[0]), + axis=0) + # calculate dynamic k for each gt + dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) + for gt_idx in range(num_gt): + _, pos_idx = paddle.topk( + cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0 + + del topk_ious, dynamic_ks, pos_idx + + # match points more than two gts + extra_match_gts_mask = match_matrix.sum(1) > 1 + if extra_match_gts_mask.sum() > 0: + cost_matrix = cost_matrix.numpy() + cost_argmin = 
np.argmin( + cost_matrix[extra_match_gts_mask, :], axis=1) + match_matrix[extra_match_gts_mask, :] *= 0.0 + match_matrix[extra_match_gts_mask, cost_argmin] = 1.0 + # get foreground mask + match_fg_mask_inmatrix = match_matrix.sum(1) > 0 + match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1) + + return match_gt_inds_to_fg, match_fg_mask_inmatrix + + def get_sample(self, assign_gt_inds, gt_bboxes): + pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) + neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) + pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 + + if gt_bboxes.size == 0: + # hack for index error case + assert pos_assigned_gt_inds.size == 0 + pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.resize(-1, 4) + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] + return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds + + def __call__(self, + flatten_cls_pred_scores, + flatten_center_and_stride, + flatten_bboxes, + gt_bboxes, + gt_labels, + eps=1e-7): + """Assign gt to priors using SimOTA. + TODO: add comment. + Returns: + assign_result: The assigned result. + """ + num_gt = gt_bboxes.shape[0] + num_bboxes = flatten_bboxes.shape[0] + + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes + label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes + label_weight = np.ones([num_bboxes], dtype=np.float32) + bbox_target = np.zeros_like(flatten_center_and_stride) + return 0, label, label_weight, bbox_target + + is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + flatten_center_and_stride, gt_bboxes) + + if len(is_in_gts_or_centers_all_inds) == 0: + # No valid boxes + label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes + label_weight = np.ones([num_bboxes], dtype=np.float32) + bbox_target = np.zeros_like(flatten_center_and_stride) + return 0, label, label_weight, bbox_target + + # bboxes and scores to calculate matrix + valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds] + valid_cls_pred_scores = flatten_cls_pred_scores[ + is_in_gts_or_centers_all_inds] + num_valid_bboxes = valid_flatten_bboxes.shape[0] + + pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes, + gt_bboxes) # [num_points,num_gts] + if self.use_vfl: + gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile( + [num_valid_bboxes, 1]).reshape([-1]) + valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( + [1, num_gt, 1]).reshape([-1, self.num_classes]) + vfl_score = np.zeros(valid_pred_scores.shape) + vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy( + )] = pairwise_ious.reshape([-1]) + vfl_score = paddle.to_tensor(vfl_score) + losses_vfl = varifocal_loss( + valid_pred_scores, vfl_score, + use_sigmoid=False).reshape([num_valid_bboxes, num_gt]) + losses_giou = batch_bbox_overlaps( + valid_flatten_bboxes, gt_bboxes, mode='giou') + cost_matrix = ( + losses_vfl * self.cls_weight + losses_giou * self.iou_weight + + paddle.logical_not(is_in_boxes_and_center).cast('float32') * + 100000000) + else: + iou_cost = -paddle.log(pairwise_ious + eps) + gt_onehot_label = (F.one_hot( + gt_labels.squeeze(-1).cast(paddle.int64), + flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0) + .tile([num_valid_bboxes, 1, 1])) + + valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( + [1, num_gt, 1]) + cls_cost = F.binary_cross_entropy( + valid_pred_scores, gt_onehot_label, 
reduction='none').sum(-1) + + cost_matrix = ( + cls_cost * self.cls_weight + iou_cost * self.iou_weight + + paddle.logical_not(is_in_boxes_and_center).cast('float32') * + 100000000) + + match_gt_inds_to_fg, match_fg_mask_inmatrix = \ + self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt) + + # sample and assign results + assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64) + match_fg_mask_inall = np.zeros_like(assigned_gt_inds) + match_fg_mask_inall[is_in_gts_or_centers_all.numpy( + )] = match_fg_mask_inmatrix + + assigned_gt_inds[match_fg_mask_inall.astype( + np.bool_)] = match_gt_inds_to_fg + 1 + + pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \ + = self.get_sample(assigned_gt_inds, gt_bboxes.numpy()) + + bbox_target = np.zeros_like(flatten_bboxes) + bbox_weight = np.zeros_like(flatten_bboxes) + label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes + label_weight = np.zeros([num_bboxes], dtype=np.float32) + + if len(pos_inds) > 0: + gt_labels = gt_labels.numpy() + pos_bbox_targets = pos_gt_bboxes + bbox_target[pos_inds, :] = pos_bbox_targets + bbox_weight[pos_inds, :] = 1.0 + if not np.any(gt_labels): + label[pos_inds] = 0 + else: + label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds] + + label_weight[pos_inds] = 1.0 + if len(neg_inds) > 0: + label_weight[neg_inds] = 1.0 + + pos_num = max(pos_inds.size, 1) + + return pos_num, label, label_weight, bbox_target diff --git a/ppdet/modeling/assigners/task_aligned_assigner.py b/ppdet/modeling/assigners/task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..23af79439ae7074b1a0f7fd74c42c1866c4de6ce --- /dev/null +++ b/ppdet/modeling/assigners/task_aligned_assigner.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ..bbox_utils import batch_iou_similarity +from .utils import (gather_topk_anchors, check_points_inside_bboxes, + compute_max_iou_anchor) + +__all__ = ['TaskAlignedAssigner'] + + +def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.): + """Calculate distance ratio of box1 and box2 in batch for larger stride + anchors dist/stride to promote the survive of large distance match + Args: + anchor (Tensor): box with the shape [L, 2] + gt (Tensor): box with the shape [N, M2, 4] + Return: + dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2] + """ + center1 = anchor.unsqueeze(0) + center2 = (gt[..., :2] + gt[..., -2:]) / 2. 
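+    # Broadcast anchor centers against gt centers to get pairwise offsets,
+    # normalize the L2 distance by the per-level stride (built from
+    # stride_lst, halving from 32 at each pyramid level), and mark an anchor
+    # as a candidate (1.) only when this normalized distance is below
+    # `max_dist`.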
+ center1 = center1.unsqueeze(1) # [N, M1, 2] -> [N, 1, M1, 2] + center2 = center2.unsqueeze(2) # [N, M2, 2] -> [N, M2, 1, 2] + + stride = paddle.concat([ + paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst) + ]).unsqueeze(0).unsqueeze(0) + dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride + dist_ratio = dist + dist_ratio[dist < max_dist] = 1. + dist_ratio[dist >= max_dist] = 0. + return dist_ratio + + +@register +class TaskAlignedAssigner(nn.Layer): + """TOOD: Task-aligned One-stage Object Detection + """ + + def __init__(self, + topk=13, + alpha=1.0, + beta=6.0, + eps=1e-9, + is_close_gt=False): + super(TaskAlignedAssigner, self).__init__() + self.topk = topk + self.alpha = alpha + self.beta = beta + self.eps = eps + self.is_close_gt = is_close_gt + + @paddle.no_grad() + def forward(self, + pred_scores, + pred_bboxes, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index, + gt_scores=None): + r"""This code is based on + https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py + + The assignment is done in following steps + 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free detector + only can predict positive distance) + 4. if an anchor box is assigned to multiple gts, the one with the + highest iou will be selected. + Args: + pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) + pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) + anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format + num_anchors_list (List): num of anchors in each level, shape(L) + gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) + gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) + pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) + bg_index (int): background index + gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) + Returns: + assigned_labels (Tensor): (B, L) + assigned_bboxes (Tensor): (B, L, 4) + assigned_scores (Tensor): (B, L, C) + """ + assert pred_scores.ndim == pred_bboxes.ndim + assert gt_labels.ndim == gt_bboxes.ndim and \ + gt_bboxes.ndim == 3 + + batch_size, num_anchors, num_classes = pred_scores.shape + _, num_max_boxes, _ = gt_bboxes.shape + + # negative batch + if num_max_boxes == 0: + assigned_labels = paddle.full( + [batch_size, num_anchors], bg_index, dtype='int32') + assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) + assigned_scores = paddle.zeros( + [batch_size, num_anchors, num_classes]) + return assigned_labels, assigned_bboxes, assigned_scores + + # compute iou between gt and pred bbox, [B, n, L] + ious = batch_iou_similarity(gt_bboxes, pred_bboxes) + # gather pred bboxes class score + pred_scores = pred_scores.transpose([0, 2, 1]) + batch_ind = paddle.arange( + end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) + gt_labels_ind = paddle.stack( + [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], + axis=-1) + bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) + # compute alignment metrics, [B, n, L] + alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( + self.beta) + + # check the positive sample's center in gt, [B, n, L] + if self.is_close_gt: + is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list) + else: + is_in_gts = 
check_points_inside_bboxes(anchor_points, gt_bboxes) + + # select topk largest alignment metrics pred bbox as candidates + # for each gt, [B, n, L] + is_in_topk = gather_topk_anchors( + alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) + + # select positive sample, [B, n, L] + mask_positive = is_in_topk * is_in_gts * pad_gt_mask + + # if an anchor box is assigned to multiple gts, + # the one with the highest iou will be selected, [B, n, L] + mask_positive_sum = mask_positive.sum(axis=-2) + if mask_positive_sum.max() > 1: + mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( + [1, num_max_boxes, 1]) + is_max_iou = compute_max_iou_anchor(ious) + mask_positive = paddle.where(mask_multiple_gts, is_max_iou, + mask_positive) + mask_positive_sum = mask_positive.sum(axis=-2) + assigned_gt_index = mask_positive.argmax(axis=-2) + + # assigned target + assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes + assigned_labels = paddle.gather( + gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) + assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) + assigned_labels = paddle.where( + mask_positive_sum > 0, assigned_labels, + paddle.full_like(assigned_labels, bg_index)) + + assigned_bboxes = paddle.gather( + gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) + + assigned_scores = F.one_hot(assigned_labels, num_classes + 1) + ind = list(range(num_classes + 1)) + ind.remove(bg_index) + assigned_scores = paddle.index_select( + assigned_scores, paddle.to_tensor(ind), axis=-1) + # rescale alignment metrics + alignment_metrics *= mask_positive + max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) + max_ious_per_instance = (ious * mask_positive).max(axis=-1, + keepdim=True) + alignment_metrics = alignment_metrics / ( + max_metrics_per_instance + self.eps) * max_ious_per_instance + alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) + assigned_scores = assigned_scores * alignment_metrics + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/ppdet/modeling/assigners/task_aligned_assigner_cr.py b/ppdet/modeling/assigners/task_aligned_assigner_cr.py new file mode 100644 index 0000000000000000000000000000000000000000..5c5097604d55856af45e0eeffacd381c0f5518b0 --- /dev/null +++ b/ppdet/modeling/assigners/task_aligned_assigner_cr.py @@ -0,0 +1,181 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
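+# TaskAlignedAssigner_CR follows the task-aligned assignment of TOOD but adds
+# an optional center-radius constraint: with center_radius=None only anchor
+# points falling inside a gt box are candidates, otherwise points within
+# center_radius * stride of a gt center are also considered (see forward()).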
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ..bbox_utils import batch_iou_similarity +from .utils import (gather_topk_anchors, check_points_inside_bboxes, + compute_max_iou_anchor) + +__all__ = ['TaskAlignedAssigner_CR'] + + +@register +class TaskAlignedAssigner_CR(nn.Layer): + """TOOD: Task-aligned One-stage Object Detection with Center R + """ + + def __init__(self, + topk=13, + alpha=1.0, + beta=6.0, + center_radius=None, + eps=1e-9): + super(TaskAlignedAssigner_CR, self).__init__() + self.topk = topk + self.alpha = alpha + self.beta = beta + self.center_radius = center_radius + self.eps = eps + + @paddle.no_grad() + def forward(self, + pred_scores, + pred_bboxes, + anchor_points, + stride_tensor, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index, + gt_scores=None): + r"""This code is based on + https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py + + The assignment is done in following steps + 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free detector + only can predict positive distance) + 4. if an anchor box is assigned to multiple gts, the one with the + highest iou will be selected. + Args: + pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) + pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) + anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format + stride_tensor (Tensor, float32): stride of feature map, shape(L, 1) + gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) + gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) + pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) + bg_index (int): background index + gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) + Returns: + assigned_labels (Tensor): (B, L) + assigned_bboxes (Tensor): (B, L, 4) + assigned_scores (Tensor): (B, L, C) + """ + assert pred_scores.ndim == pred_bboxes.ndim + assert gt_labels.ndim == gt_bboxes.ndim and \ + gt_bboxes.ndim == 3 + + batch_size, num_anchors, num_classes = pred_scores.shape + _, num_max_boxes, _ = gt_bboxes.shape + + # negative batch + if num_max_boxes == 0: + assigned_labels = paddle.full( + [batch_size, num_anchors], bg_index, dtype='int32') + assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) + assigned_scores = paddle.zeros( + [batch_size, num_anchors, num_classes]) + return assigned_labels, assigned_bboxes, assigned_scores + + # compute iou between gt and pred bbox, [B, n, L] + ious = batch_iou_similarity(gt_bboxes, pred_bboxes) + # gather pred bboxes class score + pred_scores = pred_scores.transpose([0, 2, 1]) + batch_ind = paddle.arange( + end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) + gt_labels_ind = paddle.stack( + [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], + axis=-1) + bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) + # compute alignment metrics, [B, n, L] + alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( + self.beta) * pad_gt_mask + + # select positive sample, [B, n, L] + if self.center_radius is None: + # check the positive sample's center in gt, [B, n, L] + is_in_gts = 
check_points_inside_bboxes( + anchor_points, gt_bboxes, sm_use=True) + # select topk largest alignment metrics pred bbox as candidates + # for each gt, [B, n, L] + mask_positive = gather_topk_anchors( + alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts + else: + is_in_gts, is_in_center = check_points_inside_bboxes( + anchor_points, + gt_bboxes, + stride_tensor * self.center_radius, + sm_use=True) + is_in_gts *= pad_gt_mask + is_in_center *= pad_gt_mask + candidate_metrics = paddle.where( + is_in_gts.sum(-1, keepdim=True) == 0, + alignment_metrics + is_in_center, + alignment_metrics) + mask_positive = gather_topk_anchors( + candidate_metrics, self.topk, + topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) | + (is_in_gts > 0), 'float32') + + # if an anchor box is assigned to multiple gts, + # the one with the highest iou will be selected, [B, n, L] + mask_positive_sum = mask_positive.sum(axis=-2) + if mask_positive_sum.max() > 1: + mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( + [1, num_max_boxes, 1]) + is_max_iou = compute_max_iou_anchor(ious * mask_positive) + mask_positive = paddle.where(mask_multiple_gts, is_max_iou, + mask_positive) + mask_positive_sum = mask_positive.sum(axis=-2) + assigned_gt_index = mask_positive.argmax(axis=-2) + + # assigned target + assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes + assigned_labels = paddle.gather( + gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) + assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) + assigned_labels = paddle.where( + mask_positive_sum > 0, assigned_labels, + paddle.full_like(assigned_labels, bg_index)) + + assigned_bboxes = paddle.gather( + gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) + + assigned_scores = F.one_hot(assigned_labels, num_classes + 1) + ind = list(range(num_classes + 1)) + ind.remove(bg_index) + assigned_scores = paddle.index_select( + assigned_scores, paddle.to_tensor(ind), axis=-1) + # rescale alignment metrics + alignment_metrics *= mask_positive + max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) + max_ious_per_instance = (ious * mask_positive).max(axis=-1, + keepdim=True) + alignment_metrics = alignment_metrics / ( + max_metrics_per_instance + self.eps) * max_ious_per_instance + alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) + assigned_scores = assigned_scores * alignment_metrics + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/ppdet/modeling/assigners/uniform_assigner.py b/ppdet/modeling/assigners/uniform_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..1c1480593d9fc66147b2eefb9d3b6246713e2c74 --- /dev/null +++ b/ppdet/modeling/assigners/uniform_assigner.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
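The UniformAssigner added in this file follows a uniform matching scheme: each ground-truth box is matched to the match_times predicted boxes and the match_times anchors whose (cx, cy, w, h) representations are closest under an L1 distance, and positive matches whose anchor IoU falls below pos_ignore_thr are discarded. A rough sketch of the closest-k selection on dummy boxes (sizes and values are illustrative only):

import paddle

match_times, num_priors, num_gts = 4, 100, 5
priors_cxcywh = paddle.rand([num_priors, 4])  # dummy anchors or predictions, cxcywh format
gts_cxcywh = paddle.rand([num_gts, 4])        # dummy ground-truth boxes, cxcywh format
# pairwise L1 distance between every prior and every gt: [num_priors, num_gts]
dist = (priors_cxcywh.unsqueeze(1) - gts_cxcywh.unsqueeze(0)).abs().sum(axis=-1)
# indices of the match_times closest priors for each gt: [match_times, num_gts]
top_idx = dist.topk(k=match_times, axis=0, largest=False)[1]
print(top_idx.shape)  # [4, 5]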
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +from ppdet.modeling.bbox_utils import batch_bbox_overlaps +from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh + +__all__ = ['UniformAssigner'] + + +def batch_p_dist(x, y, p=2): + """ + calculate pairwise p_dist, the first index of x and y are batch + return [x.shape[0], y.shape[0]] + """ + x = x.unsqueeze(1) + diff = x - y + return paddle.norm(diff, p=p, axis=list(range(2, diff.dim()))) + + +@register +class UniformAssigner(nn.Layer): + def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4): + super(UniformAssigner, self).__init__() + self.pos_ignore_thr = pos_ignore_thr + self.neg_ignore_thr = neg_ignore_thr + self.match_times = match_times + + def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None): + num_bboxes = bbox_pred.shape[0] + num_gts = gt_bboxes.shape[0] + match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32) + + pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes) + pred_max_iou = pred_ious.max(axis=1) + neg_ignore = pred_max_iou > self.neg_ignore_thr + # exclude potential ignored neg samples first, deal with pos samples later + #match_labels: -2(ignore), -1(neg) or >=0(pos_inds) + match_labels = paddle.where(neg_ignore, + paddle.full_like(match_labels, -2), + match_labels) + + bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred) + anchor_c = bbox_xyxy_to_cxcywh(anchor) + gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes) + bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1) + anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1) + + top_pred = bbox_pred_dist.topk( + k=self.match_times, axis=0, largest=False)[1] + top_anchor = anchor_dist.topk( + k=self.match_times, axis=0, largest=False)[1] + + tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts]) + tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts]) + pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1]) + pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1]) + + pos_anchor = anchor[pos_places] + pos_tar_bbox = gt_bboxes[pos_inds] + pos_ious = batch_bbox_overlaps( + pos_anchor, pos_tar_bbox, is_aligned=True) + pos_ignore = pos_ious < self.pos_ignore_thr + pos_inds = paddle.where(pos_ignore, + paddle.full_like(pos_inds, -2), pos_inds) + match_labels[pos_places] = pos_inds + match_labels.stop_gradient = True + pos_keep = ~pos_ignore + + if pos_keep.sum() > 0: + pos_places_keep = pos_places[pos_keep] + pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4]) + pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach() + else: + pos_bbox_pred = None + pos_bbox_tar = None + + return match_labels, pos_bbox_pred, pos_bbox_tar diff --git a/ppdet/modeling/assigners/utils.py b/ppdet/modeling/assigners/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe7c9382c359a55b8c4b6efc491eaa049aab5b1 --- /dev/null +++ b/ppdet/modeling/assigners/utils.py @@ -0,0 +1,230 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F + +__all__ = [ + 'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes', + 'compute_max_iou_anchor', 'compute_max_iou_gt', + 'generate_anchors_for_grid_cell' +] + + +def pad_gt(gt_labels, gt_bboxes, gt_scores=None): + r""" Pad 0 in gt_labels and gt_bboxes. + Args: + gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, + shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i) + gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, + shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) + gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes, + shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) + Returns: + pad_gt_labels (Tensor, int64): shape[B, n, 1] + pad_gt_bboxes (Tensor, float32): shape[B, n, 4] + pad_gt_scores (Tensor, float32): shape[B, n, 1] + pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox + """ + if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes, + paddle.Tensor): + assert gt_labels.ndim == gt_bboxes.ndim and \ + gt_bboxes.ndim == 3 + pad_gt_mask = ( + gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype) + if gt_scores is None: + gt_scores = pad_gt_mask.clone() + assert gt_labels.ndim == gt_scores.ndim + + return gt_labels, gt_bboxes, gt_scores, pad_gt_mask + elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list): + assert len(gt_labels) == len(gt_bboxes), \ + 'The number of `gt_labels` and `gt_bboxes` is not equal. ' + num_max_boxes = max([len(a) for a in gt_bboxes]) + batch_size = len(gt_bboxes) + # pad label and bbox + pad_gt_labels = paddle.zeros( + [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype) + pad_gt_bboxes = paddle.zeros( + [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype) + pad_gt_scores = paddle.zeros( + [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) + pad_gt_mask = paddle.zeros( + [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) + for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)): + if len(label) > 0 and len(bbox) > 0: + pad_gt_labels[i, :len(label)] = label + pad_gt_bboxes[i, :len(bbox)] = bbox + pad_gt_mask[i, :len(bbox)] = 1. + if gt_scores is not None: + pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i] + if gt_scores is None: + pad_gt_scores = pad_gt_mask.clone() + return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask + else: + raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ') + + +def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): + r""" + Args: + metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors + topk (int): The number of top elements to look for along the axis. + largest (bool) : largest is a flag, if set to true, + algorithm will sort by descending order, otherwise sort by + ascending order. Default: True + topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, + Default: None + eps (float): Default: 1e-9 + Returns: + is_in_topk (Tensor, float32): shape[B, n, L], value=1. 
means selected + """ + num_anchors = metrics.shape[-1] + topk_metrics, topk_idxs = paddle.topk( + metrics, topk, axis=-1, largest=largest) + if topk_mask is None: + topk_mask = ( + topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( + axis=-2).astype(metrics.dtype) + return is_in_topk * topk_mask + + +def check_points_inside_bboxes(points, + bboxes, + center_radius_tensor=None, + eps=1e-9, + sm_use=False): + r""" + Args: + points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors + bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format + center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. + eps (float): Default: 1e-9 + Returns: + is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected + """ + points = points.unsqueeze([0, 1]) + x, y = points.chunk(2, axis=-1) + xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) + # check whether `points` is in `bboxes` + l = x - xmin + t = y - ymin + r = xmax - x + b = ymax - y + delta_ltrb = paddle.concat([l, t, r, b], axis=-1) + is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) + if center_radius_tensor is not None: + # check whether `points` is in `center_radius` + center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) + cx = (xmin + xmax) * 0.5 + cy = (ymin + ymax) * 0.5 + l = x - (cx - center_radius_tensor) + t = y - (cy - center_radius_tensor) + r = (cx + center_radius_tensor) - x + b = (cy + center_radius_tensor) - y + delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) + is_in_center = (delta_ltrb_c.min(axis=-1) > eps) + if sm_use: + return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype( + bboxes.dtype) + else: + return (paddle.logical_and(is_in_bboxes, is_in_center), + paddle.logical_or(is_in_bboxes, is_in_center)) + + return is_in_bboxes.astype(bboxes.dtype) + + +def compute_max_iou_anchor(ious): + r""" + For each anchor, find the GT with the largest IOU. + Args: + ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors + Returns: + is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected + """ + num_max_boxes = ious.shape[-2] + max_iou_index = ious.argmax(axis=-2) + is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1]) + return is_max_iou.astype(ious.dtype) + + +def compute_max_iou_gt(ious): + r""" + For each GT, find the anchor with the largest IOU. + Args: + ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors + Returns: + is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected + """ + num_anchors = ious.shape[-1] + max_iou_index = ious.argmax(axis=-1) + is_max_iou = F.one_hot(max_iou_index, num_anchors) + return is_max_iou.astype(ious.dtype) + + +def generate_anchors_for_grid_cell(feats, + fpn_strides, + grid_cell_size=5.0, + grid_cell_offset=0.5, + dtype='float32'): + r""" + Like ATSS, generate anchors based on grid size. + Args: + feats (List[Tensor]): shape[s, (b, c, h, w)] + fpn_strides (tuple|list): shape[s], stride for each scale feature + grid_cell_size (float): anchor size + grid_cell_offset (float): The range is between 0 and 1. + Returns: + anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. + anchor_points (Tensor): shape[l, 2], "x, y" format. + num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. + stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. 
+ """ + assert len(feats) == len(fpn_strides) + anchors = [] + anchor_points = [] + num_anchors_list = [] + stride_tensor = [] + for feat, stride in zip(feats, fpn_strides): + _, _, h, w = feat.shape + cell_half_size = grid_cell_size * stride * 0.5 + shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride + shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor = paddle.stack( + [ + shift_x - cell_half_size, shift_y - cell_half_size, + shift_x + cell_half_size, shift_y + cell_half_size + ], + axis=-1).astype(dtype) + anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) + + anchors.append(anchor.reshape([-1, 4])) + anchor_points.append(anchor_point.reshape([-1, 2])) + num_anchors_list.append(len(anchors[-1])) + stride_tensor.append( + paddle.full( + [num_anchors_list[-1], 1], stride, dtype=dtype)) + anchors = paddle.concat(anchors) + anchors.stop_gradient = True + anchor_points = paddle.concat(anchor_points) + anchor_points.stop_gradient = True + stride_tensor = paddle.concat(stride_tensor) + stride_tensor.stop_gradient = True + return anchors, anchor_points, num_anchors_list, stride_tensor diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..72a4ab9dce474aa3f3520dd8e77f51e092bc4be3 --- /dev/null +++ b/ppdet/modeling/backbones/__init__.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import resnet +from . import darknet +from . import mobilenet_v1 +from . import mobilenet_v3 +from . import cspresnet +from . import csp_darknet +from . import yolov6_efficientrep +from . import yolov7_elannet +from . import cspnext +from . import yolov8_csp_darknet +from . import convnext +from . import focalnet +from . import swin_transformer +from . import vision_transformer +from . import vit_mae +from . import hgnet_v2 + +from .resnet import * +from .darknet import * +from .mobilenet_v1 import * +from .mobilenet_v3 import * +from .cspresnet import * +from .csp_darknet import * +from .yolov6_efficientrep import * +from .yolov7_elannet import * +from .cspnext import * +from .yolov8_csp_darknet import * +from .convnext import * +from .focalnet import * +from .swin_transformer import * +from .vision_transformer import * +from .vit_mae import * +from .hgnet_v2 import * diff --git a/ppdet/modeling/backbones/convnext.py b/ppdet/modeling/backbones/convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..476e12b2da50585dd142f3049ba024769e691e8b --- /dev/null +++ b/ppdet/modeling/backbones/convnext.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Modified from https://github.com/facebookresearch/ConvNeXt +Copyright (c) Meta Platforms, Inc. and affiliates. +All rights reserved. +This source code is licensed under the license found in the +LICENSE file in the root directory of this source tree. +''' + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant + +import numpy as np + +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec +from .transformer_utils import DropPath, trunc_normal_, zeros_ + +__all__ = ['ConvNeXt'] + + +class Block(nn.Layer): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in Pypaddle + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2D( + dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + + if layer_scale_init_value > 0: + self.gamma = self.create_parameter( + shape=(dim, ), + attr=ParamAttr(initializer=Constant(layer_scale_init_value))) + else: + self.gamma = None + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( + ) + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.transpose([0, 2, 3, 1]) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose([0, 3, 1, 2]) + x = input + self.drop_path(x) + return x + + +class LayerNorm(nn.Layer): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). 
+ """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + + self.weight = self.create_parameter( + shape=(normalized_shape, ), + attr=ParamAttr(initializer=Constant(1.))) + self.bias = self.create_parameter( + shape=(normalized_shape, ), + attr=ParamAttr(initializer=Constant(0.))) + + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, + self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / paddle.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +@register +@serializable +class ConvNeXt(nn.Layer): + r""" ConvNeXt + A Pypaddle impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + arch_settings = { + 'tiny': { + 'depths': [3, 3, 9, 3], + 'dims': [96, 192, 384, 768] + }, + 'small': { + 'depths': [3, 3, 27, 3], + 'dims': [96, 192, 384, 768] + }, + 'base': { + 'depths': [3, 3, 27, 3], + 'dims': [128, 256, 512, 1024] + }, + 'large': { + 'depths': [3, 3, 27, 3], + 'dims': [192, 384, 768, 1536] + }, + 'xlarge': { + 'depths': [3, 3, 27, 3], + 'dims': [256, 512, 1024, 2048] + }, + } + + def __init__( + self, + arch='tiny', + in_chans=3, + drop_path_rate=0., + layer_scale_init_value=1e-6, + return_idx=[1, 2, 3], + norm_output=True, + pretrained=None, ): + super().__init__() + depths = self.arch_settings[arch]['depths'] + dims = self.arch_settings[arch]['dims'] + self.downsample_layers = nn.LayerList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2D( + in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm( + dims[0], eps=1e-6, data_format="channels_first")) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm( + dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2D( + dims[i], dims[i + 1], kernel_size=2, stride=2), ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.LayerList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential(* [ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.return_idx = return_idx + self.dims = [dims[i] for i in return_idx] # [::-1] + + self.norm_output = norm_output + if norm_output: + self.norms = nn.LayerList([ + LayerNorm( + c, eps=1e-6, data_format="channels_first") + for c in self.dims + ]) + + self.apply(self._init_weights) + + if pretrained is not None: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + 
self.set_state_dict(paddle.load(path)) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2D, nn.Linear)): + trunc_normal_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + output = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + output.append(x) + + outputs = [output[i] for i in self.return_idx] + if self.norm_output: + outputs = [self.norms[i](out) for i, out in enumerate(outputs)] + + return outputs + + def forward(self, x): + x = self.forward_features(x['image']) + return x + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self.dims] diff --git a/ppdet/modeling/backbones/csp_darknet.py b/ppdet/modeling/backbones/csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..92c4c309d2840df39bd9b62789476f65fcd1d9c8 --- /dev/null +++ b/ppdet/modeling/backbones/csp_darknet.py @@ -0,0 +1,442 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from ppdet.modeling.initializer import conv_init_ +from ..shape_spec import ShapeSpec + +__all__ = [ + 'CSPDarkNet', + 'BaseConv', + 'DWConv', + 'BottleNeck', + 'SPPLayer', + 'SPPFLayer', +] + + +def get_activation(name="silu"): + if name == "silu": + module = nn.Silu() + elif name == "relu": + module = nn.ReLU() + elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: + module = nn.LeakyReLU(0.1) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class SiLU(nn.Layer): + def __init__(self): + super(SiLU, self).__init__() + + def forward(self, x): + return x * F.sigmoid(x) + + +class BaseConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + ksize, + stride, + groups=1, + bias=False, + act="silu"): + super(BaseConv, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=(ksize - 1) // 2, + groups=groups, + bias_attr=bias) + self.bn = nn.BatchNorm2D( + out_channels, + # epsilon=1e-3, # for amp(fp16), set in ppdet/engine/trainer.py + # momentum=0.97, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = get_activation(act) if isinstance(act, str) else act + self._init_weights() + + def _init_weights(self): + conv_init_(self.conv) + + def forward(self, x): + x = self.bn(self.conv(x)) + if self.training: + y = self.act(x) + else: + if isinstance(self.act, nn.Silu): + self.act = SiLU() + y = self.act(x) + return y + + +class DWConv(nn.Layer): + """Depthwise Conv""" + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + bias=False, + act="silu"): + super(DWConv, self).__init__() + self.dw_conv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + 
bias=bias, + act=act) + self.pw_conv = BaseConv( + in_channels, + out_channels, + ksize=1, + stride=1, + groups=1, + bias=bias, + act=act) + + def forward(self, x): + return self.pw_conv(self.dw_conv(x)) + + +class Focus(nn.Layer): + """Focus width and height information into channel space, used in YOLOX.""" + + def __init__(self, + in_channels, + out_channels, + ksize=3, + stride=1, + bias=False, + act="silu"): + super(Focus, self).__init__() + self.conv = BaseConv( + in_channels * 4, + out_channels, + ksize=ksize, + stride=stride, + bias=bias, + act=act) + + def forward(self, inputs): + # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] + top_left = inputs[:, :, 0::2, 0::2] + top_right = inputs[:, :, 0::2, 1::2] + bottom_left = inputs[:, :, 1::2, 0::2] + bottom_right = inputs[:, :, 1::2, 1::2] + outputs = paddle.concat( + [top_left, bottom_left, top_right, bottom_right], 1) + return self.conv(outputs) + + +class BottleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + shortcut=True, + kernel_sizes=(1, 3), + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(BottleNeck, self).__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, + hidden_channels, + ksize=kernel_sizes[0], + stride=1, + bias=bias, + act=act) + self.conv2 = Conv( + hidden_channels, + out_channels, + ksize=kernel_sizes[1], + stride=1, + bias=bias, + act=act) + self.add_shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.add_shortcut: + return paddle.add(y, x) + else: + return y + + +class SPPLayer(nn.Layer): + """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + bias=False, + act="silu"): + super(SPPLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.maxpoolings = nn.LayerList([ + nn.MaxPool2D( + kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) + + def forward(self, x): + x = self.conv1(x) + x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) + x = self.conv2(x) + return x + + +class SPPFLayer(nn.Layer): + """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, + equivalent to SPP(k=(5, 9, 13)) + """ + + def __init__(self, + in_channels, + out_channels, + ksize=5, + bias=False, + act='silu'): + super(SPPFLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.maxpooling = nn.MaxPool2D( + kernel_size=ksize, stride=1, padding=ksize // 2) + conv2_channels = hidden_channels * 4 + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) + + def forward(self, x): + x = self.conv1(x) + y1 = self.maxpooling(x) + y2 = self.maxpooling(y1) + y3 = self.maxpooling(y2) + concats = paddle.concat([x, y1, y2, y3], axis=1) + out = self.conv2(concats) + return out + + +class CSPLayer(nn.Layer): + """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" + + def __init__(self, + in_channels, + out_channels, + num_blocks=1, + shortcut=True, + expansion=0.5, + 
depthwise=False, + bias=False, + act="silu"): + super(CSPLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(* [ + BottleNeck( + hidden_channels, + hidden_channels, + shortcut=shortcut, + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act) for _ in range(num_blocks) + ]) + self.conv3 = BaseConv( + hidden_channels * 2, + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + x = paddle.concat([x_1, x_2], axis=1) + x = self.conv3(x) + return x + + +@register +@serializable +class CSPDarkNet(nn.Layer): + """ + CSPDarkNet backbone. + Args: + arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, + and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. + depth_mult (float): Depth multiplier, multiply number of blocks in + CSPLayer, default as 1.0. + width_mult (float): Width multiplier, multiply number of channels in + each layer, default as 1.0. + depthwise (bool): Whether to use depth-wise conv layer. + act (str): Activation function type, default as 'silu'. + return_idx (list): Index of stages whose feature maps are returned. + """ + + __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] + + # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) + # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. + arch_settings = { + 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 1024, 3, True, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, True, True]], + } + + def __init__(self, + arch='X', + depth_mult=1.0, + width_mult=1.0, + depthwise=False, + act='silu', + trt=False, + return_idx=[2, 3, 4]): + super(CSPDarkNet, self).__init__() + self.arch = arch + self.return_idx = return_idx + Conv = DWConv if depthwise else BaseConv + arch_setting = self.arch_settings[arch] + base_channels = int(arch_setting[0][0] * width_mult) + + # Note: differences between the latest YOLOv5 and the original YOLOX + # 1. self.stem, use Conv stem (in YOLOv5) or Focus stem (in YOLOX) + # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) + # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer + # 4. 
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX + if arch in ['P5', 'P6']: + # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) + self.stem = Conv( + 3, base_channels, ksize=6, stride=2, bias=False, act=act) + spp_kernal_sizes = 5 + elif arch in ['X']: + # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) + self.stem = Focus( + 3, base_channels, ksize=3, stride=1, bias=False, act=act) + spp_kernal_sizes = (5, 9, 13) + else: + raise AttributeError("Unsupported arch type: {}".format(arch)) + + _out_channels = [base_channels] + layers_num = 1 + self.csp_dark_blocks = [] + + for i, (in_channels, out_channels, num_blocks, shortcut, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * width_mult) + out_channels = int(out_channels * width_mult) + _out_channels.append(out_channels) + num_blocks = max(round(num_blocks * depth_mult), 1) + stage = [] + + conv_layer = self.add_sublayer( + 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), + Conv( + in_channels, out_channels, 3, 2, bias=False, act=act)) + stage.append(conv_layer) + layers_num += 1 + + if use_spp and arch in ['X']: + # in YOLOX use SPPLayer + spp_layer = self.add_sublayer( + 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), + SPPLayer( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + bias=False, + act=act)) + stage.append(spp_layer) + layers_num += 1 + + csp_layer = self.add_sublayer( + 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), + CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + shortcut=shortcut, + depthwise=depthwise, + bias=False, + act=act)) + stage.append(csp_layer) + layers_num += 1 + + if use_spp and arch in ['P5', 'P6']: + # in latest YOLOv5 use SPPFLayer instead of SPPLayer + sppf_layer = self.add_sublayer( + 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), + SPPFLayer( + out_channels, + out_channels, + ksize=5, + bias=False, + act=act)) + stage.append(sppf_layer) + layers_num += 1 + + self.csp_dark_blocks.append(nn.Sequential(*stage)) + + self._out_channels = [_out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.csp_dark_blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/ppdet/modeling/backbones/cspnext.py b/ppdet/modeling/backbones/cspnext.py new file mode 100644 index 0000000000000000000000000000000000000000..a26c3f366f4875e625f720b5f1b520d62c278838 --- /dev/null +++ b/ppdet/modeling/backbones/cspnext.py @@ -0,0 +1,240 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
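CSPNeXt below is assembled the same way as CSPDarkNet above: each variant is derived from a base architecture table, with width_mult scaling the channel counts and depth_mult scaling the number of blocks per stage (clamped so every stage keeps at least one block). A tiny sketch of that scaling rule, using hypothetical multiplier values rather than any particular config:

# [in_channels, out_channels, num_blocks] from the 'P5' table defined below
arch_setting = [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]]
width_mult, depth_mult = 0.5, 0.33  # hypothetical small-model multipliers
for in_ch, out_ch, n in arch_setting:
    scaled = (int(in_ch * width_mult), int(out_ch * width_mult),
              max(round(n * depth_mult), 1))
    print(scaled)  # channels shrink, but each stage keeps at least one block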
+ +import paddle +import paddle.nn as nn +from ppdet.core.workspace import register, serializable +from .csp_darknet import BaseConv, DWConv, SPPLayer +from ..shape_spec import ShapeSpec + +__all__ = ['CSPNeXtBlock', 'CSPNeXtLayer', 'CSPNeXt'] + + +class CSPNeXtBlock(nn.Layer): + """The basic bottleneck block used in CSPNeXt.""" + + def __init__(self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + kernel_size=5, + bias=False, + act="silu"): + super(CSPNeXtBlock, self).__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, hidden_channels, 3, stride=1, bias=bias, act=act) + self.conv2 = DWConv( + hidden_channels, + out_channels, + ksize=kernel_size, + stride=1, + bias=bias, + act=act) + self.add_shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.add_shortcut: + y = y + x + return y + + +class ChannelAttention(nn.Layer): + def __init__(self, channels=256): + super().__init__() + self.pool = nn.AdaptiveAvgPool2D(1) + self.fc = nn.Conv2D(channels, channels, 1, 1, bias_attr=True) + self.act = nn.Hardsigmoid() + + def forward(self, x): + y = self.pool(x) + out = self.act(self.fc(y)) + return x * out + + +class CSPNeXtLayer(nn.Layer): + """CSPNeXt layer used in RTMDet, like CSPLayer(C3) in YOLOv5/YOLOX""" + + def __init__(self, + in_channels, + out_channels, + num_blocks=1, + shortcut=True, + expansion=0.5, + depthwise=False, + ch_attn=False, + bias=False, + act="silu"): + super(CSPNeXtLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.ch_attn = ch_attn + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv3 = BaseConv( + hidden_channels * 2, + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + self.bottlenecks = nn.Sequential(* [ + CSPNeXtBlock( + hidden_channels, + hidden_channels, + shortcut=shortcut, + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act) for _ in range(num_blocks) + ]) + if ch_attn: + self.ch_attn = ChannelAttention(hidden_channels * 2) + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + x = paddle.concat([x_1, x_2], axis=1) + if self.ch_attn: + x = self.ch_attn(x) + x = self.conv3(x) + return x + + +@register +@serializable +class CSPNeXt(nn.Layer): + """ + CSPNeXt backbone of RTMDet. + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + depth_mult (float): Depth multiplier, multiply number of blocks in + CSPNeXtLayer, default as 1.0. + width_mult (float): Width multiplier, multiply number of channels in + each layer, default as 1.0. + depthwise (bool): Whether to use depth-wise conv layer. + spp_kernel_sizes (tuple): kernel sizes of the SPP layer. + ch_attn (bool): Whether to add channel attention. + act (str): Activation function type, default as 'silu'. + trt (bool): Whether to use trt infer in activation. + return_idx (list): Index of stages whose feature maps are returned. 
+ """ + + __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] + + # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__(self, + arch='P5', + depth_mult=1.0, + width_mult=1.0, + depthwise=False, + spp_kernel_sizes=(5, 9, 13), + ch_attn=True, + act='silu', + trt=False, + return_idx=[2, 3, 4]): + super(CSPNeXt, self).__init__() + self.arch = arch + self.return_idx = return_idx + Conv = DWConv if depthwise else BaseConv + arch_setting = self.arch_settings[arch] + stem_ch = int(arch_setting[0][0] * width_mult // 2) + stem_out_ch = int(stem_ch * 2) + + self.stem = nn.Sequential( + ('conv1', BaseConv( + 3, stem_ch, 3, 2, act=act)), ('conv2', BaseConv( + stem_ch, stem_ch, 3, 1, act=act)), ('conv3', BaseConv( + stem_ch, stem_out_ch, 3, 1, act=act))) + + _out_channels = [stem_out_ch] + layers_num = 1 + self.csp_next_blocks = [] + + for i, (in_ch, out_ch, n, shortcut, use_spp) in enumerate(arch_setting): + in_channels = int(in_ch * width_mult) + out_channels = int(out_ch * width_mult) + _out_channels.append(out_channels) + num_blocks = max(round(n * depth_mult), 1) + stage = [] + + conv_layer = self.add_sublayer( + 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), + Conv( + in_channels, out_channels, 3, 2, act=act)) + stage.append(conv_layer) + layers_num += 1 + + if use_spp: + spp_layer = self.add_sublayer( + 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), + SPPLayer( + out_channels, + out_channels, + kernel_sizes=spp_kernel_sizes, + bias=False, + act=act)) + stage.append(spp_layer) + layers_num += 1 + + csp_layer = self.add_sublayer( + 'layers{}.stage{}.cspnext_layer'.format(layers_num, i + 1), + CSPNeXtLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + shortcut=shortcut, + depthwise=depthwise, + ch_attn=ch_attn, + bias=False, + act=act)) + stage.append(csp_layer) + layers_num += 1 + + self.csp_next_blocks.append(nn.Sequential(*stage)) + + self._out_channels = [_out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.csp_next_blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/ppdet/modeling/backbones/cspresnet.py b/ppdet/modeling/backbones/cspresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..5268ec835381052988b9ceaca47c89ab2755bec9 --- /dev/null +++ b/ppdet/modeling/backbones/cspresnet.py @@ -0,0 +1,321 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Constant + +from ppdet.modeling.ops import get_act_fn +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act=None): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn = nn.BatchNorm2D( + ch_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVggBlock(nn.Layer): + def __init__(self, ch_in, ch_out, act='relu', alpha=False): + super(RepVggBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.conv2 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + if alpha: + self.alpha = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.)), + dtype="float32") + else: + self.alpha = None + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + if self.alpha: + y = self.conv1(x) + self.alpha * self.conv2(x) + else: + y = self.conv1(x) + self.conv2(x) + y = self.act(y) + return y + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2D( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.set_value(kernel) + self.conv.bias.set_value(bias) + self.__delattr__('conv1') + self.__delattr__('conv2') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + self.alpha * bias1x1 + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + act='relu', + shortcut=True, + use_alpha=False): + super(BasicBlock, self).__init__() + assert ch_in == ch_out 
+ self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return paddle.add(x, y) + else: + return y + + +class EffectiveSELayer(nn.Layer): + """ Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + + def __init__(self, channels, act='hardsigmoid'): + super(EffectiveSELayer, self).__init__() + self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.act(x_se) + + +class CSPResStage(nn.Layer): + def __init__(self, + block_fn, + ch_in, + ch_out, + n, + stride, + act='relu', + attn='eca', + use_alpha=False): + super(CSPResStage, self).__init__() + + ch_mid = (ch_in + ch_out) // 2 + if stride == 2: + self.conv_down = ConvBNLayer( + ch_in, ch_mid, 3, stride=2, padding=1, act=act) + else: + self.conv_down = None + self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.blocks = nn.Sequential(*[ + block_fn( + ch_mid // 2, + ch_mid // 2, + act=act, + shortcut=True, + use_alpha=use_alpha) for i in range(n) + ]) + if attn: + self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') + else: + self.attn = None + + self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = paddle.concat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@register +@serializable +class CSPResNet(nn.Layer): + __shared__ = ['width_mult', 'depth_mult', 'trt'] + + def __init__(self, + layers=[3, 6, 6, 3], + channels=[64, 128, 256, 512, 1024], + act='swish', + return_idx=[1, 2, 3], + depth_wise=False, + use_large_stem=False, + width_mult=1.0, + depth_mult=1.0, + trt=False, + use_checkpoint=False, + use_alpha=False, + **args): + super(CSPResNet, self).__init__() + self.use_checkpoint = use_checkpoint + channels = [max(round(c * width_mult), 1) for c in channels] + layers = [max(round(l * depth_mult), 1) for l in layers] + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + + if use_large_stem: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act)), ('conv3', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + else: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + + n = len(channels) - 1 + self.stages = nn.Sequential(*[(str(i), CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha)) for i in range(n)]) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + if use_checkpoint: + paddle.seed(0) + + def forward(self, inputs): + x = inputs['image'] + x = self.stem(x) + outs 
= [] + for idx, stage in enumerate(self.stages): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + stage, x, **{"preserve_rng_state": True}) + else: + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] diff --git a/ppdet/modeling/backbones/darknet.py b/ppdet/modeling/backbones/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..c68c65027e83e0c1b353d05c1795ed7a622438a4 --- /dev/null +++ b/ppdet/modeling/backbones/darknet.py @@ -0,0 +1,345 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register, serializable +from ppdet.modeling.ops import batch_norm, mish +from ..shape_spec import ShapeSpec + +__all__ = ['DarkNet', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + norm_type='bn', + norm_decay=0., + act="leaky", + freeze_norm=False, + data_format='NCHW', + name=''): + """ + conv + bn + activation layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 1 + groups (int): number of groups of conv layer, default 1 + padding (int): padding size, default 0 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + act (str): activation function type, default 'leaky', which means leaky_relu + freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + data_format=data_format, + bias_attr=False) + self.batch_norm = batch_norm( + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.act = act + + def forward(self, inputs): + out = self.conv(inputs) + out = self.batch_norm(out) + if self.act == 'leaky': + out = F.leaky_relu(out, 0.1) + else: + out = getattr(F, self.act)(out) + return out + + +class DownSample(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=2, + padding=1, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + downsample layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 2 + padding (int): padding size, default 1 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + + super(DownSample, self).__init__() + + self.conv_bn_layer = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.ch_out = ch_out + + def forward(self, inputs): + out = self.conv_bn_layer(inputs) + return out + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + BasicBlock layer of DarkNet + + Args: + ch_in (int): input channel + ch_out (int): output channel + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + + super(BasicBlock, self).__init__() + + assert ch_in == ch_out and (ch_in % 2) == 0, \ + f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}" + # example: + # --------------{conv1} --> {conv2} + # channel route: 10-->5 --> 5-->10 + self.conv1 = ConvBNLayer( + ch_in=ch_in, + ch_out=int(ch_out / 2), + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.conv2 = ConvBNLayer( + ch_in=int(ch_out / 2), + ch_out=ch_out, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + def forward(self, inputs): + conv1 = self.conv1(inputs) + conv2 = self.conv2(conv1) + out = paddle.add(x=inputs, y=conv2) + return out + + +class Blocks(nn.Layer): + def __init__(self, + ch_in, + ch_out, + count, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None, + data_format='NCHW'): + """ + Blocks layer, which consist of some BaickBlock layers + + Args: + ch_in (int): input channel + ch_out (int): output channel + count (int): number of BasicBlock layer + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ freeze_norm (bool): whether to freeze norm, default False + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(Blocks, self).__init__() + + self.basicblock0 = BasicBlock( + ch_in, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.res_out_list = [] + for i in range(1, count): + block_name = '{}.{}'.format(name, i) + res_out = self.add_sublayer( + block_name, + BasicBlock( + ch_out, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format)) + self.res_out_list.append(res_out) + self.ch_out = ch_out + + def forward(self, inputs): + y = self.basicblock0(inputs) + for basic_block_i in self.res_out_list: + y = basic_block_i(y) + return y + + +DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} + + +@register +@serializable +class DarkNet(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + depth=53, + freeze_at=-1, + return_idx=[2, 3, 4], + num_stages=5, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + Darknet, see https://pjreddie.com/darknet/yolo/ + + Args: + depth (int): depth of network + freeze_at (int): freeze the backbone at which stage + filter_size (int): filter size, default 3 + return_idx (list): index of stages whose feature maps are returned + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + data_format (str): data format, NCHW or NHWC + """ + super(DarkNet, self).__init__() + self.depth = depth + self.freeze_at = freeze_at + self.return_idx = return_idx + self.num_stages = num_stages + self.stages = DarkNet_cfg[self.depth][0:num_stages] + + self.conv0 = ConvBNLayer( + ch_in=3, + ch_out=32, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + self.downsample0 = DownSample( + ch_in=32, + ch_out=32 * 2, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + self._out_channels = [] + self.darknet_conv_block_list = [] + self.downsample_list = [] + ch_in = [64, 128, 256, 512, 1024] + for i, stage in enumerate(self.stages): + name = 'stage.{}'.format(i) + conv_block = self.add_sublayer( + name, + Blocks( + int(ch_in[i]), + int(ch_in[i]), + stage, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format, + name=name)) + self.darknet_conv_block_list.append(conv_block) + if i in return_idx: + self._out_channels.append(int(ch_in[i])) + for i in range(num_stages - 1): + down_name = 'stage.{}.downsample'.format(i) + downsample = self.add_sublayer( + down_name, + DownSample( + ch_in=int(ch_in[i]), + ch_out=int(ch_in[i + 1]), + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format)) + self.downsample_list.append(downsample) + + def forward(self, inputs): + x = inputs['image'] + + out = self.conv0(x) + out = self.downsample0(out) + blocks = [] + for i, conv_block_i in enumerate(self.darknet_conv_block_list): + out = conv_block_i(out) + if i == self.freeze_at: + out.stop_gradient = True + if i in self.return_idx: + blocks.append(out) + if i < self.num_stages - 1: + out = self.downsample_list[i](out) + return blocks + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/focalnet.py 
b/ppdet/modeling/backbones/focalnet.py new file mode 100644 index 0000000000000000000000000000000000000000..54c2877623f269db4890875f89520e3293cdcef0 --- /dev/null +++ b/ppdet/modeling/backbones/focalnet.py @@ -0,0 +1,720 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py +""" +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.modeling.shape_spec import ShapeSpec +from ppdet.core.workspace import register, serializable +from .transformer_utils import DropPath, Identity +from .transformer_utils import add_parameter, to_2tuple +from .transformer_utils import ones_, zeros_, trunc_normal_ +from .swin_transformer import Mlp + +__all__ = ['FocalNet'] + +MODEL_cfg = { + 'focalnet_T_224_1k_srf': dict( + embed_dim=96, + depths=[2, 2, 6, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.2, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', + ), + 'focalnet_S_224_1k_srf': dict( + embed_dim=96, + depths=[2, 2, 18, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.3, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', + ), + 'focalnet_B_224_1k_srf': dict( + embed_dim=128, + depths=[2, 2, 18, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', + ), + 'focalnet_T_224_1k_lrf': dict( + embed_dim=96, + depths=[2, 2, 6, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.2, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', + ), + 'focalnet_S_224_1k_lrf': dict( + embed_dim=96, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.3, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', + ), + 'focalnet_B_224_1k_lrf': dict( + embed_dim=128, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + 
use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', + ), + 'focalnet_L_384_22k_fl3': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[5, 5, 5, 5], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', + ), + 'focalnet_L_384_22k_fl4': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=True, # + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', + ), + 'focalnet_XL_384_22k_fl3': dict( + embed_dim=256, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[5, 5, 5, 5], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', + ), + 'focalnet_XL_384_22k_fl4': dict( + embed_dim=256, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', + ), + 'focalnet_H_224_22k_fl3': dict( + embed_dim=352, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=True, # + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', + ), + 'focalnet_H_224_22k_fl4': dict( + embed_dim=352, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=True, # + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams', + ), +} + + +class FocalModulation(nn.Layer): + """ + Args: + dim (int): Number of input channels. + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + focal_level (int): Number of focal levels + focal_window (int): Focal window size at focal level 1 + focal_factor (int): Step to increase the focal window. 
Default: 2 + use_postln_in_modulation (bool): Whether use post-modulation layernorm + normalize_modulator (bool): Whether use normalize in modulator + """ + + def __init__(self, + dim, + proj_drop=0., + focal_level=2, + focal_window=7, + focal_factor=2, + use_postln_in_modulation=False, + normalize_modulator=False): + super().__init__() + self.dim = dim + + # specific args for focalv3 + self.focal_level = focal_level + self.focal_window = focal_window + self.focal_factor = focal_factor + self.use_postln_in_modulation = use_postln_in_modulation + self.normalize_modulator = normalize_modulator + + self.f = nn.Linear( + dim, 2 * dim + (self.focal_level + 1), bias_attr=True) + self.h = nn.Conv2D( + dim, + dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=True) + + self.act = nn.GELU() + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.focal_layers = nn.LayerList() + + if self.use_postln_in_modulation: + self.ln = nn.LayerNorm(dim) + + for k in range(self.focal_level): + kernel_size = self.focal_factor * k + self.focal_window + self.focal_layers.append( + nn.Sequential( + nn.Conv2D( + dim, + dim, + kernel_size=kernel_size, + stride=1, + groups=dim, + padding=kernel_size // 2, + bias_attr=False), + nn.GELU())) + + def forward(self, x): + """ Forward function. + Args: + x: input features with shape of (B, H, W, C) + """ + _, _, _, C = x.shape + x = self.f(x) + x = x.transpose([0, 3, 1, 2]) + q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1) + + ctx_all = 0 + for l in range(self.focal_level): + ctx = self.focal_layers[l](ctx) + ctx_all = ctx_all + ctx * gates[:, l:l + 1] + ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) + ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:] + if self.normalize_modulator: + ctx_all = ctx_all / (self.focal_level + 1) + + x_out = q * self.h(ctx_all) + x_out = x_out.transpose([0, 2, 3, 1]) + if self.use_postln_in_modulation: + x_out = self.ln(x_out) + x_out = self.proj(x_out) + x_out = self.proj_drop(x_out) + return x_out + + +class FocalModulationBlock(nn.Layer): + """ Focal Modulation Block. + Args: + dim (int): Number of input channels. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + focal_level (int): number of focal levels + focal_window (int): focal kernel size at level 1 + use_postln (bool): Whether use layernorm after modulation. Default: False. + use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. + normalize_modulator (bool): Whether use normalize in modulator + use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False + layerscale_value (float): Value for layer scale. 
Default: 1e-4 + """ + + def __init__(self, + dim, + mlp_ratio=4., + drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + focal_level=2, + focal_window=9, + use_postln=False, + use_postln_in_modulation=False, + normalize_modulator=False, + use_layerscale=False, + layerscale_value=1e-4): + super().__init__() + self.dim = dim + self.mlp_ratio = mlp_ratio + self.focal_window = focal_window + self.focal_level = focal_level + self.use_postln = use_postln + self.use_layerscale = use_layerscale + + self.norm1 = norm_layer(dim) + self.modulation = FocalModulation( + dim, + proj_drop=drop, + focal_level=self.focal_level, + focal_window=self.focal_window, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.H = None + self.W = None + + self.gamma_1 = 1.0 + self.gamma_2 = 1.0 + if self.use_layerscale: + self.gamma_1 = add_parameter(self, + layerscale_value * paddle.ones([dim])) + self.gamma_2 = add_parameter(self, + layerscale_value * paddle.ones([dim])) + + def forward(self, x): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + if not self.use_postln: + x = self.norm1(x) + x = x.reshape([-1, H, W, C]) + + # FM + x = self.modulation(x).reshape([-1, H * W, C]) + if self.use_postln: + x = self.norm1(x) + + # FFN + x = shortcut + self.drop_path(self.gamma_1 * x) + + if self.use_postln: + x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) + else: + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class BasicLayer(nn.Layer): + """ A basic focal modulation layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + focal_level (int): Number of focal levels + focal_window (int): Focal window size at focal level 1 + use_conv_embed (bool): Whether use overlapped convolution for patch embedding + use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False + layerscale_value (float): Value of layerscale + use_postln (bool): Whether use layernorm after modulation. Default: False. + use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. + normalize_modulator (bool): Whether use normalize in modulator + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
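        A rough usage sketch, illustrative only: the batch size, 96-dim tokens and
        80x80 resolution below are made-up values, and PatchEmbed is the downsample
        layer that FocalNet wires in further down this file.

            layer = BasicLayer(dim=96, depth=2, downsample=PatchEmbed)
            x = paddle.randn([2, 80 * 80, 96])            # flattened tokens of an 80x80 map
            x_out, H, W, x_down, Wh, Ww = layer(x, 80, 80)
            # x_out: (2, 6400, 96), kept at the stage resolution for the output pyramid
            # x_down: (2, 1600, 192), patch-merged tokens for the next stage (Wh = Ww = 40)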
+ """ + + def __init__(self, + dim, + depth, + mlp_ratio=4., + drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + focal_level=2, + focal_window=9, + use_conv_embed=False, + use_layerscale=False, + layerscale_value=1e-4, + use_postln=False, + use_postln_in_modulation=False, + normalize_modulator=False, + use_checkpoint=False): + super().__init__() + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + FocalModulationBlock( + dim=dim, + mlp_ratio=mlp_ratio, + drop=drop, + drop_path=drop_path[i] + if isinstance(drop_path, np.ndarray) else drop_path, + act_layer=nn.GELU, + norm_layer=norm_layer, + focal_level=focal_level, + focal_window=focal_window, + use_postln=use_postln, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator, + use_layerscale=use_layerscale, + layerscale_value=layerscale_value) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + patch_size=2, + in_chans=dim, + embed_dim=2 * dim, + use_conv_embed=use_conv_embed, + norm_layer=norm_layer, + is_stem=False) + else: + self.downsample = None + + def forward(self, x, H, W): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + """ + for blk in self.blocks: + blk.H, blk.W = H, W + x = blk(x) + + if self.downsample is not None: + x_reshaped = x.transpose([0, 2, 1]).reshape( + [x.shape[0], x.shape[-1], H, W]) + x_down = self.downsample(x_reshaped) + x_down = x_down.flatten(2).transpose([0, 2, 1]) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False + is_stem (bool): Is the stem block or not. 
+ """ + + def __init__(self, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None, + use_conv_embed=False, + is_stem=False): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + if use_conv_embed: + # if we choose to use conv embedding, then we treat the stem and non-stem differently + if is_stem: + kernel_size = 7 + padding = 2 + stride = 4 + else: + kernel_size = 3 + padding = 1 + stride = 2 + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding) + else: + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + _, _, H, W = x.shape + + if W % self.patch_size[1] != 0: + # for 3D tensor: [pad_left, pad_right] + # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom] + x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) + W += W % self.patch_size[1] + if H % self.patch_size[0] != 0: + x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) + H += H % self.patch_size[0] + + x = self.proj(x) + if self.norm is not None: + _, _, Wh, Ww = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + + +@register +@serializable +class FocalNet(nn.Layer): + """ FocalNet backbone + Args: + arch (str): Architecture of FocalNet + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each FocalNet Transformer stage. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + drop_rate (float): Dropout rate. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + focal_levels (Sequence[int]): Number of focal levels at four stages + focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages + use_conv_embed (bool): Whether use overlapped convolution for patch embedding + use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False + layerscale_value (float): Value of layerscale + use_postln (bool): Whether use layernorm after modulation. Default: False. + use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. + normalize_modulator (bool): Whether use normalize in modulator + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__( + self, + arch='focalnet_T_224_1k_srf', + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + mlp_ratio=4., + drop_rate=0., + drop_path_rate=0.2, # 0.5 better for large+ models + norm_layer=nn.LayerNorm, + patch_norm=True, + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + use_conv_embed=False, + use_layerscale=False, + layerscale_value=1e-4, + use_postln=False, + use_postln_in_modulation=False, + normalize_modulator=False, + use_checkpoint=False, + pretrained=None): + super(FocalNet, self).__init__() + assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) + + embed_dim = MODEL_cfg[arch]['embed_dim'] + depths = MODEL_cfg[arch]['depths'] + drop_path_rate = MODEL_cfg[arch]['drop_path_rate'] + focal_levels = MODEL_cfg[arch]['focal_levels'] + focal_windows = MODEL_cfg[arch]['focal_windows'] + use_conv_embed = MODEL_cfg[arch]['use_conv_embed'] + use_layerscale = MODEL_cfg[arch]['use_layerscale'] + use_postln = MODEL_cfg[arch]['use_postln'] + use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation'] + normalize_modulator = MODEL_cfg[arch]['normalize_modulator'] + if pretrained is None: + pretrained = MODEL_cfg[arch]['pretrained'] + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.num_layers = len(depths) + self.patch_norm = patch_norm + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + use_conv_embed=use_conv_embed, + is_stem=True) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, sum(depths)) + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchEmbed + if (i_layer < self.num_layers - 1) else None, + focal_level=focal_levels[i_layer], + focal_window=focal_windows[i_layer], + use_conv_embed=use_conv_embed, + use_layerscale=use_layerscale, + layerscale_value=layerscale_value, + use_postln=use_postln, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f'norm{i_layer}' + self.add_sublayer(layer_name, layer) + + self.apply(self._init_weights) + self._freeze_stages() + if pretrained: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + self.set_state_dict(paddle.load(path)) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.stop_gradient = True + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.stop_gradient = True + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, 
nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + x = self.patch_embed(x['image']) + B, _, Wh, Ww = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x_out) + out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose( + (0, 3, 1, 2)) + outs.append(out) + + return outs + + @property + def out_shape(self): + out_strides = [4, 8, 16, 32] + return [ + ShapeSpec( + channels=self.num_features[i], stride=out_strides[i]) + for i in self.out_indices + ] diff --git a/ppdet/modeling/backbones/hgnet_v2.py b/ppdet/modeling/backbones/hgnet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..88f989a28533c0e25396ef59a2e2c517594ab660 --- /dev/null +++ b/ppdet/modeling/backbones/hgnet_v2.py @@ -0,0 +1,447 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +import copy + +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['PPHGNetV2'] + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
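# A minimal sketch of what LearnableAffineBlock (defined below) computes,
# illustrative only: it learns a single scalar scale and a single scalar bias
# and applies y = scale * x + bias, with both parameters trained at a reduced
# learning rate (lr_mult * lab_lr). At initialization scale = 1.0 and
# bias = 0.0, so the block starts out as an identity mapping, e.g.:
#
#     lab = LearnableAffineBlock()
#     y = lab(paddle.ones([1, 48, 80, 80]))   # equals the input until training moves the parameters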
+ + +class LearnableAffineBlock(nn.Layer): + def __init__(self, + scale_value=1.0, + bias_value=0.0, + lr_mult=1.0, + lab_lr=0.01): + super().__init__() + self.scale = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding + if isinstance(padding, str) else (kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + if self.use_act: + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.conv1 = ConvBNAct( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + self.conv2 = ConvBNAct( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=out_channels, + use_act=True, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.stem1 = ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2a = ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels // 2, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2b = ConvBNAct( + in_channels=mid_channels // 2, + out_channels=mid_channels, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem3 = ConvBNAct( + in_channels=mid_channels * 2, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem4 = ConvBNAct( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.pool = nn.MaxPool2D( + kernel_size=2, stride=1, ceil_mode=True, padding="SAME") + + def forward(self, x): + x = self.stem1(x) + x2 = self.stem2a(x) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = paddle.concat([x1, x2], 1) + x = self.stem3(x) + x = self.stem4(x) + + return x + + +class HG_Block(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size=3, + layer_num=6, + identity=False, + light_block=True, 
+ use_lab=False, + lr_mult=1.0): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)(in_channels=in_channels + if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult)) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HG_Stage(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HG_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +def _freeze_norm(m: nn.BatchNorm2D): + param_attr = ParamAttr( + learning_rate=0., regularizer=L2Decay(0.), trainable=False) + bias_attr = ParamAttr( + learning_rate=0., regularizer=L2Decay(0.), trainable=False) + global_stats = True + norm = nn.BatchNorm2D( + m._num_features, + weight_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats) + for param in norm.parameters(): + param.stop_gradient = True + return norm + + +def reset_bn(model: nn.Layer, reset_func=_freeze_norm): + if isinstance(model, nn.BatchNorm2D): + model = reset_func(model) + else: + for name, child in model.named_children(): + _child = reset_bn(child, reset_func) + if _child is not child: + setattr(model, name, _child) + return model + + +@register +@serializable +class PPHGNetV2(nn.Layer): + """ + PPHGNetV2 + Args: + stem_channels: list. Number of channels for the stem block. + stage_type: str. The stage configuration of PPHGNet. such as the number of channels, stride, etc. + use_lab: boolean. Whether to use LearnableAffineBlock in network. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific PPHGNetV2 model depends on args. 
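        A rough usage sketch, illustrative only (the 640x640 input is a made-up size):

            backbone = PPHGNetV2(arch='L')
            feats = backbone({'image': paddle.randn([1, 3, 640, 640])})
            # with the default return_idx=[1, 2, 3] this yields three maps of stride
            # 8/16/32: (1, 512, 80, 80), (1, 1024, 40, 40), (1, 2048, 20, 20)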
+ """ + + arch_configs = { + 'L': { + 'stem_channels': [3, 32, 48], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + } + }, + 'X': { + 'stem_channels': [3, 32, 64], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + } + } + } + + def __init__(self, + arch, + use_lab=False, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + return_idx=[1, 2, 3], + freeze_stem_only=True, + freeze_at=0, + freeze_norm=True): + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[arch]['stem_channels'] + stage_config = self.arch_configs[arch]['stage_config'] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0]) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HG_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + lr_mult=lr_mult_list[i + 1])) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + reset_bn(self, reset_func=_freeze_norm) + + self._init_weights() + + def _freeze_parameters(self, m): + for p in m.parameters(): + p.stop_gradient = True + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] + + def forward(self, inputs): + x = inputs['image'] + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs diff --git a/ppdet/modeling/backbones/mobilenet_v1.py b/ppdet/modeling/backbones/mobilenet_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..b268213a41ee9f4bec57e25960c5fe4501228d4e --- /dev/null +++ b/ppdet/modeling/backbones/mobilenet_v1.py @@ -0,0 +1,410 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec + +__all__ = ['MobileNet'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + num_groups=1, + act='relu', + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr( + learning_rate=conv_lr, + initializer=KaimingNormal(), + regularizer=L2Decay(conv_decay)), + bias_attr=False) + + param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) + bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) + if norm_type in ['sync_bn', 'bn']: + self._batch_norm = nn.BatchNorm2D( + out_channels, weight_attr=param_attr, bias_attr=bias_attr) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + if self.act == "relu": + x = F.relu(x) + elif self.act == "relu6": + x = F.relu6(x) + return x + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + in_channels, + out_channels1, + out_channels2, + num_groups, + stride, + scale, + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(DepthwiseSeparable, self).__init__() + + self._depthwise_conv = ConvBNLayer( + in_channels, + int(out_channels1 * scale), + kernel_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_dw") + + self._pointwise_conv = ConvBNLayer( + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, + stride=1, + padding=0, + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_sep") + + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x + + +class ExtraBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels1, + out_channels2, + num_groups=1, + stride=2, + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(ExtraBlock, self).__init__() + + self.pointwise_conv = ConvBNLayer( + in_channels, + int(out_channels1), + kernel_size=1, + stride=1, + padding=0, + num_groups=int(num_groups), + act='relu6', + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_extra1") + + self.normal_conv = ConvBNLayer( + int(out_channels1), + int(out_channels2), + kernel_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups), + act='relu6', + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + 
"_extra2") + + def forward(self, x): + x = self.pointwise_conv(x) + x = self.normal_conv(x) + return x + + +@register +@serializable +class MobileNet(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + norm_type='bn', + norm_decay=0., + conv_decay=0., + scale=1, + conv_learning_rate=1.0, + feature_maps=[4, 6, 13], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], + [64, 128]]): + super(MobileNet, self).__init__() + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + self.feature_maps = feature_maps + self.with_extra_blocks = with_extra_blocks + self.extra_block_filters = extra_block_filters + + self._out_channels = [] + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, + stride=2, + padding=1, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv1") + + self.dwsl = [] + dws21 = self.add_sublayer( + "conv2_1", + sublayer=DepthwiseSeparable( + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, + num_groups=32, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv2_1")) + self.dwsl.append(dws21) + self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps) + dws22 = self.add_sublayer( + "conv2_2", + sublayer=DepthwiseSeparable( + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, + num_groups=64, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv2_2")) + self.dwsl.append(dws22) + self._update_out_channels( + int(128 * scale), len(self.dwsl), feature_maps) + # 1/4 + dws31 = self.add_sublayer( + "conv3_1", + sublayer=DepthwiseSeparable( + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, + num_groups=128, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv3_1")) + self.dwsl.append(dws31) + self._update_out_channels( + int(128 * scale), len(self.dwsl), feature_maps) + dws32 = self.add_sublayer( + "conv3_2", + sublayer=DepthwiseSeparable( + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, + num_groups=128, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv3_2")) + self.dwsl.append(dws32) + self._update_out_channels( + int(256 * scale), len(self.dwsl), feature_maps) + # 1/8 + dws41 = self.add_sublayer( + "conv4_1", + sublayer=DepthwiseSeparable( + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, + num_groups=256, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv4_1")) + self.dwsl.append(dws41) + self._update_out_channels( + int(256 * scale), len(self.dwsl), feature_maps) + dws42 = self.add_sublayer( + "conv4_2", + sublayer=DepthwiseSeparable( + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, + num_groups=256, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv4_2")) + self.dwsl.append(dws42) + self._update_out_channels( + int(512 * scale), len(self.dwsl), feature_maps) + # 1/16 + for i in range(5): + tmp = self.add_sublayer( + "conv5_" + str(i + 
1), + sublayer=DepthwiseSeparable( + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=512, + num_groups=512, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv5_" + str(i + 1))) + self.dwsl.append(tmp) + self._update_out_channels( + int(512 * scale), len(self.dwsl), feature_maps) + dws56 = self.add_sublayer( + "conv5_6", + sublayer=DepthwiseSeparable( + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, + num_groups=512, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv5_6")) + self.dwsl.append(dws56) + self._update_out_channels( + int(1024 * scale), len(self.dwsl), feature_maps) + # 1/32 + dws6 = self.add_sublayer( + "conv6", + sublayer=DepthwiseSeparable( + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, + num_groups=1024, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv6")) + self.dwsl.append(dws6) + self._update_out_channels( + int(1024 * scale), len(self.dwsl), feature_maps) + + if self.with_extra_blocks: + self.extra_blocks = [] + for i, block_filter in enumerate(self.extra_block_filters): + in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] + conv_extra = self.add_sublayer( + "conv7_" + str(i + 1), + sublayer=ExtraBlock( + in_c, + block_filter[0], + block_filter[1], + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv7_" + str(i + 1))) + self.extra_blocks.append(conv_extra) + self._update_out_channels( + block_filter[1], + len(self.dwsl) + len(self.extra_blocks), feature_maps) + + def _update_out_channels(self, channel, feature_idx, feature_maps): + if feature_idx in feature_maps: + self._out_channels.append(channel) + + def forward(self, inputs): + outs = [] + y = self.conv1(inputs['image']) + for i, block in enumerate(self.dwsl): + y = block(y) + if i + 1 in self.feature_maps: + outs.append(y) + + if not self.with_extra_blocks: + return outs + + y = outs[-1] + for i, block in enumerate(self.extra_blocks): + idx = i + len(self.dwsl) + y = block(y) + if idx + 1 in self.feature_maps: + outs.append(y) + return outs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/mobilenet_v3.py b/ppdet/modeling/backbones/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd88567a1487437a067ec68497ee9f3b62b4d47 --- /dev/null +++ b/ppdet/modeling/backbones/mobilenet_v3.py @@ -0,0 +1,478 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
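# A quick numeric sketch of make_divisible (defined below), illustrative only:
# it snaps a (possibly width-scaled) channel count to a multiple of `divisor`
# and, if plain rounding would fall below 90% of the requested value, bumps the
# result up by one more step:
#
#     make_divisible(30)          # -> 32, nearest multiple of 8
#     make_divisible(64 * 0.35)   # -> 24
#     make_divisible(10)          # -> 16, since 8 would be < 0.9 * 10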
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec + +__all__ = ['MobileNetV3'] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + act=None, + lr_mult=1., + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=""): + super(ConvBNLayer, self).__init__() + self.act = act + self.conv = nn.Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr( + learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), + bias_attr=False) + + norm_lr = 0. if freeze_norm else lr_mult + param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + global_stats = True if freeze_norm else None + if norm_type in ['sync_bn', 'bn']: + self.bn = nn.BatchNorm2D( + out_c, + weight_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats) + norm_params = self.bn.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + if self.act == "relu": + x = F.relu(x) + elif self.act == "relu6": + x = F.relu6(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + else: + raise NotImplementedError( + "The activation function is selected incorrectly.") + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + act=None, + return_list=False, + name=''): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.use_se = use_se + self.return_list = return_list + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + act=act, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_expand") + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + act=act, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_depthwise") + if self.use_se: + self.mid_se = SEModule( + mid_c, lr_mult, conv_decay, name=name + "_se") + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + act=None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_linear") + + def forward(self, inputs): + y = self.expand_conv(inputs) + x = 
self.bottleneck_conv(y) + if self.use_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(inputs, x) + if self.return_list: + return [y, x] + else: + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + mid_channels = int(channel // reduction) + self.conv1 = nn.Conv2D( + in_channels=channel, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr( + learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), + bias_attr=ParamAttr( + learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) + self.conv2 = nn.Conv2D( + in_channels=mid_channels, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr( + learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), + bias_attr=ParamAttr( + learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return paddle.multiply(x=inputs, y=outputs) + + +class ExtraBlockDW(nn.Layer): + def __init__(self, + in_c, + ch_1, + ch_2, + stride, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None): + super(ExtraBlockDW, self).__init__() + self.pointwise_conv = ConvBNLayer( + in_c=in_c, + out_c=ch_1, + filter_size=1, + stride=1, + padding='SAME', + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra1") + self.depthwise_conv = ConvBNLayer( + in_c=ch_1, + out_c=ch_2, + filter_size=3, + stride=stride, + padding='SAME', + num_groups=int(ch_1), + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_dw") + self.normal_conv = ConvBNLayer( + in_c=ch_2, + out_c=ch_2, + filter_size=1, + stride=1, + padding='SAME', + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_sep") + + def forward(self, inputs): + x = self.pointwise_conv(inputs) + x = self.depthwise_conv(x) + x = self.normal_conv(x) + return x + + +@register +@serializable +class MobileNetV3(nn.Layer): + __shared__ = ['norm_type'] + + def __init__( + self, + scale=1.0, + model_name="large", + feature_maps=[6, 12, 15], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + conv_decay=0.0, + multiplier=1.0, + norm_type='bn', + norm_decay=0.0, + freeze_norm=False): + super(MobileNetV3, self).__init__() + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + if norm_type == 'sync_bn' and freeze_norm: + raise ValueError( + "The norm_type should not be sync_bn when freeze_norm is True") + self.feature_maps = feature_maps + self.with_extra_blocks = with_extra_blocks + self.extra_block_filters = extra_block_filters + + inplanes = 16 + if model_name == "large": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], # RCNN output + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], # YOLOv3 output 
+ [3, 240, 80, False, "hard_swish", 2], # RCNN output + [3, 200, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 480, 112, True, "hard_swish", 1], + [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output + [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output + [5, 960, 160, True, "hard_swish", 1], + [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output + ] + elif model_name == "small": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], # RCNN output + [3, 88, 24, False, "relu", 1], # YOLOv3 output + [5, 96, 40, True, "hard_swish", 2], # RCNN output + [5, 240, 40, True, "hard_swish", 1], + [5, 240, 40, True, "hard_swish", 1], + [5, 120, 48, True, "hard_swish", 1], + [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output + [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output + [5, 576, 96, True, "hard_swish", 1], + [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output + ] + else: + raise NotImplementedError( + "mode[{}_model] is not implemented!".format(model_name)) + + if multiplier != 1.0: + self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) + self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) + self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) + self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) + self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) + + self.conv1 = ConvBNLayer( + in_c=3, + out_c=make_divisible(inplanes * scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + act="hard_swish", + lr_mult=lr_mult_list[0], + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="conv1") + + self._out_channels = [] + self.block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in self.cfg: + lr_idx = min(i // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + # for SSD/SSDLite, first head input is after ResidualUnit expand_conv + return_list = self.with_extra_blocks and i + 2 in self.feature_maps + + block = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ResidualUnit( + in_c=inplanes, + mid_c=make_divisible(scale * exp), + out_c=make_divisible(scale * c), + filter_size=k, + stride=s, + use_se=se, + act=nl, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + return_list=return_list, + name="conv" + str(i + 2))) + self.block_list.append(block) + inplanes = make_divisible(scale * c) + i += 1 + self._update_out_channels( + make_divisible(scale * exp) + if return_list else inplanes, i + 1, feature_maps) + + if self.with_extra_blocks: + self.extra_block_list = [] + extra_out_c = make_divisible(scale * self.cfg[-1][1]) + lr_idx = min(i // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + conv_extra = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ConvBNLayer( + in_c=inplanes, + out_c=extra_out_c, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + act="hard_swish", + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="conv" + str(i + 2))) + self.extra_block_list.append(conv_extra) + i += 1 + self._update_out_channels(extra_out_c, i + 1, feature_maps) + + for j, block_filter in enumerate(self.extra_block_filters): + in_c = extra_out_c if j == 0 else self.extra_block_filters[j - + 1][1] + conv_extra = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ExtraBlockDW( + 
in_c, + block_filter[0], + block_filter[1], + stride=2, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name='conv' + str(i + 2))) + self.extra_block_list.append(conv_extra) + i += 1 + self._update_out_channels(block_filter[1], i + 1, feature_maps) + + def _update_out_channels(self, channel, feature_idx, feature_maps): + if feature_idx in feature_maps: + self._out_channels.append(channel) + + def forward(self, inputs): + x = self.conv1(inputs['image']) + outs = [] + for idx, block in enumerate(self.block_list): + x = block(x) + if idx + 2 in self.feature_maps: + if isinstance(x, list): + outs.append(x[0]) + x = x[1] + else: + outs.append(x) + + if not self.with_extra_blocks: + return outs + + for i, block in enumerate(self.extra_block_list): + idx = i + len(self.block_list) + x = block(x) + if idx + 2 in self.feature_maps: + outs.append(x) + return outs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/name_adapter.py b/ppdet/modeling/backbones/name_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..4afbb9b189e5091dc048194ca5f3a5cbaea061d3 --- /dev/null +++ b/ppdet/modeling/backbones/name_adapter.py @@ -0,0 +1,69 @@ +class NameAdapter(object): + """Fix the backbones variable names for pretrained weight""" + + def __init__(self, model): + super(NameAdapter, self).__init__() + self.model = model + + @property + def model_type(self): + return getattr(self.model, '_model_type', '') + + @property + def variant(self): + return getattr(self.model, 'variant', '') + + def fix_conv_norm_name(self, name): + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + # the naming rule is same as pretrained weight + if self.model_type == 'SEResNeXt': + bn_name = name + "_bn" + return bn_name + + def fix_shortcut_name(self, name): + if self.model_type == 'SEResNeXt': + name = 'conv' + name + '_prj' + return name + + def fix_bottleneck_name(self, name): + if self.model_type == 'SEResNeXt': + conv_name1 = 'conv' + name + '_x1' + conv_name2 = 'conv' + name + '_x2' + conv_name3 = 'conv' + name + '_x3' + shortcut_name = name + else: + conv_name1 = name + "_branch2a" + conv_name2 = name + "_branch2b" + conv_name3 = name + "_branch2c" + shortcut_name = name + "_branch1" + return conv_name1, conv_name2, conv_name3, shortcut_name + + def fix_basicblock_name(self, name): + if self.model_type == 'SEResNeXt': + conv_name1 = 'conv' + name + '_x1' + conv_name2 = 'conv' + name + '_x2' + shortcut_name = name + else: + conv_name1 = name + "_branch2a" + conv_name2 = name + "_branch2b" + shortcut_name = name + "_branch1" + return conv_name1, conv_name2, shortcut_name + + def fix_layer_warp_name(self, stage_num, count, i): + name = 'res' + str(stage_num) + if count > 10 and stage_num == 4: + if i == 0: + conv_name = name + "a" + else: + conv_name = name + "b" + str(i) + else: + conv_name = name + chr(ord("a") + i) + if self.model_type == 'SEResNeXt': + conv_name = str(stage_num + 2) + '_' + str(i + 1) + return conv_name + + def fix_c1_stage_name(self): + return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" diff --git a/ppdet/modeling/backbones/resnet.py b/ppdet/modeling/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a64f400d9ee964d476b67449874024db004e4ef5 --- /dev/null +++ b/ppdet/modeling/backbones/resnet.py @@ -0,0 +1,611 @@ +# Copyright (c) 2020 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from numbers import Integral + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.vision.ops import DeformConv2D +from .name_adapter import NameAdapter +from ..shape_spec import ShapeSpec + +__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] + +ResNet_cfg = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], +} + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + act=None, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + lr=1.0, + dcn_v2=False): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn'] + self.norm_type = norm_type + self.act = act + self.dcn_v2 = dcn_v2 + + if not self.dcn_v2: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr), + bias_attr=False) + else: + self.offset_channel = 2 * filter_size**2 + self.mask_channel = filter_size**2 + + self.conv_offset = nn.Conv2D( + in_channels=ch_in, + out_channels=3 * filter_size**2, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.)), + bias_attr=ParamAttr(initializer=Constant(0.))) + self.conv = DeformConv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=1, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr), + bias_attr=False) + + norm_lr = 0. 
if freeze_norm else lr + param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + + global_stats = True if freeze_norm else None + if norm_type in ['sync_bn', 'bn']: + self.norm = nn.BatchNorm2D( + ch_out, + weight_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats) + norm_params = self.norm.parameters() + + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + def forward(self, inputs): + if not self.dcn_v2: + out = self.conv(inputs) + else: + offset_mask = self.conv_offset(inputs) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + out = self.conv(inputs, offset, mask=mask) + + if self.norm_type in ['bn', 'sync_bn']: + out = self.norm(out) + if self.act: + out = getattr(F, self.act)(out) + return out + + +class SELayer(nn.Layer): + def __init__(self, ch, reduction_ratio=16): + super(SELayer, self).__init__() + self.pool = nn.AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(ch) + c_ = ch // reduction_ratio + self.squeeze = nn.Linear( + ch, + c_, + weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=True) + + stdv = 1.0 / math.sqrt(c_) + self.extract = nn.Linear( + c_, + ch, + weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=True) + + def forward(self, inputs): + out = self.pool(inputs) + out = paddle.squeeze(out, axis=[2, 3]) + out = self.squeeze(out) + out = F.relu(out) + out = self.extract(out) + out = F.sigmoid(out) + out = paddle.unsqueeze(out, axis=[2, 3]) + scale = out * inputs + return scale + + +class BasicBlock(nn.Layer): + + expansion = 1 + + def __init__(self, + ch_in, + ch_out, + stride, + shortcut, + variant='b', + groups=1, + base_width=64, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(BasicBlock, self).__init__() + assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' + + self.shortcut = shortcut + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential() + self.short.add_sublayer( + 'pool', + nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True)) + self.short.add_sublayer( + 'conv', + ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr)) + else: + self.short = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=stride, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2a = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=3, + stride=stride, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2b = ConvNormLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=3, + stride=1, + act=None, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr, + dcn_v2=dcn_v2) + + self.std_senet = std_senet + if self.std_senet: + self.se = SELayer(ch_out) + + def forward(self, inputs): + out = self.branch2a(inputs) + out = self.branch2b(out) + if self.std_senet: + out = self.se(out) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + out = paddle.add(x=out, y=short) + out = 
F.relu(out) + + return out + + +class BottleNeck(nn.Layer): + + expansion = 4 + + def __init__(self, + ch_in, + ch_out, + stride, + shortcut, + variant='b', + groups=1, + base_width=4, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(BottleNeck, self).__init__() + if variant == 'a': + stride1, stride2 = stride, 1 + else: + stride1, stride2 = 1, stride + + # ResNeXt + width = int(ch_out * (base_width / 64.)) * groups + + self.branch2a = ConvNormLayer( + ch_in=ch_in, + ch_out=width, + filter_size=1, + stride=stride1, + groups=1, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2b = ConvNormLayer( + ch_in=width, + ch_out=width, + filter_size=3, + stride=stride2, + groups=groups, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr, + dcn_v2=dcn_v2) + + self.branch2c = ConvNormLayer( + ch_in=width, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=1, + groups=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.shortcut = shortcut + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential() + self.short.add_sublayer( + 'pool', + nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True)) + self.short.add_sublayer( + 'conv', + ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr)) + else: + self.short = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=stride, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.std_senet = std_senet + if self.std_senet: + self.se = SELayer(ch_out * self.expansion) + + def forward(self, inputs): + + out = self.branch2a(inputs) + out = self.branch2b(out) + out = self.branch2c(out) + + if self.std_senet: + out = self.se(out) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + out = paddle.add(x=out, y=short) + out = F.relu(out) + + return out + + +class Blocks(nn.Layer): + def __init__(self, + block, + ch_in, + ch_out, + count, + name_adapter, + stage_num, + variant='b', + groups=1, + base_width=64, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(Blocks, self).__init__() + + self.blocks = [] + for i in range(count): + conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) + layer = self.add_sublayer( + conv_name, + block( + ch_in=ch_in, + ch_out=ch_out, + stride=2 if i == 0 and stage_num != 2 else 1, + shortcut=False if i == 0 else True, + variant=variant, + groups=groups, + base_width=base_width, + lr=lr, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + dcn_v2=dcn_v2, + std_senet=std_senet)) + self.blocks.append(layer) + if i == 0: + ch_in = ch_out * block.expansion + + def forward(self, inputs): + block_out = inputs + for block in self.blocks: + block_out = block(block_out) + return block_out + + +@register +@serializable +class ResNet(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + depth=50, + ch_in=64, + variant='b', + lr_mult_list=[1.0, 1.0, 1.0, 1.0], + groups=1, + base_width=64, + norm_type='bn', + norm_decay=0, + freeze_norm=True, + freeze_at=0, + return_idx=[0, 1, 2, 3], + dcn_v2_stages=[-1], + num_stages=4, + std_senet=False, + freeze_stem_only=False): + """ + 
Residual Network, see https://arxiv.org/abs/1512.03385 + + Args: + depth (int): ResNet depth, should be 18, 34, 50, 101, 152. + ch_in (int): output channel of first stage, default 64 + variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently + lr_mult_list (list): learning rate ratio of the four resnet stages (res2-res5); + a lower ratio is usually needed for pretrained models + obtained by distillation (default: [1.0, 1.0, 1.0, 1.0]). + groups (int): group convolution cardinality + base_width (int): base width of each group convolution + norm_type (str): normalization type, 'bn' or 'sync_bn' + norm_decay (float): weight decay for normalization layer weights + freeze_norm (bool): freeze normalization layers + freeze_at (int): freeze the backbone at which stage + return_idx (list): index of the stages whose feature maps are returned + dcn_v2_stages (list): indices of the stages that use deformable conv v2 + num_stages (int): total num of stages + std_senet (bool): whether to use SENet blocks, default False + """ + super(ResNet, self).__init__() + self._model_type = 'ResNet' if groups == 1 else 'ResNeXt' + assert num_stages >= 1 and num_stages <= 4 + self.depth = depth + self.variant = variant + self.groups = groups + self.base_width = base_width + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + self.freeze_at = freeze_at + if isinstance(return_idx, Integral): + return_idx = [return_idx] + assert max(return_idx) < num_stages, \ + 'the maximum return index must be smaller than num_stages, ' \ + 'but received maximum return index is {} and num_stages ' \ + 'is {}'.format(max(return_idx), num_stages) + self.return_idx = return_idx + self.num_stages = num_stages + assert len(lr_mult_list) == 4, \ + "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) + if isinstance(dcn_v2_stages, Integral): + dcn_v2_stages = [dcn_v2_stages] + assert max(dcn_v2_stages) < num_stages + self.dcn_v2_stages = dcn_v2_stages + + block_nums = ResNet_cfg[depth] + na = NameAdapter(self) + + conv1_name = na.fix_c1_stage_name() + if variant in ['c', 'd']: + conv_def = [ + [3, ch_in // 2, 3, 2, "conv1_1"], + [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"], + [ch_in // 2, ch_in, 3, 1, "conv1_3"], + ] + else: + conv_def = [[3, ch_in, 7, 2, conv1_name]] + self.conv1 = nn.Sequential() + for (c_in, c_out, k, s, _name) in conv_def: + self.conv1.add_sublayer( + _name, + ConvNormLayer( + ch_in=c_in, + ch_out=c_out, + filter_size=k, + stride=s, + groups=1, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=1.0)) + + self.ch_in = ch_in + ch_out_list = [64, 128, 256, 512] + block = BottleNeck if depth >= 50 else BasicBlock + + self._out_channels = [block.expansion * v for v in ch_out_list] + self._out_strides = [4, 8, 16, 32] + + self.res_layers = [] + for i in range(num_stages): + lr_mult = lr_mult_list[i] + stage_num = i + 2 + res_name = "res{}".format(stage_num) + res_layer = self.add_sublayer( + res_name, + Blocks( + block, + self.ch_in, + ch_out_list[i], + count=block_nums[i], + name_adapter=na, + stage_num=stage_num, + variant=variant, + groups=groups, + base_width=base_width, + lr=lr_mult, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + dcn_v2=(i in self.dcn_v2_stages), + std_senet=std_senet)) + self.res_layers.append(res_layer) + self.ch_in = self._out_channels[i] + + if
freeze_at >= 0: + self._freeze_parameters(self.conv1) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, num_stages)): + self._freeze_parameters(self.res_layers[i]) + + def _freeze_parameters(self, m): + for p in m.parameters(): + p.stop_gradient = True + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] + + def forward(self, inputs): + x = inputs['image'] + conv1 = self.conv1(x) + x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1) + outs = [] + for idx, stage in enumerate(self.res_layers): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs + + +@register +class Res5Head(nn.Layer): + def __init__(self, depth=50): + super(Res5Head, self).__init__() + feat_in, feat_out = [1024, 512] + if depth < 50: + feat_in = 256 + na = NameAdapter(self) + block = BottleNeck if depth >= 50 else BasicBlock + self.res5 = Blocks( + block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5) + self.feat_out = feat_out if depth < 50 else feat_out * 4 + + @property + def out_shape(self): + return [ShapeSpec( + channels=self.feat_out, + stride=16, )] + + def forward(self, roi_feat, stage=0): + y = self.res5(roi_feat) + return y diff --git a/ppdet/modeling/backbones/swin_transformer.py b/ppdet/modeling/backbones/swin_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..64aabab47811500e2534716d28c0233d82f1973c --- /dev/null +++ b/ppdet/modeling/backbones/swin_transformer.py @@ -0,0 +1,752 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
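# Editor's note (illustrative sketch, not part of the original patch): the
# window_partition / window_reverse helpers defined below in this file are exact
# inverses whenever H and W are multiples of window_size. With an arbitrary
# example shape (B=2, H=W=14, C=96) and window_size=7, the round trip would be:
#
#     x = paddle.rand([2, 14, 14, 96])              # (B, H, W, C)
#     windows = window_partition(x, window_size=7)  # (2 * 2 * 2, 7, 7, 96)
#     y = window_reverse(windows, 7, H=14, W=14)    # back to (2, 14, 14, 96)
#     assert bool(paddle.allclose(x, y))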
+""" +This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py +Ths copyright of microsoft/Swin-Transformer is as follows: +MIT License [see LICENSE for details] +""" +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.modeling.shape_spec import ShapeSpec +from ppdet.core.workspace import register, serializable +from .transformer_utils import DropPath, Identity +from .transformer_utils import add_parameter, to_2tuple +from .transformer_utils import ones_, zeros_, trunc_normal_ + +__all__ = ['SwinTransformer'] + +MODEL_cfg = { + # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config + 'swin_T_224': dict( + pretrain_img_size=224, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams', + ), + 'swin_S_224': dict( + pretrain_img_size=224, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams', + ), + 'swin_B_224': dict( + pretrain_img_size=224, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams', + ), + 'swin_L_224': dict( + pretrain_img_size=224, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams', + ), + 'swin_B_384': dict( + pretrain_img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams', + ), + 'swin_L_384': dict( + pretrain_img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams', + ), +} + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [-1, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + _, _, _, C = windows.shape + 
B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) + return x + + +class WindowAttention(nn.Layer): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = add_parameter( + self, + paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads))) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ Forward function. 
+ Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + + index = self.relative_position_index.flatten() + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([-1, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Layer): + """ Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """ Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. 
+ """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([-1, H, W, C]) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t], + data_format='NHWC') + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [x_windows.shape[0], self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [x_windows.shape[0], self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, + Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + + x = x.reshape([-1, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Layer): + r""" Patch Merging Layer. + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """ Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.reshape([-1, H, W, C]) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + # paddle F.pad default data_format is 'NCHW' + x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC') + H += H % 2 + W += W % 2 + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. 
Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, np.ndarray) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """ Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape( + [-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + + for blk in self.blocks: + blk.H, blk.W = H, W + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. 
Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + # TODO: support dynamic shape when exporting + B, C, H, W = x.shape + # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) + if W % self.patch_size[1] != 0: + x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) + if H % self.patch_size[0] != 0: + x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) + + x = self.proj(x) + if self.norm is not None: + _, _, Wh, Ww = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + + +@register +@serializable +class SwinTransformer(nn.Layer): + """ Swin Transformer backbone + Args: + arch (str): Swin Transformer architecture name, one of the keys of MODEL_cfg. Default: 'swin_T_224' + pretrain_img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.2 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding.
Default: True + """ + + def __init__(self, + arch='swin_T_224', + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + pretrained=None): + super(SwinTransformer, self).__init__() + assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) + + pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size'] + embed_dim = MODEL_cfg[arch]['embed_dim'] + depths = MODEL_cfg[arch]['depths'] + num_heads = MODEL_cfg[arch]['num_heads'] + window_size = MODEL_cfg[arch]['window_size'] + if pretrained is None: + pretrained = MODEL_cfg[arch]['pretrained'] + + self.num_layers = len(depths) + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1] + ] + + self.absolute_pos_embed = add_parameter( + self, + paddle.zeros((1, embed_dim, patches_resolution[0], + patches_resolution[1]))) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)) # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None) + self.layers.append(layer) + + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f'norm{i_layer}' + self.add_sublayer(layer_name, layer) + + self.apply(self._init_weights) + self._freeze_stages() + if pretrained: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + self.set_state_dict(paddle.load(path)) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.stop_gradient = True + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.stop_gradient = True + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.stop_gradient = True + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + 
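                # Editor's note: LayerNorm is reset to the identity transform here:
                # the bias was zeroed above and the weight is set to one below.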
ones_(m.weight) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x['image']) + B, _, Wh, Ww = x.shape + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x_out) + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( + (0, 3, 1, 2)) + outs.append(out) + + return outs + + @property + def out_shape(self): + out_strides = [4, 8, 16, 32] + return [ + ShapeSpec( + channels=self.num_features[i], stride=out_strides[i]) + for i in self.out_indices + ] diff --git a/ppdet/modeling/backbones/transformer_utils.py b/ppdet/modeling/backbones/transformer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a0783e1e99a7c6a89a0f9927ec86f8aad15969f9 --- /dev/null +++ b/ppdet/modeling/backbones/transformer_utils.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn.initializer import TruncatedNormal, Constant, Assign + +# Common initializations +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) +trunc_normal_ = TruncatedNormal(std=.02) + + +# Common Layers +def drop_path(x, drop_prob=0., training=False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +# common funcs + + +def to_2tuple(x): + if isinstance(x, (list, tuple)): + return x + return tuple([x] * 2) + + +def add_parameter(layer, datas, name=None): + parameter = layer.create_parameter( + shape=(datas.shape), default_initializer=Assign(datas)) + if name: + layer.add_parameter(name, parameter) + return parameter + + +def window_partition(x, window_size): + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition + """ + B, H, W, C = paddle.shape(x) + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + x = F.pad(x.transpose([0, 3, 1, 2]), + paddle.to_tensor( + [0, int(pad_w), 0, int(pad_h)], + dtype='int32')).transpose([0, 2, 3, 1]) + Hp, Wp = H + pad_h, W + pad_w + + num_h, num_w = Hp // window_size, Wp // window_size + + x = x.reshape([B, num_h, window_size, num_w, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows, (Hp, Wp), (num_h, num_w) + + +def window_unpartition(x, pad_hw, num_hw, hw): + """ + Window unpartition into original sequences and removing padding. + Args: + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + num_h, num_w = num_hw + H, W = hw + B, window_size, _, C = paddle.shape(x) + B = B // (num_h * num_w) + x = x.reshape([B, num_h, num_w, window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C]) + + return x[:, :H, :W, :] diff --git a/ppdet/modeling/backbones/vision_transformer.py b/ppdet/modeling/backbones/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a21eefc7aca0d2a5fe0bfa94eddf007612f5f464 --- /dev/null +++ b/ppdet/modeling/backbones/vision_transformer.py @@ -0,0 +1,652 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
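# Editor's note (illustrative sketch, not part of the original patch): this module reuses
# DropPath from transformer_utils above. In training mode drop_path keeps each sample's
# residual branch with probability 1 - drop_prob and rescales kept samples by
# 1 / (1 - drop_prob), so the output matches the input in expectation; at inference it is
# the identity. With an arbitrary drop_prob of 0.1:
#
#     dp = DropPath(drop_prob=0.1)
#     dp.train()
#     y = dp(paddle.ones([4, 3]))   # each row is either all zeros or all 1 / 0.9
#     dp.eval()
#     x = paddle.ones([4, 3])
#     assert bool(paddle.allclose(dp(x), x))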
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from paddle.nn.initializer import Constant + +from ppdet.modeling.shape_spec import ShapeSpec +from ppdet.core.workspace import register, serializable + +from .transformer_utils import zeros_, DropPath, Identity + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) + + if qkv_bias: + self.q_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + self.v_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + shape=(self.num_relative_distance, num_heads), + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) + coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) + relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( + ) + + #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh + relative_coords = relative_coords.transpose( + (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + # trunc_normal_(self.relative_position_bias_table, std=.0) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + x_shape = paddle.shape(x) + N, C = x_shape[1], x_shape[2] + + qkv_bias = None + if 
self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape((-1, N, 3, self.num_heads, + C // self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + window_size=None, + init_values=None, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + if init_values is not None: + self.gamma_1 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[224, 224], + patch_size=16, + in_chans=3, + embed_dim=768): + super().__init__() + self.num_patches_w = img_size[0] // patch_size + self.num_patches_h = img_size[1] // patch_size + + num_patches = self.num_patches_w * self.num_patches_h + self.patch_shape = (img_size[0] // patch_size, + img_size[1] // patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + @property + def num_patches_in_h(self): + return self.img_size[1] // self.patch_size + + @property + def num_patches_in_w(self): + return self.img_size[0] // self.patch_size + + def forward(self, x, mask=None): + B, C, H, W = x.shape + return self.proj(x) + + +class RelativePositionBias(nn.Layer): + def 
__init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = self.create_parameter(
+            shape=(self.num_relative_distance, num_heads),
+            default_initializer=zeros_)
+        # extra entries for cls-to-token, token-to-cls and cls-to-cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = coords.flatten(1)  # 2, Wh*Ww
+
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.transpose(
+            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    ''' Sinusoid position encoding table '''
+
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    if token:
+        sinusoid_table = np.concatenate(
+            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
+
+
+@register
+@serializable
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=[672, 1092],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 init_values=None,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 out_indices=[3, 5, 7, 11],
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=True,
+                 with_fpn=True,
+                 num_fpn_levels=4,
+                 use_checkpoint=False,
+                 **args):
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        self.use_sincos_pos_emb = use_sincos_pos_emb
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
+
+        if use_checkpoint:
+            paddle.seed(0)
+
+        self.patch_embed = 
PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + + self.pos_w = self.patch_embed.num_patches_in_w + self.pos_h = self.patch_embed.num_patches_in_h + + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + + if use_abs_pos_emb: + self.pos_embed = self.create_parameter( + shape=(1, self.pos_w * self.pos_h + 1, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal( + std=.02)) + elif use_sincos_pos_emb: + pos_embed = self.build_2d_sincos_position_embedding(embed_dim) + + self.pos_embed = pos_embed + self.pos_embed = self.create_parameter(shape=pos_embed.shape) + self.pos_embed.set_value(pos_embed.numpy()) + self.pos_embed.stop_gradient = True + + else: + self.pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None, + epsilon=epsilon) for i in range(depth) + ]) + + self.pretrained = pretrained + self.init_weight() + + assert len(out_indices) <= 4, '' + self.out_indices = out_indices + self.out_channels = [embed_dim for _ in range(num_fpn_levels)] + self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ + patch_size for _ in range(len(out_indices)) + ] + + self.norm = Identity() + + if self.with_fpn: + assert num_fpn_levels <= 4, '' + self.init_fpn( + embed_dim=embed_dim, + patch_size=patch_size, ) + + def init_weight(self): + pretrained = self.pretrained + + if pretrained: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + + load_state_dict = paddle.load(path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + + if pos_embed_name in load_state_dict.keys(): + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + + # self.set_state_dict(model_state_dict) + load_state_dict[pos_embed_name] = model_state_dict[ + pos_embed_name] + + print("Load pos_embed and resize it from {} to {} .".format( + load_pos_embed.shape, self.pos_embed.shape)) + + self.set_state_dict(load_state_dict) + print("Load load_state_dict....") + + def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.BatchNorm2D(embed_dim), + nn.GELU(), + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn3 = Identity() + + self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + 
embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = Identity() + + self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) + + self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) + + if not out_with_norm: + self.norm = Identity() + else: + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + + def interpolate_pos_encoding(self, x, w, h): + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + w0 = w // self.patch_embed.patch_size + h0 = h // self.patch_embed.patch_size + if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: + return self.pos_embed + class_pos_embed = self.pos_embed[:, 0] + patch_pos_embed = self.pos_embed[:, 1:] + dim = x.shape[-1] + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # w0, h0 = w0 + 0.1, h0 + 0.1 + # patch_pos_embed = nn.functional.interpolate( + # patch_pos_embed.reshape([ + # 1, self.patch_embed.num_patches_w, + # self.patch_embed.num_patches_h, dim + # ]).transpose((0, 3, 1, 2)), + # scale_factor=(w0 / self.patch_embed.num_patches_w, + # h0 / self.patch_embed.num_patches_h), + # mode='bicubic', ) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape([ + 1, self.patch_embed.num_patches_w, + self.patch_embed.num_patches_h, dim + ]).transpose((0, 3, 1, 2)), + (w0, h0), + mode='bicubic', ) + + assert int(w0) == patch_pos_embed.shape[-2] and int( + h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.transpose( + (0, 2, 3, 1)).reshape([1, -1, dim]) + return paddle.concat( + (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) + + def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. + Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def build_2d_sincos_position_embedding( + self, + embed_dim=768, + temperature=10000., ): + h, w = self.patch_embed.patch_shape + grid_w = paddle.arange(w, dtype=paddle.float32) + grid_h = paddle.arange(h, dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + + out_w = grid_w.flatten()[..., None] @omega[None] + out_h = grid_h.flatten()[..., None] @omega[None] + + pos_emb = paddle.concat( + [ + paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), + paddle.cos(out_h) + ], + axis=1)[None, :, :] + + pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) + pos_embed = paddle.concat([pe_token, pos_emb], axis=1) + # pos_embed.stop_gradient = True + + return pos_embed + + def forward(self, x): + x = x['image'] if isinstance(x, dict) else x + _, _, h, w = x.shape + + x = self.patch_embed(x) + + B, D, Hp, Wp = x.shape # b * c * h * w + + cls_tokens = self.cls_token.expand( + (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) + x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c + x = paddle.concat([cls_tokens, x], axis=1) + + if self.pos_embed is not None: + # x = x + self.interpolate_pos_encoding(x, w, h) + x = x + self.interpolate_pos_encoding(x, h, w) + + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + + feats = [] + for idx, blk in enumerate(self.blocks): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + blk, x, rel_pos_bias, **{"preserve_rng_state": True}) + else: + x = blk(x, rel_pos_bias) + + if idx in self.out_indices: + xp = paddle.reshape( + paddle.transpose( + self.norm(x[:, 1:, :]), perm=[0, 2, 1]), + shape=[B, D, Hp, Wp]) + feats.append(xp) + + if self.with_fpn: + fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ + -self.num_fpn_levels:] + assert len(fpns) == len(feats) or len(feats) == 1, '' + outputs = [] + for i, m in enumerate(fpns): + outputs.append( + m(feats[i] if len(feats) == len(fpns) else feats[-1])) + + return outputs + + return feats + + @property + def num_layers(self): + return len(self.blocks) + + @property + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self.out_channels, self.out_strides) + ] diff --git a/ppdet/modeling/backbones/vit_mae.py b/ppdet/modeling/backbones/vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..8d00da72b54ce2160fcf18f0cd6d128da05f1619 --- /dev/null +++ b/ppdet/modeling/backbones/vit_mae.py @@ -0,0 +1,749 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
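+#
+# Descriptive note (added): this module defines a plain-ViT backbone
+# (VisionTransformer2D) with windowed attention and decomposed relative
+# position embeddings, plus a SimpleFeaturePyramid neck, in the style of
+# ViTDet-like detectors built on MAE-pretrained ViT weights.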
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Constant, TruncatedNormal + +from ppdet.modeling.shape_spec import ShapeSpec +from ppdet.core.workspace import register, serializable + +from .transformer_utils import (zeros_, DropPath, Identity, window_partition, + window_unpartition) +from ..initializer import linear_init_ + +__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid'] + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer='nn.GELU', + drop=0., + lr_factor=1.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear( + in_features, + hidden_features, + weight_attr=ParamAttr(learning_rate=lr_factor), + bias_attr=ParamAttr(learning_rate=lr_factor)) + self.act = eval(act_layer)() + self.fc2 = nn.Linear( + hidden_features, + out_features, + weight_attr=ParamAttr(learning_rate=lr_factor), + bias_attr=ParamAttr(learning_rate=lr_factor)) + self.drop = nn.Dropout(drop) + + self._init_weights() + + def _init_weights(self): + linear_init_(self.fc1) + linear_init_(self.fc2) + + def forward(self, x): + x = self.drop(self.act(self.fc1(x))) + x = self.drop(self.fc2(x)) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_bias=False, + attn_drop=0., + proj_drop=0., + use_rel_pos=False, + rel_pos_zero_init=True, + window_size=None, + input_size=None, + qk_scale=None, + lr_factor=1.0): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + self.use_rel_pos = use_rel_pos + self.input_size = input_size + self.rel_pos_zero_init = rel_pos_zero_init + self.window_size = window_size + self.lr_factor = lr_factor + + self.qkv = nn.Linear( + dim, + dim * 3, + weight_attr=ParamAttr(learning_rate=lr_factor), + bias_attr=ParamAttr(learning_rate=lr_factor) + if attn_bias else False) + if qkv_bias: + self.q_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + self.v_bias = self.create_parameter( + shape=([dim]), default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + self.proj = nn.Linear( + dim, + dim, + weight_attr=ParamAttr(learning_rate=lr_factor), + bias_attr=ParamAttr(learning_rate=lr_factor)) + self.attn_drop = nn.Dropout(attn_drop) + if window_size is None: + self.window_size = self.input_size[0] + + self._init_weights() + + def _init_weights(self): + linear_init_(self.qkv) + linear_init_(self.proj) + + if self.use_rel_pos: + self.rel_pos_h = self.create_parameter( + [2 * self.window_size - 1, self.head_dim], + attr=ParamAttr(learning_rate=self.lr_factor), + default_initializer=Constant(value=0.)) + self.rel_pos_w = self.create_parameter( + [2 * self.window_size - 1, self.head_dim], + attr=ParamAttr(learning_rate=self.lr_factor), + default_initializer=Constant(value=0.)) + + if not self.rel_pos_zero_init: + TruncatedNormal(self.rel_pos_h, std=0.02) + TruncatedNormal(self.rel_pos_w, std=0.02) + + def get_rel_pos(self, seq_size, rel_pos): + max_rel_dist = int(2 * seq_size - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. 
+ rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1]) + rel_pos = rel_pos.transpose([0, 2, 1]) + rel_pos_resized = F.interpolate( + rel_pos, + size=(max_rel_dist, ), + mode="linear", + data_format='NCW') + rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]) + rel_pos_resized = rel_pos_resized.transpose([1, 0]) + else: + rel_pos_resized = rel_pos + + coords = paddle.arange(seq_size, dtype='float32') + relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0) + relative_coords += (seq_size - 1) + relative_coords = relative_coords.astype('int64').flatten() + + return paddle.index_select(rel_pos_resized, relative_coords).reshape( + [seq_size, seq_size, self.head_dim]) + + def add_decomposed_rel_pos(self, attn, q, h, w): + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + Returns: + attn (Tensor): attention map with added relative positional embeddings. + """ + Rh = self.get_rel_pos(h, self.rel_pos_h) + Rw = self.get_rel_pos(w, self.rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape([B, h, w, dim]) + # bhwc, hch->bhwh1 + # bwhc, wcw->bhw1w + rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1) + rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2) + + attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w + return attn.reshape([B, h * w, h * w]) + + def forward(self, x): + B, H, W, C = paddle.shape(x) + + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) + else: + qkv = self.qkv(x).reshape( + [B, H * W, 3, self.num_heads, self.head_dim]).transpose( + [2, 0, 3, 1, 4]).reshape( + [3, B * self.num_heads, H * W, self.head_dim]) + + q, k, v = qkv[0], qkv[1], qkv[2] + attn = q.matmul(k.transpose([0, 2, 1])) * self.scale + + if self.use_rel_pos: + attn = self.add_decomposed_rel_pos(attn, q, H, W) + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + x = attn.matmul(v).reshape( + [B, self.num_heads, H * W, self.head_dim]).transpose( + [0, 2, 1, 3]).reshape([B, H, W, C]) + x = self.proj(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + attn_bias=False, + qk_scale=None, + init_values=None, + drop=0., + attn_drop=0., + drop_path=0., + use_rel_pos=True, + rel_pos_zero_init=True, + window_size=None, + input_size=None, + act_layer='nn.GELU', + norm_layer='nn.LayerNorm', + lr_factor=1.0, + epsilon=1e-5): + super().__init__() + self.window_size = window_size + + self.norm1 = eval(norm_layer)(dim, + weight_attr=ParamAttr( + learning_rate=lr_factor, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + learning_rate=lr_factor, + regularizer=L2Decay(0.0)), + epsilon=epsilon) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_bias=attn_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size, + input_size=input_size, + lr_factor=lr_factor) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = eval(norm_layer)(dim, + weight_attr=ParamAttr( + learning_rate=lr_factor, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + learning_rate=lr_factor, + regularizer=L2Decay(0.0)), + epsilon=epsilon) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=drop, + lr_factor=lr_factor) + if init_values is not None: + self.gamma_1 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + shape=([dim]), default_initializer=Constant(value=init_values)) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x): + y = self.norm1(x) + if self.window_size is not None: + y, pad_hw, num_hw = window_partition(y, self.window_size) + y = self.attn(y) + if self.gamma_1 is not None: + y = self.gamma_1 * y + + if self.window_size is not None: + y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2])) + x = x + self.drop_path(y) + if self.gamma_2 is None: + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=(224, 224), + patch_size=16, + in_chans=3, + embed_dim=768, + lr_factor=0.01): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + weight_attr=ParamAttr(learning_rate=lr_factor), + bias_attr=ParamAttr(learning_rate=lr_factor)) + + @property + def num_patches_in_h(self): + return self.img_size[1] // self.patch_size + + @property + def num_patches_in_w(self): + return self.img_size[0] // self.patch_size + + def forward(self, x): + out = self.proj(x) + return out + + +@register +@serializable +class VisionTransformer2D(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=(1024, 1024), + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + attn_bias=False, + qk_scale=None, + init_values=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_layer='nn.GELU', + norm_layer='nn.LayerNorm', + lr_decay_rate=1.0, + global_attn_indexes=(2, 5, 8, 11), + use_abs_pos=False, + use_rel_pos=False, + use_abs_pos_emb=False, + use_sincos_pos_emb=False, + rel_pos_zero_init=True, + epsilon=1e-5, + final_norm=False, + pretrained=None, + window_size=None, + out_indices=(11, ), + with_fpn=False, + use_checkpoint=False, + *args, + **kwargs): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.embed_dim = embed_dim + self.num_heads = num_heads + self.depth = depth + self.global_attn_indexes = global_attn_indexes + self.epsilon = epsilon + self.with_fpn = with_fpn + self.use_checkpoint = use_checkpoint + + self.patch_h = img_size[0] // patch_size + self.patch_w = img_size[1] // patch_size + self.num_patches = self.patch_h * self.patch_w + self.use_abs_pos = use_abs_pos + self.use_abs_pos_emb = use_abs_pos_emb + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + + dpr = np.linspace(0, drop_path_rate, depth) + if use_checkpoint: + paddle.seed(0) + + if use_abs_pos_emb: + self.pos_w = self.patch_embed.num_patches_in_w + self.pos_h = self.patch_embed.num_patches_in_h + self.pos_embed = self.create_parameter( + shape=(1, 
self.pos_w * self.pos_h + 1, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal( + std=.02)) + elif use_sincos_pos_emb: + pos_embed = self.get_2d_sincos_position_embedding(self.patch_h, + self.patch_w) + + self.pos_embed = pos_embed + self.pos_embed = self.create_parameter(shape=pos_embed.shape) + self.pos_embed.set_value(pos_embed.numpy()) + self.pos_embed.stop_gradient = True + else: + self.pos_embed = None + + self.blocks = nn.LayerList([ + Block( + embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + attn_bias=attn_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=None + if i in self.global_attn_indexes else window_size, + input_size=[self.patch_h, self.patch_w], + act_layer=act_layer, + lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), + norm_layer=norm_layer, + init_values=init_values, + epsilon=epsilon) for i in range(depth) + ]) + + assert len(out_indices) <= 4, 'out_indices out of bound' + self.out_indices = out_indices + self.pretrained = pretrained + self.init_weight() + + self.out_channels = [embed_dim for _ in range(len(out_indices))] + self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ + patch_size for _ in range(len(out_indices)) + ] + self.norm = Identity() + if self.with_fpn: + self.init_fpn( + embed_dim=embed_dim, + patch_size=patch_size, + out_with_norm=final_norm) + + def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): + return lr_decay_rate**(self.depth - layer_id) + + def init_weight(self): + pretrained = self.pretrained + if pretrained: + if 'http' in pretrained: + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: + path = pretrained + + load_state_dict = paddle.load(path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + + if pos_embed_name in load_state_dict.keys( + ) and self.use_abs_pos_emb: + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + + # self.set_state_dict(model_state_dict) + load_state_dict[pos_embed_name] = model_state_dict[ + pos_embed_name] + + print("Load pos_embed and resize it from {} to {} .".format( + load_pos_embed.shape, self.pos_embed.shape)) + + self.set_state_dict(load_state_dict) + print("Load load_state_dict....") + + def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.BatchNorm2D(embed_dim), + nn.GELU(), + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn3 = Identity() + + self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = Identity() + + self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) + + self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) + + if not out_with_norm: + self.norm = Identity() + else: + self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) + + 
def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. + Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): + grid_y, grid_x = paddle.meshgrid( + paddle.arange( + h, dtype=paddle.float32), + paddle.arange( + w, dtype=paddle.float32)) + assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = self.embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = (1. / (temperature**omega)).unsqueeze(0) + + out_x = grid_x.reshape([-1, 1]).matmul(omega) + out_y = grid_y.reshape([-1, 1]).matmul(omega) + + pos_emb = paddle.concat( + [ + paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x), + paddle.cos(out_x) + ], + axis=1) + + return pos_emb.reshape([1, h, w, self.embed_dim]) + + def forward(self, inputs): + x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1]) + B, Hp, Wp, _ = paddle.shape(x) + + if self.use_abs_pos: + x = x + self.get_2d_sincos_position_embedding(Hp, Wp) + + if self.use_abs_pos_emb: + x = x + self.resize_pos_embed(self.pos_embed, + (self.pos_h, self.pos_w), (Hp, Wp)) + + feats = [] + for idx, blk in enumerate(self.blocks): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + blk, x, **{"preserve_rng_state": True}) + else: + x = blk(x) + if idx in self.out_indices: + feats.append(self.norm(x.transpose([0, 3, 1, 2]))) + + if self.with_fpn: + fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(feats)): + feats[i] = fpns[i](feats[i]) + return feats + + @property + def num_layers(self): + return len(self.blocks) + + @property + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self.out_channels, self.out_strides) + ] + + +class LayerNorm(nn.Layer): + """ + A LayerNorm variant, popularized by Transformers, that performs point-wise mean and + variance normalization over the channel dimension for inputs that have shape + (batch_size, channels, height, width). + Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid. 
+ + In ViT, we use the nn.LayerNorm + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = self.create_parameter([normalized_shape]) + self.bias = self.create_parameter([normalized_shape]) + self.eps = eps + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / paddle.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +@register +@serializable +class SimpleFeaturePyramid(nn.Layer): + def __init__(self, + in_channels, + out_channels, + spatial_scales, + num_levels=4, + use_bias=False): + """ + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channel (int): output channel of each level. + spatial_scales (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features which can be derived from + the output shape of backbone by from_config + num_levels (int): number of levels of output features. + use_bias (bool): whether use bias or not. + """ + super(SimpleFeaturePyramid, self).__init__() + + self.in_channels = in_channels[0] + self.out_channels = out_channels + self.num_levels = num_levels + + self.stages = [] + dim = self.in_channels + if num_levels == 4: + scale_factors = [2.0, 1.0, 0.5] + elif num_levels == 5: + scale_factors = [4.0, 2.0, 1.0, 0.5] + else: + raise NotImplementedError( + f"num_levels={num_levels} is not supported yet.") + + dim = in_channels[0] + for idx, scale in enumerate(scale_factors): + out_dim = dim + if scale == 4.0: + layers = [ + nn.Conv2DTranspose( + dim, dim // 2, kernel_size=2, stride=2), + nn.LayerNorm(dim // 2), + nn.GELU(), + nn.Conv2DTranspose( + dim // 2, dim // 4, kernel_size=2, stride=2), + ] + out_dim = dim // 4 + elif scale == 2.0: + layers = [ + nn.Conv2DTranspose( + dim, dim // 2, kernel_size=2, stride=2) + ] + out_dim = dim // 2 + elif scale == 1.0: + layers = [] + elif scale == 0.5: + layers = [nn.MaxPool2D(kernel_size=2, stride=2)] + + layers.extend([ + nn.Conv2D( + out_dim, + out_channels, + kernel_size=1, + bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias_attr=use_bias, ), LayerNorm(out_channels) + ]) + layers = nn.Sequential(*layers) + + stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) + self.add_sublayer(f"simfp_{stage}", layers) + self.stages.append(layers) + + # top block output feature maps. + self.top_block = nn.Sequential( + nn.MaxPool2D( + kernel_size=1, stride=2, padding=0)) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + @property + def out_shape(self): + return [ + ShapeSpec(channels=self.out_channels) + for _ in range(self.num_levels) + ] + + def forward(self, feats): + """ + Args: + x: Tensor of shape (N,C,H,W). 
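+        Returns:
+            list[Tensor]: num_levels feature maps, ordered from the
+                highest-resolution level down to the extra top-block output.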
+
+        """
+        features = feats[0]
+        results = []
+
+        for stage in self.stages:
+            results.append(stage(features))
+
+        top_block_in_feature = results[-1]
+        results.append(self.top_block(top_block_in_feature))
+        assert self.num_levels == len(results)
+
+        return results
diff --git a/ppdet/modeling/backbones/yolov6_efficientrep.py b/ppdet/modeling/backbones/yolov6_efficientrep.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32ff21f202ca385d959eb0838926c272a20e933
--- /dev/null
+++ b/ppdet/modeling/backbones/yolov6_efficientrep.py
@@ -0,0 +1,1131 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/meituan/YOLOv6
+"""
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant
+from ppdet.modeling.initializer import conv_init_, normal_
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['EfficientRep', 'CSPBepBackbone', 'Lite_EffiBackbone']
+
+activation_table = {
+    'relu': nn.ReLU(),
+    'silu': nn.Silu(),
+    'hardswish': nn.Hardswish()
+}
+
+
+class SiLU(nn.Layer):
+    @staticmethod
+    def forward(x):
+        return x * F.sigmoid(x)
+
+
+class BaseConv(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride,
+                 groups=1,
+                 bias=False,
+                 act="silu"):
+        super(BaseConv, self).__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=(ksize - 1) // 2,
+            groups=groups,
+            bias_attr=bias)
+        self.bn = nn.BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        if act is not None:
+            self.act = activation_table.get(act)
+        else:
+            self.act = nn.Identity()
+        self._init_weights()
+
+    def _init_weights(self):
+        conv_init_(self.conv)
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        if self.training:
+            y = self.act(x)
+        else:
+            if isinstance(self.act, nn.Silu):
+                self.act = SiLU()
+            y = self.act(x)
+        return y
+
+
+def autopad(k, p=None):  # kernel, padding
+    # Pad to 'same'
+    if p is None:
+        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
+    return p
+
+
+class BaseConv_C3(nn.Layer):
+    '''Standard convolution in BepC3-Block'''
+
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
+        super(BaseConv_C3, self).__init__()
+        self.conv = nn.Conv2D(
+            c1, c2, k, s, autopad(k, p), groups=g, bias_attr=False)
+        self.bn = nn.BatchNorm2D(
+            c2,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        if act == True:
+            self.act = nn.ReLU()
+        else:
+            if isinstance(act, nn.Layer):
+                self.act = act
+            else:
+                self.act = nn.Identity()
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        if self.training:
+            y = self.act(x)
+        else:
+            if isinstance(self.act, nn.Silu):
+                self.act = SiLU()
+ y = self.act(x) + return y + + +class RepConv(nn.Layer): + """ + RepVGG Conv BN Relu Block, see https://arxiv.org/abs/2101.03697 + named RepVGGBlock in YOLOv6 + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + act='relu', + deploy=False): + super(RepConv, self).__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + assert kernel_size == 3 + assert padding == 1 + padding_11 = padding - kernel_size // 2 + self.stride = stride # not always 1 + + self.nonlinearity = nn.ReLU() # always relu in YOLOv6 + + if self.deploy: + self.rbr_reparam = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=True) + else: + self.rbr_identity = (nn.BatchNorm2D(in_channels) + if out_channels == in_channels and stride == 1 + else None) + self.rbr_dense = nn.Sequential(* [ + nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride, # + padding, + groups=groups, + bias_attr=False), + nn.BatchNorm2D(out_channels), + ]) + self.rbr_1x1 = nn.Sequential(* [ + nn.Conv2D( + in_channels, + out_channels, + 1, + stride, + padding_11, # + groups=groups, + bias_attr=False), + nn.BatchNorm2D(out_channels), + ]) + + def forward(self, inputs): + if hasattr(self, "rbr_reparam"): + x = self.rbr_reparam(inputs) + y = self.nonlinearity(x) + return y + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + x = self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out + y = self.nonlinearity(x) + return y + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch[0].weight + running_mean = branch[1]._mean + running_var = branch[1]._variance + gamma = branch[1].weight + beta = branch[1].bias + eps = branch[1]._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + if not hasattr(self, "id_tensor"): + input_dim = self.in_channels // self.groups + kernel_value = paddle.zeros([self.in_channels, input_dim, 3, 3]) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def convert_to_deploy(self): + if hasattr(self, "rbr_reparam"): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2D( + self.rbr_dense[0]._in_channels, + self.rbr_dense[0]._out_channels, + self.rbr_dense[0]._kernel_size, + self.rbr_dense[0]._stride, + padding=self.rbr_dense[0]._padding, + groups=self.rbr_dense[0]._groups, + bias_attr=True) + self.rbr_reparam.weight.set_value(kernel) + self.rbr_reparam.bias.set_value(bias) + self.__delattr__("rbr_dense") + self.__delattr__("rbr_1x1") + if 
hasattr(self, "rbr_identity"): + self.__delattr__("rbr_identity") + if hasattr(self, "id_tensor"): + self.__delattr__("id_tensor") + self.deploy = True + + +class RepLayer(nn.Layer): + """ + RepLayer with RepConvs, like CSPLayer(C3) in YOLOv5/YOLOX + named RepBlock in YOLOv6 + """ + + def __init__(self, in_channels, out_channels, num_repeats=1, block=RepConv): + super(RepLayer, self).__init__() + # in n/s + self.conv1 = block(in_channels, out_channels) + self.block = (nn.Sequential(*(block(out_channels, out_channels) + for _ in range(num_repeats - 1))) + if num_repeats > 1 else None) + + def forward(self, x): + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class BottleRep(nn.Layer): + def __init__(self, + in_channels, + out_channels, + basic_block=RepConv, + alpha=True): + super(BottleRep, self).__init__() + # basic_block: RepConv or ConvBNSiLUBlock + self.conv1 = basic_block(in_channels, out_channels) + self.conv2 = basic_block(out_channels, out_channels) + if in_channels != out_channels: + self.shortcut = False + else: + self.shortcut = True + if alpha: + self.alpha = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.)), + dtype="float32") + else: + self.alpha = 1.0 + + def forward(self, x): + outputs = self.conv1(x) + outputs = self.conv2(outputs) + return outputs + self.alpha * x if self.shortcut else outputs + + +class RepLayer_BottleRep(nn.Layer): + """ + RepLayer with RepConvs for M/L, like CSPLayer(C3) in YOLOv5/YOLOX + named RepBlock in YOLOv6 + """ + + def __init__(self, + in_channels, + out_channels, + num_repeats=1, + basic_block=RepConv): + super(RepLayer_BottleRep, self).__init__() + # in m/l + self.conv1 = BottleRep( + in_channels, out_channels, basic_block=basic_block, alpha=True) + num_repeats = num_repeats // 2 + self.block = nn.Sequential(*(BottleRep( + out_channels, out_channels, basic_block=basic_block, alpha=True + ) for _ in range(num_repeats - 1))) if num_repeats > 1 else None + + def forward(self, x): + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class BepC3Layer(nn.Layer): + # Beer-mug RepC3 Block, named BepC3 in YOLOv6 + def __init__(self, + in_channels, + out_channels, + num_repeats=1, + csp_e=0.5, + block=RepConv, + act='relu'): + super(BepC3Layer, self).__init__() + c_ = int(out_channels * csp_e) # hidden channels + self.cv1 = BaseConv_C3(in_channels, c_, 1, 1) + self.cv2 = BaseConv_C3(in_channels, c_, 1, 1) + self.cv3 = BaseConv_C3(2 * c_, out_channels, 1, 1) + if block == ConvBNSiLUBlock and act == 'silu': + self.cv1 = BaseConv_C3(in_channels, c_, 1, 1, act=nn.Silu()) + self.cv2 = BaseConv_C3(in_channels, c_, 1, 1, act=nn.Silu()) + self.cv3 = BaseConv_C3(2 * c_, out_channels, 1, 1, act=nn.Silu()) + + self.m = RepLayer_BottleRep(c_, c_, num_repeats, basic_block=block) + + def forward(self, x): + return self.cv3(paddle.concat((self.m(self.cv1(x)), self.cv2(x)), 1)) + + +class SimConv(nn.Layer): + """Simplified Conv BN ReLU""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + bias=False): + super(SimConv, self).__init__() + padding = kernel_size // 2 + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=bias) + self.bn = nn.BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = nn.ReLU() + self._init_weights() + + def 
_init_weights(self):
+        conv_init_(self.conv)
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+
+class SimSPPF(nn.Layer):
+    """Simplified SPPF with SimConv, use relu"""
+
+    def __init__(self, in_channels, out_channels, kernel_size=5):
+        super(SimSPPF, self).__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = SimConv(in_channels, hidden_channels, 1, 1)
+        self.mp = nn.MaxPool2D(
+            kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
+        self.conv2 = SimConv(hidden_channels * 4, out_channels, 1, 1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        y1 = self.mp(x)
+        y2 = self.mp(y1)
+        y3 = self.mp(y2)
+        concats = paddle.concat([x, y1, y2, y3], 1)
+        return self.conv2(concats)
+
+
+class SPPF(nn.Layer):
+    """SPPF with BaseConv, use silu"""
+
+    def __init__(self, in_channels, out_channels, kernel_size=5, act='silu'):
+        super(SPPF, self).__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            hidden_channels * 4, out_channels, ksize=1, stride=1, act=act)
+        self.mp = nn.MaxPool2D(
+            kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        y1 = self.mp(x)
+        y2 = self.mp(y1)
+        y3 = self.mp(y2)
+        concats = paddle.concat([x, y1, y2, y3], 1)
+        return self.conv2(concats)
+
+
+class SimCSPSPPF(nn.Layer):
+    """Simplified CSP SPPF with SimConv, use relu, YOLOv6 v3.0 added"""
+
+    def __init__(self, in_channels, out_channels, kernel_size=5, e=0.5):
+        super(SimCSPSPPF, self).__init__()
+        c_ = int(out_channels * e)  # hidden channels
+        self.cv1 = SimConv(in_channels, c_, 1, 1)
+        self.cv2 = SimConv(in_channels, c_, 1, 1)
+        self.cv3 = SimConv(c_, c_, 3, 1)
+        self.cv4 = SimConv(c_, c_, 1, 1)
+
+        self.mp = nn.MaxPool2D(
+            kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
+        self.cv5 = SimConv(4 * c_, c_, 1, 1)
+        self.cv6 = SimConv(c_, c_, 3, 1)
+        self.cv7 = SimConv(2 * c_, out_channels, 1, 1)
+
+    def forward(self, x):
+        x1 = self.cv4(self.cv3(self.cv1(x)))
+        y0 = self.cv2(x)
+        y1 = self.mp(x1)
+        y2 = self.mp(y1)
+        y3 = self.cv6(self.cv5(paddle.concat([x1, y1, y2, self.mp(y2)], 1)))
+        return self.cv7(paddle.concat([y0, y3], 1))
+
+
+class CSPSPPF(nn.Layer):
+    """CSP SPPF with BaseConv, use silu, YOLOv6 v3.0 added"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=5,
+                 e=0.5,
+                 act='silu'):
+        super(CSPSPPF, self).__init__()
+        c_ = int(out_channels * e)  # hidden channels
+        self.cv1 = BaseConv(in_channels, c_, 1, 1)
+        self.cv2 = BaseConv(in_channels, c_, 1, 1)
+        self.cv3 = BaseConv(c_, c_, 3, 1)
+        self.cv4 = BaseConv(c_, c_, 1, 1)
+
+        self.mp = nn.MaxPool2D(
+            kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
+        self.cv5 = BaseConv(4 * c_, c_, 1, 1)
+        self.cv6 = BaseConv(c_, c_, 3, 1)
+        self.cv7 = BaseConv(2 * c_, out_channels, 1, 1)
+
+    def forward(self, x):
+        x1 = self.cv4(self.cv3(self.cv1(x)))
+        y0 = self.cv2(x)
+        y1 = self.mp(x1)
+        y2 = self.mp(y1)
+        y3 = self.cv6(self.cv5(paddle.concat([x1, y1, y2, self.mp(y2)], 1)))
+        return self.cv7(paddle.concat([y0, y3], 1))
+
+
+class Transpose(nn.Layer):
+    '''Normal Transpose, default for upsampling'''
+
+    def __init__(self, in_channels, out_channels, kernel_size=2, stride=2):
+        super().__init__()
+        self.upsample_transpose = nn.Conv2DTranspose(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            bias_attr=True)
+
+    def forward(self, x):
+        return self.upsample_transpose(x)
+
+
+def make_divisible(x, divisor):
+    return 
math.ceil(x / divisor) * divisor + + +@register +@serializable +class EfficientRep(nn.Layer): + """EfficientRep backbone of YOLOv6 n/s """ + __shared__ = ['width_mult', 'depth_mult', 'act', 'training_mode'] + + # num_repeats, channels_list, 'P6' means add P6 layer + arch_settings = { + 'P5': [[1, 6, 12, 18, 6], [64, 128, 256, 512, 1024]], + 'P6': [[1, 6, 12, 18, 6, 6], [64, 128, 256, 512, 768, 1024]], + } + + def __init__( + self, + arch='P5', + width_mult=0.33, + depth_mult=0.50, + return_idx=[2, 3, 4], + training_mode='repvgg', + fuse_P2=True, # add P2 and return 4 layers + cspsppf=True, + act='relu'): + super(EfficientRep, self).__init__() + num_repeats, channels_list = self.arch_settings[arch] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + channels_list = [ + make_divisible(i * width_mult, 8) for i in (channels_list) + ] + self.return_idx = return_idx + self.fuse_P2 = fuse_P2 + if self.fuse_P2: + # stem,p2,p3,p4,p5: [0,1,2,3,4] + self.return_idx = [1] + self.return_idx + self._out_channels = [channels_list[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + block = get_block(training_mode) + # default block is RepConv + self.stem = block(3, channels_list[0], 3, 2) + self.blocks = [] + for i, (out_ch, + num_repeat) in enumerate(zip(channels_list, num_repeats)): + if i == 0: continue + in_ch = channels_list[i - 1] + stage = [] + + repconv = self.add_sublayer('stage{}.repconv'.format(i + 1), + block(in_ch, out_ch, 3, 2)) + stage.append(repconv) + + replayer = self.add_sublayer( + 'stage{}.replayer'.format(i + 1), + RepLayer( + out_ch, out_ch, num_repeat, block=block)) + stage.append(replayer) + + if i == len(channels_list) - 1: + if cspsppf: + simsppf_layer = self.add_sublayer( + 'stage{}.simcspsppf'.format(i + 1), + SimCSPSPPF( + out_ch, out_ch, kernel_size=5)) + stage.append(simsppf_layer) + else: + simsppf_layer = self.add_sublayer( + 'stage{}.simsppf'.format(i + 1), + SimSPPF( + out_ch, out_ch, kernel_size=5)) + stage.append(simsppf_layer) + self.blocks.append(nn.Sequential(*stage)) + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] + + +@register +@serializable +class CSPBepBackbone(nn.Layer): + """CSPBepBackbone of YOLOv6 m/l in v3.0""" + __shared__ = ['width_mult', 'depth_mult', 'act', 'training_mode'] + + # num_repeats, channels_list, 'P6' means add P6 layer + arch_settings = { + 'P5': [[1, 6, 12, 18, 6], [64, 128, 256, 512, 1024]], + 'P6': [[1, 6, 12, 18, 6, 6], [64, 128, 256, 512, 768, 1024]], + } + + def __init__(self, + arch='P5', + width_mult=1.0, + depth_mult=1.0, + return_idx=[2, 3, 4], + csp_e=0.5, + training_mode='repvgg', + fuse_P2=True, + cspsppf=False, + act='relu'): + super(CSPBepBackbone, self).__init__() + num_repeats, channels_list = self.arch_settings[arch] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + channels_list = [ + make_divisible(i * width_mult, 8) for i in (channels_list) + ] + self.return_idx = return_idx + self.fuse_P2 = fuse_P2 + if self.fuse_P2: + # stem,p2,p3,p4,p5: [0,1,2,3,4] + self.return_idx = [1] + self.return_idx + self._out_channels = [channels_list[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 
16, 32, 64][i] for i in self.return_idx] + + block = get_block(training_mode) + # RepConv(or RepVGGBlock) in M, but ConvBNSiLUBlock(or ConvWrapper) in L + + self.stem = block(3, channels_list[0], 3, 2) + self.blocks = [] + if csp_e == 0.67: + csp_e = float(2) / 3 + for i, (out_ch, + num_repeat) in enumerate(zip(channels_list, num_repeats)): + if i == 0: continue + in_ch = channels_list[i - 1] + stage = [] + + repconv = self.add_sublayer('stage{}.repconv'.format(i + 1), + block(in_ch, out_ch, 3, 2)) + stage.append(repconv) + + bepc3layer = self.add_sublayer( + 'stage{}.bepc3layer'.format(i + 1), + BepC3Layer( + out_ch, + out_ch, + num_repeat, + csp_e=csp_e, + block=block, + act=act)) + stage.append(bepc3layer) + + if i == len(channels_list) - 1: + if cspsppf: + # m/l never use cspsppf=True + if training_mode == 'conv_silu': + sppf_layer = self.add_sublayer( + 'stage{}.cspsppf'.format(i + 1), + CSPSPPF( + out_ch, out_ch, kernel_size=5, act='silu')) + stage.append(sppf_layer) + else: + simsppf_layer = self.add_sublayer( + 'stage{}.simcspsppf'.format(i + 1), + SimCSPSPPF( + out_ch, out_ch, kernel_size=5)) + stage.append(simsppf_layer) + else: + if training_mode == 'conv_silu': + sppf_layer = self.add_sublayer( + 'stage{}.sppf'.format(i + 1), + SPPF( + out_ch, out_ch, kernel_size=5, act='silu')) + stage.append(sppf_layer) + else: + simsppf_layer = self.add_sublayer( + 'stage{}.simsppf'.format(i + 1), + SimSPPF( + out_ch, out_ch, kernel_size=5)) + stage.append(simsppf_layer) + self.blocks.append(nn.Sequential(*stage)) + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] + + +def get_block(mode): + if mode == 'repvgg': + return RepConv + elif mode == 'conv_silu': + return ConvBNSiLUBlock + elif mode == 'conv_relu': + return ConvBNReLUBlock + else: + raise ValueError('Unsupported mode :{}'.format(mode)) + + +class ConvBNSiLUBlock(nn.Layer): + # ConvWrapper + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + groups=1, + bias=True): + super().__init__() + self.base_block = BaseConv(in_channels, out_channels, kernel_size, + stride, groups, bias) + + def forward(self, x): + return self.base_block(x) + + +class ConvBNReLUBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + groups=1, + bias=True): + super().__init__() + self.base_block = SimConv(in_channels, out_channels, kernel_size, + stride, groups, bias) + + def forward(self, x): + return self.base_block(x) + + +######################### YOLOv6 lite ######################### + + +class ConvBN(nn.Layer): + '''Conv and BN without activation''' + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + groups=1, + bias=False): + super().__init__() + self.base_block = BaseConv( + in_channels, + out_channels, + kernel_size, + stride, + groups, + bias, + act=None) + + def forward(self, x): + return self.base_block(x) + + +class ConvBNHS(nn.Layer): + '''Conv and BN with Hardswish activation''' + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + bias=False): + super().__init__() + self.base_block = BaseConv( + in_channels, + out_channels, + kernel_size, + stride, + groups, + bias, + act='hardswish') + 
+ def forward(self, x): + return self.base_block(x) + + +class SEBlock(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + out = identity * x + return out + + +class DPBlock(nn.Layer): + def __init__(self, in_channel=96, out_channel=96, kernel_size=3, stride=1): + super().__init__() + self.conv_dw_1 = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + groups=out_channel, + padding=(kernel_size - 1) // 2, + stride=stride) + self.bn_1 = nn.BatchNorm2D(out_channel) + self.act_1 = nn.Hardswish() + self.conv_pw_1 = nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + groups=1, + padding=0) + self.bn_2 = nn.BatchNorm2D(out_channel) + self.act_2 = nn.Hardswish() + + def forward(self, x): + x = self.act_1(self.bn_1(self.conv_dw_1(x))) + x = self.act_2(self.bn_2(self.conv_pw_1(x))) + return x + + def forward_fuse(self, x): + x = self.act_1(self.conv_dw_1(x)) + x = self.act_2(self.conv_pw_1(x)) + return x + + +class DarknetBlock(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=3, expansion=0.5): + super().__init__() + hidden_channels = int(out_channels * expansion) + self.conv_1 = ConvBNHS( + in_channels=in_channels, + out_channels=hidden_channels, + kernel_size=1, + stride=1, + padding=0) + self.conv_2 = DPBlock( + in_channel=hidden_channels, + out_channel=out_channels, + kernel_size=kernel_size, + stride=1) + + def forward(self, x): + out = self.conv_1(x) + out = self.conv_2(out) + return out + + +class CSPBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + expand_ratio=0.5): + super().__init__() + mid_channels = int(out_channels * expand_ratio) + self.conv_1 = ConvBNHS(in_channels, mid_channels, 1, 1, 0) + self.conv_2 = ConvBNHS(in_channels, mid_channels, 1, 1, 0) + self.conv_3 = ConvBNHS(2 * mid_channels, out_channels, 1, 1, 0) + self.blocks = DarknetBlock(mid_channels, mid_channels, kernel_size, 1.0) + + def forward(self, x): + x_1 = self.conv_1(x) + x_1 = self.blocks(x_1) + x_2 = self.conv_2(x) + x = paddle.concat((x_1, x_2), axis=1) + x = self.conv_3(x) + return x + + +def channel_shuffle(x, groups): + _, num_channels, height, width = x.shape + channels_per_group = num_channels // groups + # reshape + x = x.reshape([-1, groups, channels_per_group, height, width]) + x = x.transpose([0, 2, 1, 3, 4]) + # flatten + x = x.reshape([-1, groups * channels_per_group, height, width]) + return x + + +class Lite_EffiBlockS1(nn.Layer): + def __init__(self, in_channels, mid_channels, out_channels, stride): + super().__init__() + self.conv_pw_1 = ConvBNHS( + in_channels=in_channels // 2, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1) + self.conv_dw_1 = ConvBN( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=stride, + groups=mid_channels) + self.se = SEBlock(mid_channels) + self.conv_1 = ConvBNHS( + in_channels=mid_channels, + out_channels=out_channels // 2, + kernel_size=1, + 
stride=1, + padding=0, + groups=1) + + def forward(self, inputs): + x1, x2 = paddle.split( + inputs, + num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], + axis=1) + x2 = self.conv_pw_1(x2) + x3 = self.conv_dw_1(x2) + x3 = self.se(x3) + x3 = self.conv_1(x3) + out = paddle.concat([x1, x3], axis=1) + return channel_shuffle(out, 2) + + +class Lite_EffiBlockS2(nn.Layer): + def __init__(self, in_channels, mid_channels, out_channels, stride): + super().__init__() + # branch1 + self.conv_dw_1 = ConvBN( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels) + self.conv_1 = ConvBNHS( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1) + # branch2 + self.conv_pw_2 = ConvBNHS( + in_channels=in_channels, + out_channels=mid_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1) + self.conv_dw_2 = ConvBN( + in_channels=mid_channels // 2, + out_channels=mid_channels // 2, + kernel_size=3, + stride=stride, + groups=mid_channels // 2) + self.se = SEBlock(mid_channels // 2) + self.conv_2 = ConvBNHS( + in_channels=mid_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1) + self.conv_dw_3 = ConvBNHS( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=out_channels) + self.conv_pw_3 = ConvBNHS( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1) + + def forward(self, inputs): + x1 = self.conv_dw_1(inputs) + x1 = self.conv_1(x1) + x2 = self.conv_pw_2(inputs) + x2 = self.conv_dw_2(x2) + x2 = self.se(x2) + x2 = self.conv_2(x2) + out = paddle.concat([x1, x2], axis=1) + out = self.conv_dw_3(out) + out = self.conv_pw_3(out) + return out + + +def make_divisible_lite(v, divisor=16): + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@register +@serializable +class Lite_EffiBackbone(nn.Layer): + """Lite_EffiBackbone of YOLOv6-lite""" + __shared__ = ['width_mult'] + + def __init__(self, + width_mult=1.0, + return_idx=[2, 3, 4], + out_channels=[24, 32, 64, 128, 256], + num_repeat=[1, 3, 7, 3], + scale_size=0.5): + super().__init__() + self.return_idx = return_idx + out_channels = [ + make_divisible_lite(i * width_mult) for i in out_channels + ] + mid_channels = [ + make_divisible_lite( + int(i * scale_size), divisor=8) for i in out_channels + ] + + out_channels[0] = 24 + self.conv_0 = ConvBNHS( + in_channels=3, + out_channels=out_channels[0], + kernel_size=3, + stride=2, + padding=1) + + self.lite_effiblock_1 = self.build_block( + num_repeat[0], out_channels[0], mid_channels[1], out_channels[1]) + + self.lite_effiblock_2 = self.build_block( + num_repeat[1], out_channels[1], mid_channels[2], out_channels[2]) + + self.lite_effiblock_3 = self.build_block( + num_repeat[2], out_channels[2], mid_channels[3], out_channels[3]) + + self.lite_effiblock_4 = self.build_block( + num_repeat[3], out_channels[3], mid_channels[4], out_channels[4]) + + self._out_channels = [out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.conv_0(x) + x = self.lite_effiblock_1(x) + x = self.lite_effiblock_2(x) + outputs.append(x) + x = self.lite_effiblock_3(x) + outputs.append(x) + x = self.lite_effiblock_4(x) + outputs.append(x) + return 
outputs + + @staticmethod + def build_block(num_repeat, in_channels, mid_channels, out_channels): + block_list = nn.Sequential() + for i in range(num_repeat): + if i == 0: + block = Lite_EffiBlockS2( + in_channels=in_channels, + mid_channels=mid_channels, + out_channels=out_channels, + stride=2) + else: + block = Lite_EffiBlockS1( + in_channels=out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + stride=1) + block_list.add_sublayer(str(i), block) + return block_list + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/ppdet/modeling/backbones/yolov7_elannet.py b/ppdet/modeling/backbones/yolov7_elannet.py new file mode 100644 index 0000000000000000000000000000000000000000..4c076367ccc485450f91823cc95dd8dc955a5b58 --- /dev/null +++ b/ppdet/modeling/backbones/yolov7_elannet.py @@ -0,0 +1,605 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from ppdet.core.workspace import register, serializable +from ppdet.modeling.initializer import normal_ +from paddle.nn.initializer import Constant +from .csp_darknet import BaseConv, DWConv, get_activation, SiLU, Focus +from ..shape_spec import ShapeSpec + +__all__ = [ + 'ELANNet', 'ELANLayer', 'ELAN2Layer', 'MPConvLayer', 'MP', 'DownC', + 'SPPCSPC', 'SPPELAN', 'ImplicitA', 'ImplicitM', 'RepConv' +] + + +class ELANLayer(nn.Layer): + """ELAN layer used in YOLOv7, like CSPLayer(C3) in YOLOv5/YOLOX""" + + def __init__(self, + in_channels, + mid_channels1, + mid_channels2, + out_channels, + num_blocks=4, + concat_list=[-1, -3, -5, -6], + depthwise=False, + bias=False, + act="silu"): + super(ELANLayer, self).__init__() + self.num_blocks = num_blocks + self.concat_list = concat_list + + self.conv1 = BaseConv( + in_channels, mid_channels1, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + in_channels, mid_channels1, ksize=1, stride=1, bias=bias, act=act) + + self.bottlenecks = nn.Sequential(* [ + BaseConv( + mid_channels1 if i == 0 else mid_channels2, + mid_channels2, + ksize=3, + stride=1, + bias=bias, + act=act) for i in range(num_blocks) + ]) + + concat_chs = mid_channels1 * 2 + mid_channels2 * (len(concat_list) - 2) + self.conv3 = BaseConv( + int(concat_chs), + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + + def forward(self, x): + outs = [] + x_1 = self.conv1(x) + x_2 = self.conv2(x) + outs.append(x_1) + outs.append(x_2) + idx = [i + self.num_blocks for i in self.concat_list[:-2]] + for i in range(self.num_blocks): + x_2 = self.bottlenecks[i](x_2) + if i in idx: + outs.append(x_2) + outs = outs[::-1] # [-1, -3] + x_all = paddle.concat(outs, axis=1) + y = self.conv3(x_all) + return y + + +class ELAN2Layer(nn.Layer): + """ELAN2 layer used in YOLOv7-E6E""" + + def __init__(self, + in_channels, + mid_channels1, + mid_channels2, + out_channels, + num_blocks=4, + concat_list=[-1, 
-3, -5, -6], + depthwise=False, + bias=False, + act="silu"): + super(ELAN2Layer, self).__init__() + self.elan_layer1 = ELANLayer(in_channels, mid_channels1, mid_channels2, + out_channels, num_blocks, concat_list, + depthwise, bias, act) + self.elan_layer2 = ELANLayer(in_channels, mid_channels1, mid_channels2, + out_channels, num_blocks, concat_list, + depthwise, bias, act) + + def forward(self, x): + return self.elan_layer1(x) + self.elan_layer2(x) + + +class MPConvLayer(nn.Layer): + """MPConvLayer used in YOLOv7""" + + def __init__(self, + in_channels, + out_channels, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(MPConvLayer, self).__init__() + mid_channels = int(out_channels * expansion) + self.maxpool = nn.MaxPool2D(kernel_size=2, stride=2) + self.conv1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, bias=bias, act=act) + + self.conv2 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv3 = BaseConv( + mid_channels, mid_channels, ksize=3, stride=2, bias=bias, act=act) + + def forward(self, x): + x_1 = self.conv1(self.maxpool(x)) + x_2 = self.conv3(self.conv2(x)) + x = paddle.concat([x_2, x_1], axis=1) + return x + + +class MP(nn.Layer): + def __init__(self, kernel_size=2, stride=2): + super(MP, self).__init__() + self.mp = nn.MaxPool2D(kernel_size=kernel_size, stride=stride) + + def forward(self, x): + return self.mp(x) + + +class DownC(nn.Layer): + def __init__(self, c1, c2, k=2, act='silu'): + super(DownC, self).__init__() + c_ = int(c1) # hidden channels + self.mp = nn.MaxPool2D(kernel_size=k, stride=k) + self.cv1 = BaseConv(c1, c_, 1, 1, act=act) + self.cv2 = BaseConv(c_, c2 // 2, 3, k, act=act) + self.cv3 = BaseConv(c1, c2 // 2, 1, 1, act=act) + + def forward(self, x): + x_2 = self.cv2(self.cv1(x)) + x_3 = self.cv3(self.mp(x)) + return paddle.concat([x_2, x_3], 1) + + +class SPPCSPC(nn.Layer): + def __init__(self, c1, c2, g=1, e=0.5, k=(5, 9, 13), act='silu'): + super(SPPCSPC, self).__init__() + c_ = int(2 * c2 * e) # hidden channels + self.cv1 = BaseConv(c1, c_, 1, 1, act=act) + self.cv2 = BaseConv(c1, c_, 1, 1, act=act) + self.cv3 = BaseConv(c_, c_, 3, 1, act=act) + self.cv4 = BaseConv(c_, c_, 1, 1, act=act) + self.maxpoolings = nn.LayerList( + [nn.MaxPool2D( + kernel_size=x, stride=1, padding=x // 2) for x in k]) + self.cv5 = BaseConv(4 * c_, c_, 1, 1, act=act) + self.cv6 = BaseConv(c_, c_, 3, 1, act=act) + self.cv7 = BaseConv(2 * c_, c2, 1, 1, act=act) + + def forward(self, x): + x1 = self.cv4(self.cv3(self.cv1(x))) + y1 = self.cv6( + self.cv5( + paddle.concat([x1] + [mp(x1) for mp in self.maxpoolings], 1))) + y2 = self.cv2(x) + return self.cv7(paddle.concat([y1, y2], axis=1)) + + +class SPPELAN(nn.Layer): + def __init__(self, c1, c2, g=1, e=0.5, k=(5, 9, 13), act='silu'): + super(SPPELAN, self).__init__() + c_ = int(2 * c2 * e) # hidden channels + self.cv1 = BaseConv(c1, c_, 1, 1, act=act) + self.cv2 = BaseConv(c1, c_, 1, 1, act=act) + self.maxpoolings = nn.LayerList( + [nn.MaxPool2D( + kernel_size=x, stride=1, padding=x // 2) for x in k]) + self.cv3 = BaseConv(4 * c_, c_, 1, 1, act=act) + self.cv4 = BaseConv(2 * c_, c2, 1, 1, act=act) + + def forward(self, x): + x_1 = self.cv1(x) + x_2 = self.cv2(x) + x_cats = [x_2] + [mp(x_2) for mp in self.maxpoolings] + y_cats = self.cv3(paddle.concat(x_cats[::-1], 1)) + y = paddle.concat([y_cats, x_1], 1) + return self.cv4(y) + + +class ImplicitA(nn.Layer): + def __init__(self, channel, mean=0., std=.02): + super(ImplicitA, self).__init__() + self.ia = 
self.create_parameter( + shape=([1, channel, 1, 1]), + attr=ParamAttr(initializer=Constant(0.))) + normal_(self.ia, mean=mean, std=std) + + def forward(self, x): + return self.ia + x + + +class ImplicitM(nn.Layer): + def __init__(self, channel, mean=1., std=.02): + super(ImplicitM, self).__init__() + self.im = self.create_parameter( + shape=([1, channel, 1, 1]), + attr=ParamAttr(initializer=Constant(1.))) + normal_(self.im, mean=mean, std=std) + + def forward(self, x): + return self.im * x + + +class RepConv(nn.Layer): + # RepVGG, see https://arxiv.org/abs/2101.03697 + def __init__(self, c1, c2, k=3, s=1, p=1, g=1, act='silu', deploy=False): + super(RepConv, self).__init__() + self.deploy = deploy + self.groups = g + self.in_channels = c1 + self.out_channels = c2 + assert k == 3 + assert p == 1 + padding_11 = p - k // 2 + + self.act = get_activation(act) + + if deploy: + self.rbr_reparam = nn.Conv2D( + c1, c2, k, s, p, groups=g, bias_attr=True) + else: + self.rbr_identity = (nn.BatchNorm2D(c1) + if c2 == c1 and s == 1 else None) + self.rbr_dense = nn.Sequential(* [ + nn.Conv2D( + c1, c2, k, s, p, groups=g, bias_attr=False), + nn.BatchNorm2D(c2), + ]) + self.rbr_1x1 = nn.Sequential(* [ + nn.Conv2D( + c1, c2, 1, s, padding_11, groups=g, bias_attr=False), + nn.BatchNorm2D(c2), + ]) + + def forward(self, inputs): + if hasattr(self, "rbr_reparam"): + x = self.rbr_reparam(inputs) + if self.training: + y = self.act(x) + else: + if isinstance(self.act, nn.Silu): + self.act = SiLU() + y = self.act(x) + return y + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + x = self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out + if self.training: + y = self.act(x) + else: + if isinstance(self.act, nn.Silu): + self.act = SiLU() + y = self.act(x) + return y + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return ( + kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, + bias3x3 + bias1x1 + biasid, ) + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch[0].weight + running_mean = branch[1]._mean + running_var = branch[1]._variance + gamma = branch[1].weight + beta = branch[1].bias + eps = branch[1]._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + if not hasattr(self, "id_tensor"): + input_dim = self.in_channels // self.groups + kernel_value = paddle.zeros([self.in_channels, input_dim, 3, 3]) + + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def convert_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + if not hasattr(self, 'rbr_reparam'): + self.rbr_reparam = nn.Conv2D( + self.in_channels, + self.out_channels, + 3, + 1, + 1, + groups=self.groups, + bias_attr=True) + self.rbr_reparam.weight.set_value(kernel) + 
self.rbr_reparam.bias.set_value(bias) + self.__delattr__("rbr_dense") + self.__delattr__("rbr_1x1") + if hasattr(self, "rbr_identity"): + self.__delattr__("rbr_identity") + if hasattr(self, "id_tensor"): + self.__delattr__("id_tensor") + self.deploy = True + + +@register +@serializable +class ELANNet(nn.Layer): + """ + ELANNet, YOLOv7's backbone. + Args: + arch (str): Architecture of ELANNet, from {tiny, L, X, W6, E6, D6, E6E}, default as 'L', + depth_mult (float): Depth multiplier, multiply number of channels in + each layer, default as 1.0. + width_mult (float): Width multiplier, multiply number of blocks in + CSPLayer, default as 1.0. + depthwise (bool): Whether to use depth-wise conv layer. + act (str): Activation function type, default as 'silu'. + trt (bool): Whether use trt infer. + return_idx (list): Index of stages whose feature maps are returned. + """ + __shared__ = ['arch', 'depth_mult', 'width_mult', 'act', 'trt'] + + # in_channels, out_channels of 1 stem + 4 stages + ch_settings = { + 'tiny': [[32, 64], [64, 64], [64, 128], [128, 256], [256, 512]], + 'L': [[32, 64], [64, 256], [256, 512], [512, 1024], [1024, 1024]], + 'X': [[40, 80], [80, 320], [320, 640], [640, 1280], [1280, 1280]], + 'W6': + [[64, 64], [64, 128], [128, 256], [256, 512], [512, 768], [768, 1024]], + 'E6': + [[80, 80], [80, 160], [160, 320], [320, 640], [640, 960], [960, 1280]], + 'D6': [[96, 96], [96, 192], [192, 384], [384, 768], [768, 1152], + [1152, 1536]], + 'E6E': + [[80, 80], [80, 160], [160, 320], [320, 640], [640, 960], [960, 1280]], + } + # mid_ch1, mid_ch2 of 4 stages' ELANLayer + mid_ch_settings = { + 'tiny': [[32, 32], [64, 64], [128, 128], [256, 256]], + 'L': [[64, 64], [128, 128], [256, 256], [256, 256]], + 'X': [[64, 64], [128, 128], [256, 256], [256, 256]], + 'W6': [[64, 64], [128, 128], [256, 256], [384, 384], [512, 512]], + 'E6': [[64, 64], [128, 128], [256, 256], [384, 384], [512, 512]], + 'D6': [[64, 64], [128, 128], [256, 256], [384, 384], [512, 512]], + 'E6E': [[64, 64], [128, 128], [256, 256], [384, 384], [512, 512]], + } + # concat_list of 4 stages + concat_list_settings = { + 'tiny': [-1, -2, -3, -4], + 'L': [-1, -3, -5, -6], + 'X': [-1, -3, -5, -7, -8], + 'W6': [-1, -3, -5, -6], + 'E6': [-1, -3, -5, -7, -8], + 'D6': [-1, -3, -5, -7, -9, -10], + 'E6E': [-1, -3, -5, -7, -8], + } + num_blocks = { + 'tiny': 2, + 'L': 4, + 'X': 6, + 'W6': 4, + 'E6': 6, + 'D6': 8, + 'E6E': 6 + } + + def __init__(self, + arch='L', + depth_mult=1.0, + width_mult=1.0, + depthwise=False, + act='silu', + trt=False, + return_idx=[2, 3, 4]): + super(ELANNet, self).__init__() + self.arch = arch + self.return_idx = return_idx + Conv = DWConv if depthwise else BaseConv + + ch_settings = self.ch_settings[arch] + mid_ch_settings = self.mid_ch_settings[arch] + concat_list_settings = self.concat_list_settings[arch] + num_blocks = self.num_blocks[arch] + + layers_num = 0 + ch_1 = ch_settings[0][0] + ch_2 = ch_settings[0][0] * 2 + ch_out = ch_settings[0][-1] + if self.arch in ['L', 'X']: + self.stem = nn.Sequential(* [ + Conv( + 3, ch_1, 3, 1, bias=False, act=act), + Conv( + ch_1, ch_2, 3, 2, bias=False, act=act), + Conv( + ch_2, ch_out, 3, 1, bias=False, act=act), + ]) + layers_num = 3 + elif self.arch in ['tiny']: + self.stem = nn.Sequential(* [ + Conv( + 3, ch_1, 3, 2, bias=False, act=act), + Conv( + ch_1, ch_out, 3, 2, bias=False, act=act), + ]) + layers_num = 2 + elif self.arch in ['W6', 'E6', 'D6', 'E6E']: + # ReOrg + self.stem = Focus(3, ch_out, 3, 1, bias=False, act=act) + layers_num = 2 + else: + raise 
AttributeError("Unsupported arch type: {}".format(self.arch)) + + self._out_channels = [chs[-1] for chs in ch_settings] + # for SPPCSPC(L,X,W6,E6,D6,E6E) or SPPELAN(tiny) + self._out_channels[-1] //= 2 + self._out_channels = [self._out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + self.blocks = [] + for i, (in_ch, out_ch) in enumerate(ch_settings[1:]): + stage = [] + + # 1.Downsample methods: Conv, DownC, MPConvLayer, MP, None + if i == 0: + if self.arch in ['L', 'X', 'W6']: + # Conv + _out_ch = out_ch if self.arch == 'W6' else out_ch // 2 + conv_layer = self.add_sublayer( + 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), + Conv( + in_ch, _out_ch, 3, 2, bias=False, act=act)) + stage.append(conv_layer) + layers_num += 1 + elif self.arch in ['E6', 'D6', 'E6E']: + # DownC + downc_layer = self.add_sublayer( + 'layers{}.stage{}.downc_layer'.format(layers_num, + i + 1), + DownC( + in_ch, out_ch, 2, act=act)) + stage.append(downc_layer) + layers_num += 1 + elif self.arch in ['tiny']: + # None + pass + else: + raise AttributeError("Unsupported arch type: {}".format( + self.arch)) + else: + if self.arch in ['L', 'X']: + # MPConvLayer + # Note: out channels of MPConvLayer is int(in_ch * 0.5 * 2) + # no relationship with out_ch when used in backbone + conv_res_layer = self.add_sublayer( + 'layers{}.stage{}.mpconv_layer'.format(layers_num, + i + 1), + MPConvLayer( + in_ch, in_ch, 0.5, depthwise, bias=False, act=act)) + stage.append(conv_res_layer) + layers_num += 5 # 1 maxpool + 3 convs + 1 concat + elif self.arch in ['tiny']: + # MP + mp_layer = self.add_sublayer( + 'layers{}.stage{}.mp_layer'.format(layers_num, i + 1), + MP(kernel_size=2, stride=2)) + stage.append(mp_layer) + layers_num += 1 + elif self.arch in ['W6']: + # Conv + conv_layer = self.add_sublayer( + 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), + Conv( + in_ch, out_ch, 3, 2, bias=False, act=act)) + stage.append(conv_layer) + layers_num += 1 + elif self.arch in ['E6', 'D6', 'E6E']: + # DownC + downc_layer = self.add_sublayer( + 'layers{}.stage{}.downc_layer'.format(layers_num, + i + 1), + DownC( + in_ch, out_ch, 2, act=act)) + stage.append(downc_layer) + layers_num += 1 + else: + raise AttributeError("Unsupported arch type: {}".format( + self.arch)) + + # 2.ELANLayer Block: like CSPLayer(C3) in YOLOv5/YOLOX + elan_in_ch = in_ch + if i == 0 and self.arch in ['L', 'X']: + elan_in_ch = in_ch * 2 + if self.arch in ['W6', 'E6', 'D6', 'E6E']: + elan_in_ch = out_ch + ELANBlock = ELAN2Layer if self.arch in ['E6E'] else ELANLayer + elan_layer = self.add_sublayer( + 'layers{}.stage{}.elan_layer'.format(layers_num, i + 1), + ELANBlock( + elan_in_ch, + mid_ch_settings[i][0], + mid_ch_settings[i][1], + out_ch, + num_blocks=num_blocks, + concat_list=concat_list_settings, + depthwise=depthwise, + bias=False, + act=act)) + stage.append(elan_layer) + layers_num += int(2 + num_blocks + 2) + # conv1 + conv2 + bottleneck + concat + conv3 + + # 3.SPP(Spatial Pyramid Pooling) methods: SPPCSPC, SPPELAN + if i == len(ch_settings[1:]) - 1: + if self.arch in ['L', 'X', 'W6', 'E6', 'D6', 'E6E']: + sppcspc_layer = self.add_sublayer( + 'layers{}.stage{}.sppcspc_layer'.format(layers_num, + i + 1), + SPPCSPC( + out_ch, out_ch // 2, k=(5, 9, 13), act=act)) + stage.append(sppcspc_layer) + layers_num += 1 + elif self.arch in ['tiny']: + sppelan_layer = self.add_sublayer( + 'layers{}.stage{}.sppelan_layer'.format(layers_num, + i + 1), + SPPELAN( + out_ch, out_ch // 2, k=(5, 9, 13), 
act=act)) + stage.append(sppelan_layer) + layers_num += 9 + else: + raise AttributeError("Unsupported arch type: {}".format( + self.arch)) + + self.blocks.append(nn.Sequential(*stage)) + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/ppdet/modeling/backbones/yolov8_csp_darknet.py b/ppdet/modeling/backbones/yolov8_csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a4b5e175e1919a5335c52dcef067bdec96166c --- /dev/null +++ b/ppdet/modeling/backbones/yolov8_csp_darknet.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from .csp_darknet import BaseConv, DWConv, BottleNeck, SPPFLayer +from ..shape_spec import ShapeSpec + +__all__ = ['C2fLayer', 'C2Layer', 'YOLOv8CSPDarkNet'] + + +class C2fLayer(nn.Layer): + """C2f layer with 2 convs, named C2f in YOLOv8""" + + def __init__(self, + in_channels, + out_channels, + num_blocks=1, + shortcut=False, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(C2fLayer, self).__init__() + self.c = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, 2 * self.c, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + (2 + num_blocks) * self.c, + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + self.bottlenecks = nn.LayerList([ + BottleNeck( + self.c, + self.c, + shortcut=shortcut, + kernel_sizes=(3, 3), + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act) for _ in range(num_blocks) + ]) + + def forward(self, x): + y = list(self.conv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.bottlenecks) + return self.conv2(paddle.concat(y, 1)) + + +class C2Layer(nn.Layer): + """C2 layer with 2 convs, named C2 in YOLOv8""" + + def __init__(self, + in_channels, + out_channels, + num_blocks=1, + shortcut=False, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(C2Layer, self).__init__() + self.c = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, 2 * self.c, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + 2 * self.c, out_channels, ksize=1, stride=1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(*(BottleNeck( + self.c, + self.c, + shortcut=shortcut, + kernel_sizes=(3, 3), + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act) for _ in range(num_blocks))) + + def forward(self, x): + a, b = self.conv1(x).split((self.c, self.c), 1) + return 
self.conv2(paddle.concat((self.bottlenecks(a), b), 1))
+
+
+@register
+@serializable
+class YOLOv8CSPDarkNet(nn.Layer):
+    """
+    YOLOv8 CSPDarkNet backbone.
+    Differences from YOLOv5 CSPDarkNet:
+    1. self.stem ksize 3 in YOLOv8 while 6 in YOLOv5
+    2. use C2fLayer in YOLOv8 while CSPLayer in YOLOv5
+    3. num_blocks [3,6,6,3] in YOLOv8 while [3,6,9,3] in YOLOv5
+    4. channels of the last stage differ in M/L/X
+
+    Args:
+        arch (str): Architecture of YOLOv8 CSPDarkNet, from {P5, P6}
+        depth_mult (float): Depth multiplier, multiply number of blocks in
+            C2fLayer, default as 1.0.
+        width_mult (float): Width multiplier, multiply number of channels in
+            each layer, default as 1.0.
+        depthwise (bool): Whether to use depth-wise conv layer.
+        act (str): Activation function type, default as 'silu'.
+        return_idx (list): Index of stages whose feature maps are returned.
+    """
+
+    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
+
+    # in_channels, out_channels, num_blocks, add_shortcut, use_sppf
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 1024, 3, True, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, True, True]],
+    }
+
+    def __init__(self,
+                 arch='P5',
+                 depth_mult=1.0,
+                 width_mult=1.0,
+                 last_stage_ch=1024,
+                 last2_stage_ch=512,
+                 depthwise=False,
+                 act='silu',
+                 trt=False,
+                 return_idx=[2, 3, 4]):
+        super(YOLOv8CSPDarkNet, self).__init__()
+        self.return_idx = return_idx
+        Conv = DWConv if depthwise else BaseConv
+
+        arch_setting = self.arch_settings[arch]
+        # channels of last stage in M/L/X will be smaller
+        if last_stage_ch != 1024:
+            assert last_stage_ch > 0
+            arch_setting[-1][1] = last_stage_ch
+        if arch == 'P6' and last2_stage_ch != 768:
+            assert last2_stage_ch > 0
+            arch_setting[-2][1] = last2_stage_ch
+            arch_setting[-1][0] = last2_stage_ch
+        base_channels = int(arch_setting[0][0] * width_mult)
+
+        self.stem = Conv(
+            3, base_channels, ksize=3, stride=2, bias=False, act=act)
+
+        _out_channels = [base_channels]
+        layers_num = 1
+        self.csp_dark_blocks = []
+
+        for i, (in_channels, out_channels, num_blocks, shortcut,
+                use_sppf) in enumerate(arch_setting):
+            in_channels = int(in_channels * width_mult)
+            out_channels = int(out_channels * width_mult)
+            _out_channels.append(out_channels)
+            num_blocks = max(round(num_blocks * depth_mult), 1)
+            stage = []
+
+            conv_layer = self.add_sublayer(
+                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
+                Conv(
+                    in_channels, out_channels, 3, 2, bias=False, act=act))
+            stage.append(conv_layer)
+            layers_num += 1
+
+            c2f_layer = self.add_sublayer(
+                'layers{}.stage{}.c2f_layer'.format(layers_num, i + 1),
+                C2fLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    shortcut=shortcut,
+                    depthwise=depthwise,
+                    bias=False,
+                    act=act))
+            stage.append(c2f_layer)
+            layers_num += 1
+
+            if use_sppf:
+                sppf_layer = self.add_sublayer(
+                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
+                    SPPFLayer(
+                        out_channels,
+                        out_channels,
+                        ksize=5,
+                        bias=False,
+                        act=act))
+                stage.append(sppf_layer)
+                layers_num += 1
+
+            self.csp_dark_blocks.append(nn.Sequential(*stage))
+
+        self._out_channels = [_out_channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        outputs = []
+        x = self.stem(x)
+        for i, layer in enumerate(self.csp_dark_blocks):
+            x = layer(x)
+            if i + 1 in self.return_idx:
+
outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/ppdet/modeling/bbox_utils.py b/ppdet/modeling/bbox_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bbb355a5a3629246c4f1e7dabfb86e4838b3f953 --- /dev/null +++ b/ppdet/modeling/bbox_utils.py @@ -0,0 +1,620 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): + """Encode bboxes to deltas. + """ + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): + """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. + Note: return tensor shape [n,1,4] + If you want to add a reshape, please add after the calling code instead of here. + """ + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + if max_shape is not None: + pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( + min=0, max=max_shape[1]) + pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( + min=0, max=max_shape[0]) + return pred_boxes + + +def bbox2delta_v2(src_boxes, + tgt_boxes, + delta_mean=[0.0, 0.0, 0.0, 0.0], + delta_std=[1.0, 1.0, 1.0, 1.0]): + """Encode bboxes to deltas. + Modified from bbox2delta() which just use weight parameters to multiply deltas. 
+ """ + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + dx = (tgt_ctr_x - src_ctr_x) / src_w + dy = (tgt_ctr_y - src_ctr_y) / src_h + dw = paddle.log(tgt_w / src_w) + dh = paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + deltas = ( + deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) + return deltas + + +def delta2bbox_v2(deltas, + boxes, + delta_mean=[0.0, 0.0, 0.0, 0.0], + delta_std=[1.0, 1.0, 1.0, 1.0], + max_shape=None, + ctr_clip=32.0): + """Decode deltas to bboxes. + Modified from delta2bbox() which just use weight parameters to be divided by deltas. + Used in YOLOFHead. + Note: return tensor shape [n,1,4] + If you want to add a reshape, please add after the calling code instead of here. + """ + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + # Prevent sending too large values into paddle.exp() + dx = dx * widths.unsqueeze(1) + dy = dy * heights.unsqueeze(1) + if ctr_clip is not None: + dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) + dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + else: + dw = dw.clip(min=-clip_scale, max=clip_scale) + dh = dh.clip(min=-clip_scale, max=clip_scale) + + pred_ctr_x = dx + ctr_x.unsqueeze(1) + pred_ctr_y = dy + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + if max_shape is not None: + pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( + min=0, max=max_shape[1]) + pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( + min=0, max=max_shape[0]) + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(h > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * 
(boxes[:, 3] - boxes[:, 1])
+
+
+def bbox_overlaps(boxes1, boxes2):
+    """
+    Calculate overlaps between boxes1 and boxes2
+
+    Args:
+        boxes1 (Tensor): boxes with shape [M, 4]
+        boxes2 (Tensor): boxes with shape [N, 4]
+
+    Return:
+        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]
+    """
+    M = boxes1.shape[0]
+    N = boxes2.shape[0]
+    if M * N == 0:
+        return paddle.zeros([M, N], dtype='float32')
+    area1 = bbox_area(boxes1)
+    area2 = bbox_area(boxes2)
+
+    xy_max = paddle.minimum(
+        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
+    xy_min = paddle.maximum(
+        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
+    width_height = xy_max - xy_min
+    width_height = width_height.clip(min=0)
+    inter = width_height.prod(axis=2)
+
+    overlaps = paddle.where(inter > 0, inter /
+                            (paddle.unsqueeze(area1, 1) + area2 - inter),
+                            paddle.zeros_like(inter))
+    return overlaps
+
+
+def batch_bbox_overlaps(bboxes1,
+                        bboxes2,
+                        mode='iou',
+                        is_aligned=False,
+                        eps=1e-6):
+    """Calculate the overlap between two sets of bboxes.
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+    Args:
+        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+            B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union) or "iof" (intersection over
+            foreground).
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): A value added to the denominator for numerical
+            stability. Default 1e-6.
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+    """
+    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    # Either the boxes are empty or the length of the boxes' last dimension is 4
+    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
+    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ...
Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 + cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return paddle.full(batch_shape + (rows, ), 1) + else: + return paddle.full(batch_shape + (rows, cols), 1) + + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + + if is_aligned: + lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2]) # [B, rows, 2] + rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:]) # [B, rows, 2] + + wh = (rb - lt).clip(min=0) # [B, rows, 2] + overlap = wh[:, 0] * wh[:, 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2]) + enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:]) + else: + lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]), + bboxes2[:, :2]) # [B, rows, cols, 2] + rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]), + bboxes2[:, 2:]) # [B, rows, cols, 2] + + wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + + if mode in ['iou', 'giou']: + union = area1.reshape([rows,1]) \ + + area2.reshape([1,cols]) - overlap + else: + union = area1[:, None] + if mode == 'giou': + enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]), + bboxes2[:, :2]) + enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]), + bboxes2[:, 2:]) + + eps = paddle.to_tensor([eps]) + union = paddle.maximum(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) + enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1] + enclose_area = paddle.maximum(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return 1 - gious + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. 
+ + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor, dtype=x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def batch_iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 in batch + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, + box2, + x1y1x2y2=True, + giou=False, + diou=False, + ciou=False, + eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + if x1y1x2y2: + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + else: # transform from xywh to xyxy + px1, px2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 + py1, py2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 + gx1, gx2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 + gy1, gy2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 + + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16): + """ + Calculate the iou of box1 and box2 with numpy. 
+
+    Args:
+        box1 (ndarray): [N, 4]
+        box2 (ndarray): [M, 4], usually N != M
+        x1y1x2y2 (bool): whether boxes are in x1y1x2y2 style, default True
+        eps (float): epsilon to avoid divide by zero
+    Return:
+        iou (ndarray): iou of box1 and box2, [N, M]
+    """
+    N, M = len(box1), len(box2)  # usually N != M
+    if x1y1x2y2:
+        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
+        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
+        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
+        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
+    else:
+        # cxcywh style
+        # Transform from center and width to exact coordinates
+        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
+        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
+        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
+        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+
+    # get the coordinates of the intersection rectangle
+    inter_rect_x1 = np.zeros((N, M), dtype=np.float32)
+    inter_rect_y1 = np.zeros((N, M), dtype=np.float32)
+    inter_rect_x2 = np.zeros((N, M), dtype=np.float32)
+    inter_rect_y2 = np.zeros((N, M), dtype=np.float32)
+    for i in range(len(box2)):
+        inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i])
+        inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i])
+        inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i])
+        inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i])
+    # Intersection area
+    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(
+        inter_rect_y2 - inter_rect_y1, 0)
+    # Union Area
+    b1_area = np.repeat(
+        ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)
+    b2_area = np.repeat(
+        ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)
+
+    ious = inter_area / (b1_area + b2_area - inter_area + eps)
+    return ious
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode bounding boxes as distances from the given points to the four
+    box boundaries (left, top, right, bottom).
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        bbox (Tensor): Shape (n, 4), "xyxy" format
+        max_dis (float): Upper bound of the distance.
+        eps (float): a small value to ensure target < max_dis rather than
+            target <= max_dis
+    Returns:
+        Tensor: Encoded distances, shape (n, 4).
+    """
+    left = points[:, 0] - bbox[:, 0]
+    top = points[:, 1] - bbox[:, 1]
+    right = bbox[:, 2] - points[:, 0]
+    bottom = bbox[:, 3] - points[:, 1]
+    if max_dis is not None:
+        left = left.clip(min=0, max=max_dis - eps)
+        top = top.clip(min=0, max=max_dis - eps)
+        right = right.clip(min=0, max=max_dis - eps)
+        bottom = bottom.clip(min=0, max=max_dis - eps)
+    return paddle.stack([left, top, right, bottom], -1)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        distance (Tensor): Distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): Shape of the image.
+    Returns:
+        Tensor: Decoded bboxes.
+    """
+    x1 = points[:, 0] - distance[:, 0]
+    y1 = points[:, 1] - distance[:, 1]
+    x2 = points[:, 0] + distance[:, 2]
+    y2 = points[:, 1] + distance[:, 3]
+    if max_shape is not None:
+        x1 = x1.clip(min=0, max=max_shape[1])
+        y1 = y1.clip(min=0, max=max_shape[0])
+        x2 = x2.clip(min=0, max=max_shape[1])
+        y2 = y2.clip(min=0, max=max_shape[0])
+    return paddle.stack([x1, y1, x2, y2], -1)
+
+
+def bbox_center(boxes):
+    """Get bbox centers from boxes.
+    Args:
+        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
+    Returns:
+        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
+ """ + boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 + boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 + return paddle.stack([boxes_cx, boxes_cy], axis=-1) + + +def batch_distance2bbox(points, distance, max_shapes=None): + """Decode distance prediction to bounding box for batch. + Args: + points (Tensor): [B, ..., 2], "xy" format + distance (Tensor): [B, ..., 4], "ltrb" format + max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. + Returns: + Tensor: Decoded bboxes, "x1y1x2y2" format. + """ + lt, rb = paddle.split(distance, 2, -1) + # while tensor add parameters, parameters should be better placed on the second place + x1y1 = -lt + points + x2y2 = rb + points + out_bbox = paddle.concat([x1y1, x2y2], -1) + if max_shapes is not None: + max_shapes = max_shapes.flip(-1).tile([1, 2]) + delta_dim = out_bbox.ndim - max_shapes.ndim + for _ in range(delta_dim): + max_shapes.unsqueeze_(1) + out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) + out_bbox = paddle.where(out_bbox > 0, out_bbox, + paddle.zeros_like(out_bbox)) + return out_bbox + + +def iou_similarity(box1, box2, eps=1e-10): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [M1, 4] + box2 (Tensor): box with the shape [M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [M1, M2] + """ + box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] + box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] + px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] + gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union diff --git a/ppdet/modeling/heads/__init__.py b/ppdet/modeling/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bfb325acea3e1647de1ad4e42d92b44c65c10da3 --- /dev/null +++ b/ppdet/modeling/heads/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import yolo_head +from . import ppyoloe_head +from . import yolof_head +from . import detr_head +from . import yolov5_head +from . import yolov6_head +from . import yolov7_head +from . import rtmdet_head +from . import yolov8_head + +from .yolo_head import * +from .ppyoloe_head import * +from .yolof_head import * +from .detr_head import * +from .yolov5_head import * +from .yolov6_head import * +from .yolov7_head import * +from .rtmdet_head import * +from .yolov8_head import * diff --git a/ppdet/modeling/heads/detr_head.py b/ppdet/modeling/heads/detr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f65a984347c0907fd3991d3be1bff238c397ccb4 --- /dev/null +++ b/ppdet/modeling/heads/detr_head.py @@ -0,0 +1,533 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +import pycocotools.mask as mask_util +from ..initializer import linear_init_, constant_ +from ..transformers.utils import inverse_sigmoid + +__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead'] + + +class MLP(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/detr.py + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.LayerList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + self._reset_parameters() + + def _reset_parameters(self): + for l in self.layers: + linear_init_(l) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class MultiHeadAttentionMap(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/segmentation.py + + This is a 2D attention module, which only returns the attention softmax (no multiplication by value) + """ + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, + bias=True): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant()) if bias else False + + self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) + self.k_proj = nn.Conv2D( + query_dim, + hidden_dim, + 1, + weight_attr=weight_attr, + bias_attr=bias_attr) + + self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 + + def forward(self, q, k, mask=None): + q = self.q_proj(q) + k = self.k_proj(k) + bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ + self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] + qh = q.reshape([bs, num_queries, n, c]) + kh = k.reshape([bs, n, c, h, w]) + # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) + qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) + kh = kh.reshape([-1, c, h * w]) + weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( + [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) + + if mask is not None: + weights += mask + # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 + weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) + weights = self.dropout(weights) + return weights + + +class MaskHeadFPNConv(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/segmentation.py + + Simple convolutional head, 
using group norm. + Upsampling is done using a FPN approach + """ + + def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): + super().__init__() + + inter_dims = [input_dim, + ] + [context_dim // (2**i) for i in range(1, 5)] + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()) + bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant()) + + self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, + weight_attr, bias_attr) + self.conv_inter = nn.LayerList() + for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): + self.conv_inter.append( + self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, + bias_attr)) + + self.conv_out = nn.Conv2D( + inter_dims[-1], + 1, + 3, + padding=1, + weight_attr=weight_attr, + bias_attr=bias_attr) + + self.adapter = nn.LayerList() + for i in range(len(fpn_dims)): + self.adapter.append( + nn.Conv2D( + fpn_dims[i], + inter_dims[i + 1], + 1, + weight_attr=weight_attr, + bias_attr=bias_attr)) + + def _make_layers(self, + in_dims, + out_dims, + kernel_size, + num_groups, + weight_attr=None, + bias_attr=None): + return nn.Sequential( + nn.Conv2D( + in_dims, + out_dims, + kernel_size, + padding=kernel_size // 2, + weight_attr=weight_attr, + bias_attr=bias_attr), + nn.GroupNorm(num_groups, out_dims), + nn.ReLU()) + + def forward(self, x, bbox_attention_map, fpns): + x = paddle.concat([ + x.tile([bbox_attention_map.shape[1], 1, 1, 1]), + bbox_attention_map.flatten(0, 1) + ], 1) + x = self.conv0(x) + for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], + self.adapter, fpns): + feat = adapter_layer(feat).tile( + [bbox_attention_map.shape[1], 1, 1, 1]) + x = inter_layer(x) + x = feat + F.interpolate(x, size=feat.shape[-2:]) + + x = self.conv_inter[-1](x) + x = self.conv_out(x) + return x + + +@register +class DETRHead(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] + __inject__ = ['loss'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + nhead=8, + num_mlp_layers=3, + loss='DETRLoss', + fpn_dims=[1024, 512, 256], + with_mask_head=False, + use_focal_loss=False): + super(DETRHead, self).__init__() + # add background class + self.num_classes = num_classes if use_focal_loss else num_classes + 1 + self.hidden_dim = hidden_dim + self.loss = loss + self.with_mask_head = with_mask_head + self.use_focal_loss = use_focal_loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP(hidden_dim, + hidden_dim, + output_dim=4, + num_layers=num_mlp_layers) + if self.with_mask_head: + self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, + nhead) + self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, + hidden_dim) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + + @classmethod + def from_config(cls, cfg, hidden_dim, nhead, input_shape): + + return { + 'hidden_dim': hidden_dim, + 'nhead': nhead, + 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] + } + + @staticmethod + def get_gt_mask_from_polygons(gt_poly, pad_mask): + out_gt_mask = [] + for polygons, padding in zip(gt_poly, pad_mask): + height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) + masks = [] + for obj_poly in polygons: + rles = mask_util.frPyObjects(obj_poly, height, width) + rle = mask_util.merge(rles) + masks.append( + paddle.to_tensor(mask_util.decode(rle)).astype('float32')) + masks = paddle.stack(masks) + masks_pad = paddle.zeros( + [masks.shape[0], 
pad_mask.shape[1], pad_mask.shape[2]]) + masks_pad[:, :height, :width] = masks + out_gt_mask.append(masks_pad) + return out_gt_mask + + def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, hidden_dim, h, w], + src_proj: [batch_size, h*w, hidden_dim], + src_mask: [batch_size, 1, 1, h, w]) + body_feats (List(Tensor)): list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, src_proj, src_mask = out_transformer + outputs_logit = self.score_head(feats) + outputs_bbox = F.sigmoid(self.bbox_head(feats)) + outputs_seg = None + if self.with_mask_head: + bbox_attention_map = self.bbox_attention(feats[-1], memory, + src_mask) + fpn_feats = [a for a in body_feats[::-1]][1:] + outputs_seg = self.mask_head(src_proj, bbox_attention_map, + fpn_feats) + outputs_seg = outputs_seg.reshape([ + feats.shape[1], feats.shape[2], outputs_seg.shape[-2], + outputs_seg.shape[-1] + ]) + + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + gt_mask = self.get_gt_mask_from_polygons( + inputs['gt_poly'], + inputs['pad_mask']) if 'gt_poly' in inputs else None + return self.loss( + outputs_bbox, + outputs_logit, + inputs['gt_bbox'], + inputs['gt_class'], + masks=outputs_seg, + gt_mask=gt_mask) + else: + return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) + + +@register +class DeformableDETRHead(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim'] + __inject__ = ['loss'] + + def __init__(self, + num_classes=80, + hidden_dim=512, + nhead=8, + num_mlp_layers=3, + loss='DETRLoss'): + super(DeformableDETRHead, self).__init__() + self.num_classes = num_classes + self.hidden_dim = hidden_dim + self.nhead = nhead + self.loss = loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP(hidden_dim, + hidden_dim, + output_dim=4, + num_layers=num_mlp_layers) + + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + constant_(self.score_head.bias, -4.595) + constant_(self.bbox_head.layers[-1].weight) + + with paddle.no_grad(): + bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) + bias[2:] = -2.0 + self.bbox_head.layers[-1].bias.set_value(bias) + + @classmethod + def from_config(cls, cfg, hidden_dim, nhead, input_shape): + return {'hidden_dim': hidden_dim, 'nhead': nhead} + + def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, + \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], + reference_points: [batch_size, num_queries, 2]) + body_feats (List(Tensor)): list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, reference_points = out_transformer + reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) + outputs_bbox = self.bbox_head(feats) + + # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", + # but the gradient is wrong in paddle. 
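+        # Rebuilding the tensor with concat below avoids the in-place slice
+        # assignment: only the first two channels (cx, cy, still in inverse-sigmoid
+        # space at this point) receive the reference points; the (w, h) channels
+        # pass through unchanged before the final sigmoid.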
+ outputs_bbox = paddle.concat( + [ + outputs_bbox[:, :, :, :2] + reference_points, + outputs_bbox[:, :, :, 2:] + ], + axis=-1) + + outputs_bbox = F.sigmoid(outputs_bbox) + outputs_logit = self.score_head(feats) + + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + + return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], + inputs['gt_class']) + else: + return (outputs_bbox[-1], outputs_logit[-1], None) + + +@register +class DINOHead(nn.Layer): + __inject__ = ['loss'] + + def __init__(self, loss='DINOLoss'): + super(DINOHead, self).__init__() + self.loss = loss + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, + dn_meta) = out_transformer + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + + if dn_meta is not None: + if isinstance(dn_meta, list): + dual_groups = len(dn_meta) - 1 + dec_out_bboxes = paddle.split( + dec_out_bboxes, dual_groups + 1, axis=2) + dec_out_logits = paddle.split( + dec_out_logits, dual_groups + 1, axis=2) + enc_topk_bboxes = paddle.split( + enc_topk_bboxes, dual_groups + 1, axis=1) + enc_topk_logits = paddle.split( + enc_topk_logits, dual_groups + 1, axis=1) + + dec_out_bboxes_list = [] + dec_out_logits_list = [] + dn_out_bboxes_list = [] + dn_out_logits_list = [] + loss = {} + for g_id in range(dual_groups + 1): + if dn_meta[g_id] is not None: + dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( + dec_out_bboxes[g_id], + dn_meta[g_id]['dn_num_split'], + axis=2) + dn_out_logits_gid, dec_out_logits_gid = paddle.split( + dec_out_logits[g_id], + dn_meta[g_id]['dn_num_split'], + axis=2) + else: + dn_out_bboxes_gid, dn_out_logits_gid = None, None + dec_out_bboxes_gid = dec_out_bboxes[g_id] + dec_out_logits_gid = dec_out_logits[g_id] + out_bboxes_gid = paddle.concat([ + enc_topk_bboxes[g_id].unsqueeze(0), + dec_out_bboxes_gid + ]) + out_logits_gid = paddle.concat([ + enc_topk_logits[g_id].unsqueeze(0), + dec_out_logits_gid + ]) + loss_gid = self.loss( + out_bboxes_gid, + out_logits_gid, + inputs['gt_bbox'], + inputs['gt_class'], + dn_out_bboxes=dn_out_bboxes_gid, + dn_out_logits=dn_out_logits_gid, + dn_meta=dn_meta[g_id]) + # sum loss + for key, value in loss_gid.items(): + loss.update({ + key: loss.get(key, paddle.zeros([1])) + value + }) + + # average across (dual_groups + 1) + for key, value in loss.items(): + loss.update({key: value / (dual_groups + 1)}) + return loss + else: + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta['dn_num_split'], axis=2) + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta['dn_num_split'], axis=2) + else: + dn_out_bboxes, dn_out_logits = None, None + + out_bboxes = paddle.concat( + [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) + out_logits = paddle.concat( + [enc_topk_logits.unsqueeze(0), dec_out_logits]) + + return self.loss( + out_bboxes, + out_logits, + inputs['gt_bbox'], + inputs['gt_class'], + dn_out_bboxes=dn_out_bboxes, + dn_out_logits=dn_out_logits, + dn_meta=dn_meta) + else: + return (dec_out_bboxes[-1], dec_out_logits[-1], None) + + +@register +class MaskDINOHead(nn.Layer): + __inject__ = ['loss'] + + def __init__(self, loss='DINOLoss'): + super(MaskDINOHead, self).__init__() + self.loss = loss + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out, + dn_meta) = out_transformer + if self.training: + assert 
inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + assert 'gt_segm' in inputs + + if dn_meta is not None: + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta['dn_num_split'], axis=2) + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta['dn_num_split'], axis=2) + dn_out_masks, dec_out_masks = paddle.split( + dec_out_masks, dn_meta['dn_num_split'], axis=2) + if init_out is not None: + init_out_logits, init_out_bboxes, init_out_masks = init_out + init_out_logits_dn, init_out_logits = paddle.split( + init_out_logits, dn_meta['dn_num_split'], axis=1) + init_out_bboxes_dn, init_out_bboxes = paddle.split( + init_out_bboxes, dn_meta['dn_num_split'], axis=1) + init_out_masks_dn, init_out_masks = paddle.split( + init_out_masks, dn_meta['dn_num_split'], axis=1) + + dec_out_logits = paddle.concat( + [init_out_logits.unsqueeze(0), dec_out_logits]) + dec_out_bboxes = paddle.concat( + [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) + dec_out_masks = paddle.concat( + [init_out_masks.unsqueeze(0), dec_out_masks]) + + dn_out_logits = paddle.concat( + [init_out_logits_dn.unsqueeze(0), dn_out_logits]) + dn_out_bboxes = paddle.concat( + [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) + dn_out_masks = paddle.concat( + [init_out_masks_dn.unsqueeze(0), dn_out_masks]) + else: + dn_out_bboxes, dn_out_logits = None, None + dn_out_masks = None + + enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out + out_logits = paddle.concat( + [enc_out_logits.unsqueeze(0), dec_out_logits]) + out_bboxes = paddle.concat( + [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) + out_masks = paddle.concat( + [enc_out_masks.unsqueeze(0), dec_out_masks]) + + return self.loss( + out_bboxes, + out_logits, + inputs['gt_bbox'], + inputs['gt_class'], + masks=out_masks, + gt_mask=inputs['gt_segm'], + dn_out_logits=dn_out_logits, + dn_out_bboxes=dn_out_bboxes, + dn_out_masks=dn_out_masks, + dn_meta=dn_meta) + else: + return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) diff --git a/ppdet/modeling/heads/ppyoloe_head.py b/ppdet/modeling/heads/ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7f9acd204bc17667c603be278a380deaef8ce054 --- /dev/null +++ b/ppdet/modeling/heads/ppyoloe_head.py @@ -0,0 +1,698 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
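+# Note on the regression branch of PPYOLOEHead below: each box side is predicted
+# as a discrete distribution over `reg_channels` bins (distribution focal loss
+# style). Roughly, in illustrative pseudo-code:
+#
+#     prob = softmax(reg_logits.reshape([-1, l, 4, reg_channels]), axis=-1)
+#     dist = sum_i(prob_i * i)                     # expected distance per side, in grid units
+#     bbox = batch_distance2bbox(points, dist)     # (l, t, r, b) -> (x1, y1, x2, y2)
+#
+# The expectation is computed with a fixed 1x1 conv (`proj_conv`) whose weights
+# are linspace(reg_range[0], reg_range[1] - 1, reg_channels).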
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.nn.initializer import Normal, Constant + +from ..bbox_utils import batch_distance2bbox +from ..losses import GIoULoss +from ..initializer import bias_init_with_prob, constant_, normal_ +from ..assigners.utils import generate_anchors_for_grid_cell +from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock +from ppdet.modeling.ops import get_static_shape, get_act_fn +from ppdet.modeling.layers import MultiClassNMS + +__all__ = ['PPYOLOEHead', 'SimpleConvHead'] + + +class ESEAttn(nn.Layer): + def __init__(self, feat_channels, act='swish', attn_conv='convbn'): + super(ESEAttn, self).__init__() + self.fc = nn.Conv2D(feat_channels, feat_channels, 1) + if attn_conv == 'convbn': + self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) + elif attn_conv == 'repvgg': + self.conv = RepVggBlock(feat_channels, feat_channels, act=act) + else: + self.conv = None + self._init_weights() + + def _init_weights(self): + normal_(self.fc.weight, std=0.001) + + def forward(self, feat, avg_feat): + weight = F.sigmoid(self.fc(avg_feat)) + if self.conv: + return self.conv(feat * weight) + else: + return feat * weight + + +@register +class PPYOLOEHead(nn.Layer): + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', + 'exclude_post_process', 'use_shared_conv', 'for_distill' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + + def __init__(self, + in_channels=[1024, 512, 256], + num_classes=80, + act='swish', + fpn_strides=(32, 16, 8), + grid_cell_scale=5.0, + grid_cell_offset=0.5, + reg_max=16, + reg_range=None, + static_assigner_epoch=4, + use_varifocal_loss=True, + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + nms='MultiClassNMS', + eval_size=None, + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + }, + trt=False, + attn_conv='convbn', + exclude_nms=False, + exclude_post_process=False, + use_shared_conv=True, + for_distill=False): + super(PPYOLOEHead, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + if reg_range: + self.sm_use = True + self.reg_range = reg_range + else: + self.sm_use = False + self.reg_range = (0, reg_max + 1) + self.reg_channels = self.reg_range[1] - self.reg_range[0] + self.iou_loss = GIoULoss() + self.loss_weight = loss_weight + self.use_varifocal_loss = use_varifocal_loss + self.eval_size = eval_size + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.use_shared_conv = use_shared_conv + self.for_distill = for_distill + self.is_teacher = False + + # stem + self.stem_cls = nn.LayerList() + self.stem_reg = nn.LayerList() + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + for in_c in self.in_channels: + self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) + self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) + # pred head + self.pred_cls = nn.LayerList() + self.pred_reg = nn.LayerList() 
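+        # Per-level prediction heads: a 3x3 conv producing num_classes logits and
+        # a 3x3 conv producing 4 * reg_channels DFL bins (reg_channels defaults to
+        # reg_max + 1).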
+ for in_c in self.in_channels: + self.pred_cls.append( + nn.Conv2D( + in_c, self.num_classes, 3, padding=1)) + self.pred_reg.append( + nn.Conv2D( + in_c, 4 * self.reg_channels, 3, padding=1)) + # projection conv + self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + self._init_weights() + + if self.for_distill: + self.distill_pairs = {} + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _init_weights(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.pred_cls, self.pred_reg): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, + self.reg_channels).reshape( + [1, self.reg_channels, 1, 1]) + self.proj_conv.weight.set_value(proj) + self.proj_conv.weight.stop_gradient = True + if self.eval_size: + anchor_points, stride_tensor = self._generate_anchors() + self.anchor_points = anchor_points + self.stride_tensor = stride_tensor + + def forward_train(self, feats, targets, aux_pred=None): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list = [], [] + for i, feat in enumerate(feats): + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + + feat) + reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + + if targets.get('is_teacher', False): + pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) + return cls_score_list, pred_deltas * stride_tensor, pred_dfls + + if targets.get('get_data', False): + pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) + return cls_score_list, pred_deltas * stride_tensor, pred_dfls + + return self.get_loss([ + cls_score_list, reg_distri_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets, aux_pred) + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def forward_eval(self, feats): + if self.eval_size: + anchor_points, stride_tensor = self.anchor_points, self.stride_tensor + else: + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_dist_list = [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + 
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + + feat) + reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) + reg_dist = reg_dist.reshape( + [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) + if self.use_shared_conv: + reg_dist = self.proj_conv(F.softmax( + reg_dist, axis=1)).squeeze(1) + else: + reg_dist = F.softmax(reg_dist, axis=1) + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) + reg_dist_list.append(reg_dist) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + if self.use_shared_conv: + reg_dist_list = paddle.concat(reg_dist_list, axis=1) + else: + reg_dist_list = paddle.concat(reg_dist_list, axis=2) + reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) + + return cls_score_list, reg_dist_list, anchor_points, stride_tensor + + def forward(self, feats, targets=None, aux_pred=None): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + + if self.training: + return self.forward_train(feats, targets, aux_pred) + else: + if targets is not None: + # only for semi-det + self.is_teacher = targets.get('is_teacher', False) + if self.is_teacher: + return self.forward_train(feats, targets, aux_pred=None) + else: + return self.forward_eval(feats) + + return self.forward_eval(feats) + + @staticmethod + def _focal_loss(score, label, alpha=0.25, gamma=2.0): + weight = (score - label).pow(gamma) + if alpha > 0: + alpha_t = alpha * label + (1 - alpha) * (1 - label) + weight *= alpha_t + loss = F.binary_cross_entropy( + score, label, weight=weight, reduction='sum') + return loss + + @staticmethod + def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + loss = F.binary_cross_entropy( + pred_score, gt_score, weight=weight, reduction='sum') + return loss + + def _bbox_decode(self, anchor_points, pred_dist): + _, l, _ = get_static_shape(pred_dist) + pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) + pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) + return batch_distance2bbox(anchor_points, pred_dist) + + def _bbox_decode_fake(self, pred_dist): + _, l, _ = get_static_shape(pred_dist) + pred_dist_dfl = F.softmax( + pred_dist.reshape([-1, l, 4, self.reg_channels])) + pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2 + ])).squeeze(1) + return pred_dist, pred_dist_dfl + + def _bbox2distance(self, points, bbox): + x1y1, x2y2 = paddle.split(bbox, 2, -1) + lt = points - x1y1 + rb = x2y2 - points + return paddle.concat([lt, rb], -1).clip(self.reg_range[0], + self.reg_range[1] - 1 - 0.01) + + def _df_loss(self, pred_dist, target, lower_bound=0): + target_left = paddle.cast(target.floor(), 'int64') + target_right = target_left + 1 + weight_left = target_right.astype('float32') - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist, target_left - lower_bound, + reduction='none') * weight_left + loss_right = F.cross_entropy( + pred_dist, target_right - lower_bound, + reduction='none') * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + + def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, + assigned_bboxes, assigned_scores, assigned_scores_sum): + # select positive samples mask + mask_positive = (assigned_labels != self.num_classes) + + if self.for_distill: + # only used 
for LD main_kd distill + self.distill_pairs['mask_positive_select'] = mask_positive + + num_pos = mask_positive.sum() + # pos/neg loss + if num_pos > 0: + # l1 + iou + bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( + [1, 1, 4]).astype('bool') + pred_bboxes_pos = paddle.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + assigned_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = paddle.masked_select( + assigned_scores.sum(-1), mask_positive).unsqueeze(-1) + + loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) + + loss_iou = self.iou_loss(pred_bboxes_pos, + assigned_bboxes_pos) * bbox_weight + loss_iou = loss_iou.sum() / assigned_scores_sum + + dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( + [1, 1, self.reg_channels * 4]).astype('bool') + pred_dist_pos = paddle.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_channels]) + assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) + assigned_ltrb_pos = paddle.masked_select( + assigned_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, + self.reg_range[0]) * bbox_weight + loss_dfl = loss_dfl.sum() / assigned_scores_sum + if self.for_distill: + self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos + self.distill_pairs['pred_dist_pos'] = pred_dist_pos + self.distill_pairs['bbox_weight'] = bbox_weight + else: + loss_l1 = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss_dfl = pred_dist.sum() * 0. + return loss_l1, loss_iou, loss_dfl + + def get_loss(self, head_outs, gt_meta, aux_pred=None): + pred_scores, pred_distri, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + if aux_pred is not None: + pred_scores_aux = aux_pred[0] + pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1]) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + alpha_l = 0.25 + else: + if self.sm_use: + # only used in smalldet of PPYOLOE-SOD model + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + stride_tensor, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + else: + if aux_pred is None: + if not hasattr(self, "assigned_labels"): + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + if self.for_distill: + self.assigned_labels = assigned_labels + self.assigned_bboxes = assigned_bboxes + self.assigned_scores = assigned_scores + + else: + # only used in distill + assigned_labels = self.assigned_labels + assigned_bboxes = self.assigned_bboxes + assigned_scores = self.assigned_scores + + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores_aux.detach(), + pred_bboxes_aux.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + 
gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + alpha_l = -1 + # rescale bbox + assigned_bboxes /= stride_tensor + + assign_out_dict = self.get_loss_from_assign( + pred_scores, pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, alpha_l) + + if aux_pred is not None: + assign_out_dict_aux = self.get_loss_from_assign( + aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, alpha_l) + loss = {} + for key in assign_out_dict.keys(): + loss[key] = assign_out_dict[key] + assign_out_dict_aux[key] + else: + loss = assign_out_dict + + return loss + + def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes, + anchor_points_s, assigned_labels, assigned_bboxes, + assigned_scores, alpha_l): + # cls loss + if self.use_varifocal_loss: + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + one_hot_label) + else: + loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) + + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum /= paddle.distributed.get_world_size() + assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) + loss_cls /= assigned_scores_sum + + if self.for_distill: + self.distill_pairs['pred_cls_scores'] = pred_scores + self.distill_pairs['pos_num'] = assigned_scores_sum + self.distill_pairs['assigned_scores'] = assigned_scores + + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + self.distill_pairs['target_labels'] = one_hot_label + + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + out_dict = { + 'loss': loss, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + 'loss_dfl': loss_dfl, + 'loss_l1': loss_l1, + } + return out_dict + + def post_process(self, head_outs, scale_factor): + pred_scores, pred_dist, anchor_points, stride_tensor = head_outs + pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) + pred_bboxes *= stride_tensor + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None + else: + # scale bbox to origin + scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) + scale_factor = paddle.concat( + [scale_x, scale_y, scale_x, scale_y], + axis=-1).reshape([-1, 1, 4]) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num + + +def get_activation(name="LeakyReLU"): + if name == "silu": + module = nn.Silu() + elif name == "relu": + module = nn.ReLU() + elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: + module = nn.LeakyReLU(0.1) + elif name is None: + module = nn.Identity() + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class ConvNormLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + norm_type='gn', + activation="LeakyReLU"): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 
'sync_bn', 'syncbn', 'gn', None] + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=False, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + if norm_type in ['bn', 'sync_bn', 'syncbn']: + self.norm = nn.BatchNorm2D(out_channels) + elif norm_type == 'gn': + self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels) + else: + self.norm = None + + self.act = get_activation(activation) + + def forward(self, x): + y = self.conv(x) + if self.norm is not None: + y = self.norm(y) + y = self.act(y) + return y + + +class ScaleReg(nn.Layer): + """ + Parameter for scaling the regression outputs. + """ + + def __init__(self, scale=1.0): + super(ScaleReg, self).__init__() + scale = paddle.to_tensor(scale) + self.scale = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Assign(scale)) + + def forward(self, x): + return x * self.scale + + +@register +class SimpleConvHead(nn.Layer): + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + feat_in=288, + feat_out=288, + num_convs=1, + fpn_strides=[32, 16, 8, 4], + norm_type='gn', + act='LeakyReLU', + prior_prob=0.01, + reg_max=16): + super(SimpleConvHead, self).__init__() + self.num_classes = num_classes + self.feat_in = feat_in + self.feat_out = feat_out + self.num_convs = num_convs + self.fpn_strides = fpn_strides + self.reg_max = reg_max + + self.cls_convs = nn.LayerList() + self.reg_convs = nn.LayerList() + for i in range(self.num_convs): + in_c = feat_in if i == 0 else feat_out + self.cls_convs.append( + ConvNormLayer( + in_c, + feat_out, + 3, + stride=1, + padding=1, + norm_type=norm_type, + activation=act)) + self.reg_convs.append( + ConvNormLayer( + in_c, + feat_out, + 3, + stride=1, + padding=1, + norm_type=norm_type, + activation=act)) + + bias_cls = bias_init_with_prob(prior_prob) + self.gfl_cls = nn.Conv2D( + feat_out, + self.num_classes, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=bias_cls))) + self.gfl_reg = nn.Conv2D( + feat_out, + 4 * (self.reg_max + 1), + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0))) + + self.scales = nn.LayerList() + for i in range(len(self.fpn_strides)): + self.scales.append(ScaleReg(1.0)) + + def forward(self, feats): + cls_scores = [] + bbox_preds = [] + for x, scale in zip(feats, self.scales): + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + + cls_score = self.gfl_cls(cls_feat) + cls_score = F.sigmoid(cls_score) + cls_score = cls_score.flatten(2).transpose([0, 2, 1]) + cls_scores.append(cls_score) + + bbox_pred = scale(self.gfl_reg(reg_feat)) + bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) + bbox_preds.append(bbox_pred) + + cls_scores = paddle.concat(cls_scores, axis=1) + bbox_preds = paddle.concat(bbox_preds, axis=1) + return cls_scores, bbox_preds diff --git a/ppdet/modeling/heads/rtmdet_head.py b/ppdet/modeling/heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..466381b5be2008b85b04b74e7949a9956bcc3307 --- /dev/null +++ b/ppdet/modeling/heads/rtmdet_head.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +from ..bbox_utils import batch_distance2bbox +from ..losses import GIoULoss, QualityFocalLoss, IouLoss +from ..initializer import bias_init_with_prob, constant_ +from ppdet.modeling.backbones.csp_darknet import BaseConv +from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner #, DynamicSoftLabelAssigner +from ppdet.modeling.layers import MultiClassNMS +from paddle import ParamAttr +from paddle.nn.initializer import Normal + +__all__ = ['RTMDetHead'] + + +@register +class RTMDetHead(nn.Layer): + __shared__ = [ + 'num_classes', 'width_mult', 'trt', 'exclude_nms', + 'exclude_post_process' + ] + __inject__ = ['assigner', 'nms'] + + def __init__( + self, + num_classes=80, + width_mult=1.0, + in_channels=[1024, 512, 256], + feat_channels=256, + stacked_convs=2, + pred_kernel_size=1, + act='swish', + fpn_strides=(32, 16, 8), + share_conv=True, + exp_on_reg=False, + assigner='SimOTAAssigner', # just placeholder + grid_cell_offset=0., + nms='MultiClassNMS', + loss_weight={ + 'cls': 1.0, + 'box': 2.0, + }, + trt=False, + exclude_nms=False, + exclude_post_process=False): + super(RTMDetHead, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self._dtype = paddle.framework.get_default_dtype() + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.pred_kernel_size = pred_kernel_size + self.stacked_convs = stacked_convs + self.feat_channels = int(feat_channels * width_mult) + self.share_conv = share_conv + self.exp_on_reg = exp_on_reg + self.grid_cell_offset = grid_cell_offset + + self.loss_cls = QualityFocalLoss() + self.loss_box = IouLoss(loss_weight=1.0, giou=True) + self.loss_weight = loss_weight + self.assigner = assigner + + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + + # head + self.cls_convs = nn.LayerList() + self.reg_convs = nn.LayerList() + self.cls_preds = nn.LayerList() + self.reg_preds = nn.LayerList() + for idx in range(len(self.fpn_strides)): + cls_convs = nn.LayerList() + reg_convs = nn.LayerList() + for i in range(self.stacked_convs): + chn = self.in_channels[idx] if i == 0 else self.feat_channels + cls_convs.append(BaseConv(chn, self.feat_channels, 3, 1)) + reg_convs.append(BaseConv(chn, self.feat_channels, 3, 1)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.cls_preds.append( + nn.Conv2D( + self.feat_channels, + self.num_classes, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + bias_attr=True)) + self.reg_preds.append( + nn.Conv2D( + self.feat_channels, + 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + 
bias_attr=True)) + + self.share_conv = False # TODO in deploy + if self.share_conv: + for n in range(len(self.fpn_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + self._init_weights() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _init_weights(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.cls_preds, self.reg_preds): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + def forward_train(self, feats, targets): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] + + cls_score_list, reg_distri_list = [], [] + for idx, x in enumerate(feats): + _, _, h, w = x.shape + cls_feat = x + reg_feat = x + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_logit = self.cls_preds[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + if self.exp_on_reg: + reg_dist = self.reg_preds[idx](reg_feat).exp() + else: + reg_dist = self.reg_preds[idx](reg_feat) + reg_dist = reg_dist * self.fpn_strides[idx] + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + reg_distri_list.append(reg_dist.flatten(2).transpose([0, 2, 1])) + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + + anchor_points, stride_tensor = self._generate_anchor_point( + feat_sizes, self.fpn_strides, 0.) + + raise NotImplementedError('RTMDet training not implemented yet.') + + return self.get_loss( + [cls_score_list, reg_distri_list, anchor_points, + stride_tensor], targets) + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_dist_list = [], [] + for idx, x in enumerate(feats): + _, _, h, w = x.shape + l = h * w + cls_feat = x + reg_feat = x + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_logit = self.cls_preds[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + if self.exp_on_reg: + reg_dist = self.reg_preds[idx](reg_feat).exp() + else: + reg_dist = self.reg_preds[idx](reg_feat) + # cls and reg + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) + reg_dist_list.append(reg_dist.reshape([-1, 4, l])) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + 
reg_dist_list = paddle.concat(reg_dist_list, axis=-1) + + return cls_score_list, reg_dist_list, anchor_points, stride_tensor + + def forward(self, feats, targets=None): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def _generate_anchor_point(self, feat_sizes, strides, offset=0.): + anchor_points, stride_tensor = [], [] + num_anchors_list = [] + for feat_size, stride in zip(feat_sizes, strides): + h, w = feat_size + x = (paddle.arange(w) + offset) * stride + y = (paddle.arange(h) + offset) * stride + y, x = paddle.meshgrid(y, x) + anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) + stride_tensor.append( + paddle.full( + [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) + num_anchors_list.append(len(anchor_points[-1])) + anchor_points = paddle.concat(anchor_points).astype(self._dtype) + anchor_points.stop_gradient = True + stride_tensor = paddle.concat(stride_tensor) + stride_tensor.stop_gradient = True + return anchor_points, stride_tensor #, num_anchors_list + + def get_loss(self, head_outs, targets): + pred_cls, pred_bboxes, anchor_points, stride_tensor = head_outs + raise NotImplementedError('RTMDet training not implemented yet.') + + gt_labels = targets['gt_class'] + gt_bboxes = targets['gt_bbox'] + + loss_cls = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['box'] * loss_iou + return { + 'loss': loss, + 'loss_cls': loss_cls, + 'loss_box': loss_iou, + } + + def post_process(self, head_outs, im_shape, scale_factor): + pred_scores, pred_dist, anchor_points, stride_tensor = head_outs + pred_bboxes = batch_distance2bbox(anchor_points, + pred_dist.transpose([0, 2, 1])) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1) + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolo_head.py b/ppdet/modeling/heads/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6ea791cf809ca01ce269febdd94338a894cf6723 --- /dev/null +++ b/ppdet/modeling/heads/yolo_head.py @@ -0,0 +1,426 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
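+# Note: `_de_sigmoid` below is a clipped inverse sigmoid (logit). YOLOv3Head uses
+# it at eval time when `iou_aware` is enabled, fusing the IoU prediction into the
+# objectness score and mapping the result back to logit space, roughly:
+#
+#     obj_t = sigmoid(obj) ** (1 - iou_aware_factor) * sigmoid(ioup) ** iou_aware_factor
+#     obj   = _de_sigmoid(obj_t)
+#
+# so the re-assembled output keeps the same layout as the plain YOLOv3 output.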
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register + +import math +import numpy as np +from ..initializer import bias_init_with_prob, constant_ +from ..backbones.csp_darknet import BaseConv, DWConv +from ..losses import IouLoss +from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner +from ppdet.modeling.bbox_utils import bbox_overlaps +from ppdet.modeling.layers import MultiClassNMS + +__all__ = ['YOLOv3Head', 'YOLOXHead'] + + +def _de_sigmoid(x, eps=1e-7): + x = paddle.clip(x, eps, 1. / eps) + x = paddle.clip(1. / x - 1., eps, 1. / eps) + x = -paddle.log(x) + return x + + +@register +class YOLOv3Head(nn.Layer): + __shared__ = ['num_classes', 'data_format'] + __inject__ = ['loss'] + + def __init__(self, + in_channels=[1024, 512, 256], + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + num_classes=80, + loss='YOLOv3Loss', + iou_aware=False, + iou_aware_factor=0.4, + data_format='NCHW'): + """ + Head for YOLOv3 network + + Args: + num_classes (int): number of foreground classes + anchors (list): anchors + anchor_masks (list): anchor masks + loss (object): YOLOv3Loss instance + iou_aware (bool): whether to use iou_aware + iou_aware_factor (float): iou aware factor + data_format (str): data format, NCHW or NHWC + """ + super(YOLOv3Head, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.loss = loss + + self.iou_aware = iou_aware + self.iou_aware_factor = iou_aware_factor + + self.parse_anchor(anchors, anchor_masks) + self.num_outputs = len(self.anchors) + self.data_format = data_format + + self.yolo_outputs = [] + for i in range(len(self.anchors)): + + if self.iou_aware: + num_filters = len(self.anchors[i]) * (self.num_classes + 6) + else: + num_filters = len(self.anchors[i]) * (self.num_classes + 5) + name = 'yolo_output.{}'.format(i) + conv = nn.Conv2D( + in_channels=self.in_channels[i], + out_channels=num_filters, + kernel_size=1, + stride=1, + padding=0, + data_format=data_format, + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + conv.skip_quant = True + yolo_output = self.add_sublayer(name, conv) + self.yolo_outputs.append(yolo_output) + + def parse_anchor(self, anchors, anchor_masks): + self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] + self.mask_anchors = [] + anchor_num = len(anchors) + for masks in anchor_masks: + self.mask_anchors.append([]) + for mask in masks: + assert mask < anchor_num, "anchor mask index overflow" + self.mask_anchors[-1].extend(anchors[mask]) + + def forward(self, feats, targets=None): + assert len(feats) == len(self.anchors) + yolo_outputs = [] + for i, feat in enumerate(feats): + yolo_output = self.yolo_outputs[i](feat) + if self.data_format == 'NHWC': + yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) + yolo_outputs.append(yolo_output) + + if self.training: + return self.loss(yolo_outputs, targets, self.anchors) + else: + if self.iou_aware: + y = [] + for i, out in enumerate(yolo_outputs): + na = len(self.anchors[i]) + ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] + b, c, h, w = x.shape + no = c // na + x = x.reshape((b, na, no, h * w)) + ioup = ioup.reshape((b, na, 1, h * w)) + obj = x[:, :, 4:5, :] + ioup = F.sigmoid(ioup) + obj = F.sigmoid(obj) + obj_t = (obj**(1 - 
self.iou_aware_factor)) * ( + ioup**self.iou_aware_factor) + obj_t = _de_sigmoid(obj_t) + loc_t = x[:, :, :4, :] + cls_t = x[:, :, 5:, :] + y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) + y_t = y_t.reshape((b, c, h, w)) + y.append(y_t) + return y + else: + return yolo_outputs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + +@register +class YOLOXHead(nn.Layer): + __shared__ = [ + 'num_classes', 'width_mult', 'act', 'trt', 'exclude_nms', + 'exclude_post_process' + ] + __inject__ = ['assigner', 'nms'] + + def __init__(self, + num_classes=80, + width_mult=1.0, + depthwise=False, + in_channels=[256, 512, 1024], + feat_channels=256, + fpn_strides=(8, 16, 32), + l1_epoch=285, + act='silu', + assigner=SimOTAAssigner(use_vfl=False), + nms='MultiClassNMS', + loss_weight={ + 'cls': 1.0, + 'obj': 1.0, + 'iou': 5.0, + 'l1': 1.0, + }, + trt=False, + exclude_post_process=False, + exclude_nms=False): + super(YOLOXHead, self).__init__() + self._dtype = paddle.framework.get_default_dtype() + self.num_classes = num_classes + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + feat_channels = int(feat_channels * width_mult) + self.fpn_strides = fpn_strides + self.l1_epoch = l1_epoch + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.loss_weight = loss_weight + self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 + + ConvBlock = DWConv if depthwise else BaseConv + + self.stem_conv = nn.LayerList() + self.conv_cls = nn.LayerList() + self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj + for in_c in self.in_channels: + self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) + + self.conv_cls.append( + nn.Sequential(* [ + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + nn.Conv2D( + feat_channels, + self.num_classes, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + ])) + + self.conv_reg.append( + nn.Sequential(* [ + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + ConvBlock( + feat_channels, feat_channels, 3, 1, act=act), + nn.Conv2D( + feat_channels, + 4 + 1, # reg [x,y,w,h] + obj + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + ])) + + self._init_weights() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _init_weights(self): + bias_cls = bias_init_with_prob(0.01) + bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) + bias_reg[:2] = 0. 
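+        # xy offsets start at 0, w/h keep log(5) so decoded boxes start a few grid
+        # cells wide, and the last (objectness) channel gets the prior-prob bias
+        # assigned on the next line.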
+ bias_reg[-1] = bias_cls + for cls_, reg_ in zip(self.conv_cls, self.conv_reg): + constant_(cls_[-1].weight) + constant_(cls_[-1].bias, bias_cls) + constant_(reg_[-1].weight) + reg_[-1].bias.set_value(bias_reg) + + def _generate_anchor_point(self, feat_sizes, strides, offset=0.): + anchor_points, stride_tensor = [], [] + num_anchors_list = [] + for feat_size, stride in zip(feat_sizes, strides): + h, w = feat_size + x = (paddle.arange(w) + offset) * stride + y = (paddle.arange(h) + offset) * stride + y, x = paddle.meshgrid(y, x) + anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) + stride_tensor.append( + paddle.full( + [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) + num_anchors_list.append(len(anchor_points[-1])) + anchor_points = paddle.concat(anchor_points).astype(self._dtype) + anchor_points.stop_gradient = True + stride_tensor = paddle.concat(stride_tensor) + stride_tensor.stop_gradient = True + return anchor_points, stride_tensor, num_anchors_list + + def forward(self, feats, targets=None): + assert len(feats) == len(self.fpn_strides), \ + "The size of feats is not equal to size of fpn_strides" + + feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] + cls_score_list, reg_pred_list = [], [] + obj_score_list = [] + for i, feat in enumerate(feats): + feat = self.stem_conv[i](feat) + cls_logit = self.conv_cls[i](feat) + reg_pred = self.conv_reg[i](feat) + # cls prediction + cls_score = F.sigmoid(cls_logit) + cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) + # reg prediction + reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) + reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) + reg_pred_list.append(reg_xywh) + # obj prediction + obj_score = F.sigmoid(obj_logit) + obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_pred_list = paddle.concat(reg_pred_list, axis=1) + obj_score_list = paddle.concat(obj_score_list, axis=1) + + # bbox decode + anchor_points, stride_tensor, _ =\ + self._generate_anchor_point(feat_sizes, self.fpn_strides) + reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) + reg_xy += (anchor_points / stride_tensor) + reg_wh = paddle.exp(reg_wh) * 0.5 + bbox_pred_list = paddle.concat( + [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) + + if self.training: + anchor_points, stride_tensor, num_anchors_list =\ + self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) + yolox_losses = self.get_loss([ + cls_score_list, bbox_pred_list, obj_score_list, anchor_points, + stride_tensor, num_anchors_list + ], targets) + return yolox_losses + else: + pred_scores = (cls_score_list * obj_score_list).sqrt() + return pred_scores, bbox_pred_list, stride_tensor + + def get_loss(self, head_outs, targets): + pred_cls, pred_bboxes, pred_obj,\ + anchor_points, stride_tensor, num_anchors_list = head_outs + gt_labels = targets['gt_class'] + gt_bboxes = targets['gt_bbox'] + pred_scores = (pred_cls * pred_obj).sqrt() + # label assignment + center_and_strides = paddle.concat( + [anchor_points, stride_tensor, stride_tensor], axis=-1) + pos_num_list, label_list, bbox_target_list = [], [], [] + for pred_score, pred_bbox, gt_box, gt_label in zip( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): + pos_num, label, _, bbox_target = self.assigner( + pred_score, center_and_strides, pred_bbox, gt_box, gt_label) + pos_num_list.append(pos_num) + label_list.append(label) + bbox_target_list.append(bbox_target) + labels = 
paddle.to_tensor(np.stack(label_list, axis=0)) + bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) + bbox_targets /= stride_tensor # rescale bbox + + # 1. obj score loss + mask_positive = (labels != self.num_classes) + loss_obj = F.binary_cross_entropy( + pred_obj, + mask_positive.astype(pred_obj.dtype).unsqueeze(-1), + reduction='sum') + + num_pos = sum(pos_num_list) + + if num_pos > 0: + num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) + loss_obj /= num_pos + + # 2. iou loss + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + bbox_targets, bbox_mask).reshape([-1, 4]) + bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) + bbox_iou = paddle.diag(bbox_iou) + + loss_iou = self.iou_loss( + pred_bboxes_pos.split( + 4, axis=-1), + assigned_bboxes_pos.split( + 4, axis=-1)) + loss_iou = loss_iou.sum() / num_pos + + # 3. cls loss + cls_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, self.num_classes]) + pred_cls_pos = paddle.masked_select( + pred_cls, cls_mask).reshape([-1, self.num_classes]) + assigned_cls_pos = paddle.masked_select(labels, mask_positive) + assigned_cls_pos = F.one_hot(assigned_cls_pos, + self.num_classes + 1)[..., :-1] + assigned_cls_pos *= bbox_iou.unsqueeze(-1) + loss_cls = F.binary_cross_entropy( + pred_cls_pos, assigned_cls_pos, reduction='sum') + loss_cls /= num_pos + + # 4. l1 loss + if targets['epoch_id'] >= self.l1_epoch: + loss_l1 = F.l1_loss( + pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') + loss_l1 /= num_pos + else: + loss_l1 = paddle.zeros([1]) + loss_l1.stop_gradient = False + else: + loss_cls = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss_l1 = paddle.zeros([1]) + loss_cls.stop_gradient = False + loss_iou.stop_gradient = False + loss_l1.stop_gradient = False + + loss = self.loss_weight['obj'] * loss_obj + \ + self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + + if targets['epoch_id'] >= self.l1_epoch: + loss += (self.loss_weight['l1'] * loss_l1) + + yolox_losses = { + 'loss': loss, + 'loss_cls': loss_cls, + 'loss_obj': loss_obj, + 'loss_iou': loss_iou, + 'loss_l1': loss_l1, + } + return yolox_losses + + def post_process(self, head_outs, img_shape, scale_factor): + pred_scores, pred_bboxes, stride_tensor = head_outs + pred_scores = pred_scores.transpose([0, 2, 1]) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1) + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolof_head.py b/ppdet/modeling/heads/yolof_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4893337366e5bd9e828bc08ad9b2e41f0002fa98 --- /dev/null +++ b/ppdet/modeling/heads/yolof_head.py @@ -0,0 +1,368 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant + +from ppdet.modeling.layers import MultiClassNMS +from ppdet.core.workspace import register +from ppdet.modeling.bbox_utils import delta2bbox_v2 + +__all__ = ['YOLOFHead'] + +INF = 1e8 + + +def reduce_mean(tensor): + world_size = paddle.distributed.get_world_size() + if world_size == 1: + return tensor + paddle.distributed.all_reduce(tensor) + return tensor / world_size + + +def find_inside_anchor(feat_size, stride, num_anchors, im_shape): + feat_h, feat_w = feat_size[:2] + im_h, im_w = im_shape[:2] + inside_h = min(int(np.ceil(im_h / stride)), feat_h) + inside_w = min(int(np.ceil(im_w / stride)), feat_w) + inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool) + inside_mask[:inside_h, :inside_w] = True + inside_mask = inside_mask.unsqueeze(-1).expand( + [feat_h, feat_w, num_anchors]) + return inside_mask.reshape([-1]) + + +@register +class YOLOFFeat(nn.Layer): + def __init__(self, + feat_in=256, + feat_out=256, + num_cls_convs=2, + num_reg_convs=4, + norm_type='bn'): + super(YOLOFFeat, self).__init__() + assert norm_type == 'bn', "YOLOFFeat only support BN now." + self.feat_in = feat_in + self.feat_out = feat_out + self.num_cls_convs = num_cls_convs + self.num_reg_convs = num_reg_convs + self.norm_type = norm_type + + cls_subnet, reg_subnet = [], [] + for i in range(self.num_cls_convs): + feat_in = self.feat_in if i == 0 else self.feat_out + cls_subnet.append( + nn.Conv2D( + feat_in, + self.feat_out, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0.0)))) + cls_subnet.append( + nn.BatchNorm2D( + self.feat_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) + cls_subnet.append(nn.ReLU()) + + for i in range(self.num_reg_convs): + feat_in = self.feat_in if i == 0 else self.feat_out + reg_subnet.append( + nn.Conv2D( + feat_in, + self.feat_out, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0.0)))) + reg_subnet.append( + nn.BatchNorm2D( + self.feat_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) + reg_subnet.append(nn.ReLU()) + + self.cls_subnet = nn.Sequential(*cls_subnet) + self.reg_subnet = nn.Sequential(*reg_subnet) + + def forward(self, fpn_feat): + cls_feat = self.cls_subnet(fpn_feat) + reg_feat = self.reg_subnet(fpn_feat) + return cls_feat, reg_feat + + +@register +class YOLOFHead(nn.Layer): + __shared__ = ['num_classes', 'trt', 'exclude_nms'] + __inject__ = [ + 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', + 'loss_bbox', 'nms' + ] + + def __init__(self, + num_classes=80, + conv_feat='YOLOFFeat', + anchor_generator='AnchorGenerator', + bbox_assigner='UniformAssigner', + loss_class='FocalLoss', + loss_bbox='GIoULoss', + 
ctr_clip=32.0, + delta_mean=[0.0, 0.0, 0.0, 0.0], + delta_std=[1.0, 1.0, 1.0, 1.0], + nms='MultiClassNMS', + prior_prob=0.01, + nms_pre=1000, + use_inside_anchor=False, + trt=False, + exclude_nms=False): + super(YOLOFHead, self).__init__() + self.num_classes = num_classes + self.conv_feat = conv_feat + self.anchor_generator = anchor_generator + self.na = self.anchor_generator.num_anchors + self.bbox_assigner = bbox_assigner + self.loss_class = loss_class + self.loss_bbox = loss_bbox + self.ctr_clip = ctr_clip + self.delta_mean = delta_mean + self.delta_std = delta_std + self.nms = nms + self.nms_pre = nms_pre + self.use_inside_anchor = use_inside_anchor + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + + bias_init_value = -math.log((1 - prior_prob) / prior_prob) + self.cls_score = self.add_sublayer( + 'cls_score', + nn.Conv2D( + in_channels=conv_feat.feat_out, + out_channels=self.num_classes * self.na, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant( + value=bias_init_value)))) + + self.bbox_pred = self.add_sublayer( + 'bbox_pred', + nn.Conv2D( + in_channels=conv_feat.feat_out, + out_channels=4 * self.na, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0)))) + + self.object_pred = self.add_sublayer( + 'object_pred', + nn.Conv2D( + in_channels=conv_feat.feat_out, + out_channels=self.na, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(value=0)))) + + def forward(self, feats, targets=None): + assert len(feats) == 1, "YOLOF only has one level feature." 
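+        # The block below folds objectness into the per-class logits. Since
+        #   sigmoid(c + o - log(1 + e^c + e^o)) == sigmoid(c) * sigmoid(o),
+        # a single sigmoid at inference yields P(class) * P(object); the
+        # clip(..., max=INF) calls only guard exp() against overflow.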
+ conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0]) + cls_logits = self.cls_score(conv_cls_feat) + objectness = self.object_pred(conv_reg_feat) + bboxes_reg = self.bbox_pred(conv_reg_feat) + + N, C, H, W = paddle.shape(cls_logits)[:] + cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W)) + objectness = objectness.reshape((N, self.na, 1, H, W)) + norm_cls_logits = cls_logits + objectness - paddle.log( + 1.0 + paddle.clip( + cls_logits.exp(), max=INF) + paddle.clip( + objectness.exp(), max=INF)) + norm_cls_logits = norm_cls_logits.reshape((N, C, H, W)) + + anchors = self.anchor_generator([norm_cls_logits]) + + if self.training: + yolof_losses = self.get_loss( + [anchors[0], norm_cls_logits, bboxes_reg], targets) + return yolof_losses + else: + return anchors[0], norm_cls_logits, bboxes_reg + + def get_loss(self, head_outs, targets): + anchors, cls_logits, bbox_preds = head_outs + + feat_size = cls_logits.shape[-2:] + cls_logits = cls_logits.transpose([0, 2, 3, 1]) + cls_logits = cls_logits.reshape([0, -1, self.num_classes]) + bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) + bbox_preds = bbox_preds.reshape([0, -1, 4]) + + num_pos_list = [] + cls_pred_list, cls_tar_list = [], [] + reg_pred_list, reg_tar_list = [], [] + # find and gather preds and targets in each image + for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip( + cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'], + targets['im_shape']): + if self.use_inside_anchor: + inside_mask = find_inside_anchor( + feat_size, self.anchor_generator.strides[0], self.na, + im_shape.tolist()) + cls_logit = cls_logit[inside_mask] + bbox_pred = bbox_pred[inside_mask] + anchors = anchors[inside_mask] + + bbox_pred = delta2bbox_v2( + bbox_pred, + anchors, + self.delta_mean, + self.delta_std, + ctr_clip=self.ctr_clip) + bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) + + # -2:ignore, -1:neg, >=0:pos + match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner( + bbox_pred, anchors, gt_bbox) + pos_mask = (match_labels >= 0) + neg_mask = (match_labels == -1) + chosen_mask = paddle.logical_or(pos_mask, neg_mask) + + gt_class = gt_class.reshape([-1]) + bg_class = paddle.to_tensor( + [self.num_classes], dtype=gt_class.dtype) + # a trick to assign num_classes to negative targets + gt_class = paddle.concat([gt_class, bg_class], axis=-1) + match_labels = paddle.where( + neg_mask, + paddle.full_like(match_labels, gt_class.size - 1), match_labels) + num_pos_list.append(max(1.0, pos_mask.sum().item())) + + cls_pred_list.append(cls_logit[chosen_mask]) + cls_tar_list.append(gt_class[match_labels[chosen_mask]]) + reg_pred_list.append(pos_bbox_pred) + reg_tar_list.append(pos_bbox_tar) + + num_tot_pos = paddle.to_tensor(sum(num_pos_list)) + num_tot_pos = reduce_mean(num_tot_pos).item() + num_tot_pos = max(1.0, num_tot_pos) + + cls_pred = paddle.concat(cls_pred_list) + cls_tar = paddle.concat(cls_tar_list) + cls_loss = self.loss_class( + cls_pred, cls_tar, reduction='sum') / num_tot_pos + + reg_pred_list = [_ for _ in reg_pred_list if _ is not None] + reg_tar_list = [_ for _ in reg_tar_list if _ is not None] + if len(reg_pred_list) == 0: + reg_loss = bbox_preds.sum() * 0.0 + else: + reg_pred = paddle.concat(reg_pred_list) + reg_tar = paddle.concat(reg_tar_list) + reg_loss = self.loss_bbox(reg_pred, reg_tar).sum() / num_tot_pos + + yolof_losses = { + 'loss': cls_loss + reg_loss, + 'loss_cls': cls_loss, + 'loss_reg': reg_loss, + } + return yolof_losses + + def get_bboxes_single(self, + anchors, + cls_scores, + 
bbox_preds, + im_shape, + scale_factor, + rescale=True): + assert len(cls_scores) == len(bbox_preds) + mlvl_bboxes = [] + mlvl_scores = [] + for anchor, cls_score, bbox_pred in zip(anchors, cls_scores, + bbox_preds): + cls_score = cls_score.reshape([-1, self.num_classes]) + bbox_pred = bbox_pred.reshape([-1, 4]) + if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: + max_score = cls_score.max(axis=1) + _, topk_inds = max_score.topk(self.nms_pre) + bbox_pred = bbox_pred.gather(topk_inds) + anchor = anchor.gather(topk_inds) + cls_score = cls_score.gather(topk_inds) + + bbox_pred = delta2bbox_v2( + bbox_pred, + anchor, + self.delta_mean, + self.delta_std, + max_shape=im_shape, + ctr_clip=self.ctr_clip).squeeze() + mlvl_bboxes.append(bbox_pred) + mlvl_scores.append(F.sigmoid(cls_score)) + mlvl_bboxes = paddle.concat(mlvl_bboxes) + mlvl_bboxes = paddle.squeeze(mlvl_bboxes) + if rescale: + mlvl_bboxes = mlvl_bboxes / paddle.concat( + [scale_factor[::-1], scale_factor[::-1]]) + mlvl_scores = paddle.concat(mlvl_scores) + mlvl_scores = mlvl_scores.transpose([1, 0]) + return mlvl_bboxes, mlvl_scores + + def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor): + batch_bboxes = [] + batch_scores = [] + for img_id in range(cls_scores[0].shape[0]): + num_lvls = len(cls_scores) + cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)] + bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)] + bboxes, scores = self.get_bboxes_single( + anchors, cls_score_list, bbox_pred_list, im_shape[img_id], + scale_factor[img_id]) + batch_bboxes.append(bboxes) + batch_scores.append(scores) + batch_bboxes = paddle.stack(batch_bboxes, 0) + batch_scores = paddle.stack(batch_scores, 0) + return batch_bboxes, batch_scores + + def post_process(self, head_outs, im_shape, scale_factor): + anchors, cls_scores, bbox_preds = head_outs + cls_scores = cls_scores.transpose([0, 2, 3, 1]) + bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) + pred_bboxes, pred_scores = self.decode( + [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor) + + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes.sum(), pred_scores.sum() + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolov5_head.py b/ppdet/modeling/heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..11715b4b933720fb7326b4a2d00f147e95b8e709 --- /dev/null +++ b/ppdet/modeling/heads/yolov5_head.py @@ -0,0 +1,194 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
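The YOLOv5 head added in this file decodes each raw prediction with the sigmoid/grid/anchor arithmetic of postprocessing_by_level further down. A minimal, illustrative box-decode sketch (NumPy only, all numeric values invented; not part of the patch):

import numpy as np

# Decode one YOLOv5 prediction at grid cell (gx, gy) on a level with stride 8
# and anchor (anchor_w, anchor_h); mirrors postprocessing_by_level below.
tx, ty, tw, th = 0.2, -0.1, 0.3, 0.0          # raw (pre-sigmoid) outputs, made up
sig = 1.0 / (1.0 + np.exp(-np.array([tx, ty, tw, th])))

gx, gy, stride = 10.0, 7.0, 8.0
anchor_w, anchor_h = 16.0, 30.0

cx = (sig[0] * 2.0 - 0.5 + gx) * stride       # centre offset lies in (-0.5, 1.5) cells
cy = (sig[1] * 2.0 - 0.5 + gy) * stride
w = (sig[2] * 2.0) ** 2 * anchor_w            # width capped at 4x the anchor
h = (sig[3] * 2.0) ** 2 * anchor_h
x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
print(round(x1, 2), round(y1, 2), round(x2, 2), round(y2, 2))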
+ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register +from ppdet.modeling.layers import MultiClassNMS + +__all__ = ['YOLOv5Head'] + + +@register +class YOLOv5Head(nn.Layer): + __shared__ = [ + 'num_classes', 'data_format', 'trt', 'exclude_nms', + 'exclude_post_process' + ] + __inject__ = ['loss', 'nms'] + + def __init__(self, + num_classes=80, + in_channels=[256, 512, 1024], + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + anchor_masks=[[0, 1, 2], [3, 4, 5], [6, 7, 8]], + stride=[8, 16, 32], + loss='YOLOv5Loss', + data_format='NCHW', + nms='MultiClassNMS', + trt=False, + exclude_post_process=False, + exclude_nms=False): + """ + Head for YOLOv5 + + Args: + num_classes (int): number of foreground classes + in_channels (int): channels of input features + anchors (list): anchors + anchor_masks (list): anchor masks + stride (list): strides + loss (object): YOLOv5Loss instance + data_format (str): nms format, NCHW or NHWC + loss (object): YOLOv5Loss instance + """ + super(YOLOv5Head, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.num_classes = num_classes + self.in_channels = in_channels + + self.parse_anchor(anchors, anchor_masks) + self.anchors = paddle.to_tensor(self.anchors, dtype='float32') + self.anchor_levels = len(self.anchors) + + self.stride = stride + self.loss = loss + self.data_format = data_format + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + + self.num_anchor = len(self.anchors[0]) # self.na + self.num_out_ch = self.num_classes + 5 # self.no + + self.yolo_outputs = [] + for i in range(len(self.anchors)): + num_filters = self.num_anchor * self.num_out_ch + name = 'yolo_output.{}'.format(i) + conv = nn.Conv2D( + in_channels=self.in_channels[i], + out_channels=num_filters, + kernel_size=1, + stride=1, + padding=0, + data_format=data_format, + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + conv.skip_quant = True + yolo_output = self.add_sublayer(name, conv) + self.yolo_outputs.append(yolo_output) + + self._initialize_biases() + + def _initialize_biases(self): + # initialize biases into Detect() + # https://arxiv.org/abs/1708.02002 section 3.3 + for i, conv in enumerate(self.yolo_outputs): + b = conv.bias.numpy().reshape([3, -1]) + b[:, 4] += math.log(8 / (640 / self.stride[i])**2) + b[:, 5:self.num_classes + 5] += math.log(0.6 / ( + self.num_classes - 0.999999)) + conv.bias.set_value(b.reshape([-1])) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def parse_anchor(self, anchors, anchor_masks): + self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] + self.mask_anchors = [] + anchor_num = len(anchors) + for masks in anchor_masks: + self.mask_anchors.append([]) + for mask in masks: + assert mask < anchor_num, "anchor mask index overflow" + self.mask_anchors[-1].extend(anchors[mask]) + + def forward(self, feats, targets=None): + assert len(feats) == len(self.anchors) + yolo_outputs = [] + for i, feat in enumerate(feats): + yolo_output = self.yolo_outputs[i](feat) + if self.data_format == 'NHWC': + yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) + yolo_outputs.append(yolo_output) + + if self.training: + return 
self.loss(yolo_outputs, targets, self.anchors) + else: + return yolo_outputs + + def make_grid(self, nx, ny, anchor): + yv, xv = paddle.meshgrid([ + paddle.arange( + ny, dtype='float32'), paddle.arange( + nx, dtype='float32') + ]) + + grid = paddle.stack( + (xv, yv), axis=2).expand([1, self.num_anchor, ny, nx, 2]) + anchor_grid = anchor.reshape([1, self.num_anchor, 1, 1, 2]).expand( + (1, self.num_anchor, ny, nx, 2)) + return grid, anchor_grid + + def postprocessing_by_level(self, head_out, stride, anchor, ny, nx): + grid, anchor_grid = self.make_grid(nx, ny, anchor) + out = F.sigmoid(head_out) + xy = (out[..., 0:2] * 2. - 0.5 + grid) * stride + wh = (out[..., 2:4] * 2)**2 * anchor_grid + lt_xy = (xy - wh / 2.) + rb_xy = (xy + wh / 2.) + bboxes = paddle.concat((lt_xy, rb_xy), axis=-1) + scores = out[..., 5:] * out[..., 4].unsqueeze(-1) + return bboxes, scores + + def post_process(self, head_outs, im_shape, scale_factor): + bbox_list, score_list = [], [] + for i, head_out in enumerate(head_outs): + _, _, ny, nx = head_out.shape + head_out = head_out.reshape( + [-1, self.num_anchor, self.num_out_ch, ny, nx]).transpose( + [0, 1, 3, 4, 2]) + # head_out.shape [bs, self.num_anchor, ny, nx, self.num_out_ch] + + bbox, score = self.postprocessing_by_level(head_out, self.stride[i], + self.anchors[i], ny, nx) + bbox = bbox.reshape([-1, self.num_anchor * ny * nx, 4]) + score = score.reshape( + [-1, self.num_anchor * ny * nx, self.num_classes]).transpose( + [0, 2, 1]) + bbox_list.append(bbox) + score_list.append(score) + pred_bboxes = paddle.concat(bbox_list, axis=1) + pred_scores = paddle.concat(score_list, axis=-1) + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1) + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolov6_head.py b/ppdet/modeling/heads/yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4f96e951e5418cb498d03b085de5e6d3324e44a8 --- /dev/null +++ b/ppdet/modeling/heads/yolov6_head.py @@ -0,0 +1,1271 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
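The YOLOv6 EffiDeHead defined in this file predicts each box side as a distribution over reg_max + 1 bins (DFL) and decodes it as that distribution's expectation, either via matmul with self.proj in _bbox_decode or via the frozen 1x1 proj_conv at eval time. A minimal, illustrative NumPy sketch of that decoding step (shapes and values invented; not part of the patch):

import numpy as np

reg_max = 16
proj = np.arange(reg_max + 1, dtype=np.float32)          # bin values 0 .. reg_max

# Raw regression logits for one anchor point: 4 sides x (reg_max + 1) bins.
logits = np.random.randn(4, reg_max + 1).astype(np.float32)

# Softmax over the bins, then take the expectation: this is what the frozen
# proj_conv (a 1x1 conv whose weights are linspace(0, reg_max)) computes.
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
ltrb = probs @ proj                                       # per-side distances in grid units
print(ltrb)

# batch_distance2bbox then turns these (left, top, right, bottom) distances
# around an anchor point into an xyxy box, which post_process scales by the
# level stride to get coordinates in input-image pixels.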
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register + +from ..bbox_utils import batch_distance2bbox +from ..losses import GIoULoss, SIoULoss +from ..initializer import bias_init_with_prob, constant_, normal_ +from ..assigners.utils import generate_anchors_for_grid_cell +from ..backbones.yolov6_efficientrep import BaseConv, DPBlock +from ppdet.modeling.ops import get_static_shape +from ppdet.modeling.layers import MultiClassNMS + +__all__ = [ + 'EffiDeHead', 'EffiDeHead_distill_ns', 'EffiDeHead_fuseab', + 'Lite_EffideHead' +] + + +@register +class EffiDeHead(nn.Layer): + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', + 'exclude_post_process', 'self_distill' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + + def __init__( + self, + in_channels=[128, 256, 512], + num_classes=80, + fpn_strides=[8, 16, 32], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + anchors=1, + reg_max=16, # reg_max=0 if use_dfl is False + use_dfl=True, # False in n/s version, True in m/l version + static_assigner_epoch=4, # warmup_epoch + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + eval_size=None, + iou_type='giou', # 'siou' in n version + loss_weight={ + 'cls': 1.0, + 'iou': 2.5, + 'dfl': 0.5, # used in m/l version + 'cwd': 10.0, # used when self_distill=True, in m/l version + }, + nms='MultiClassNMS', + trt=False, + exclude_nms=False, + exclude_post_process=False, + self_distill=False, + distill_weight={ + 'cls': 1.0, + 'dfl': 1.0, + }, + print_l1_loss=True): + super(EffiDeHead, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + self.use_dfl = use_dfl + + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.anchors = anchors + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.eval_size = eval_size + self.iou_loss = GIoULoss() + assert iou_type in ['giou', 'siou'], "only support giou and siou loss." 
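+        # GIoULoss is constructed unconditionally and swapped for SIoULoss just
+        # below when a config (e.g. the YOLOv6-n variant) passes iou_type='siou'.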
+ if iou_type == 'siou': + self.iou_loss = SIoULoss() + self.loss_weight = loss_weight + + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.print_l1_loss = print_l1_loss + + # for self-distillation + self.self_distill = self_distill + self.distill_weight = distill_weight + + # Init decouple head + self.stems = nn.LayerList() + self.cls_convs = nn.LayerList() + self.cls_preds = nn.LayerList() + self.reg_convs = nn.LayerList() + self.reg_preds = nn.LayerList() + + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + reg_ch = self.reg_max + self.na + cls_ch = self.num_classes * self.na + for in_c in self.in_channels: + self.stems.append(BaseConv(in_c, in_c, 1, 1)) + + self.cls_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.cls_preds.append( + nn.Conv2D( + in_c, cls_ch, 1, bias_attr=bias_attr)) + + self.reg_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.reg_preds.append( + nn.Conv2D( + in_c, 4 * reg_ch, 1, bias_attr=bias_attr)) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + self.print_l1_loss = print_l1_loss + self._initialize_biases() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _initialize_biases(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.cls_preds, self.reg_preds): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + + if self.eval_size: + anchor_points, stride_tensor = self._generate_anchors() + self.anchor_points = anchor_points + self.stride_tensor = stride_tensor + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list = [], [] + for i, feat in enumerate(feats): + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + # cls and reg + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).permute((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).permute((0, 2, 1))) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + + return self.get_loss([ + cls_score_list, reg_distri_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_dist_list = [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + feat = self.stems[i](feat) + cls_x = 
feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + if self.use_dfl: + reg_output = reg_output.reshape( + [-1, 4, self.reg_max + 1, l]).transpose([0, 2, 1, 3]) + reg_output = self.proj_conv(F.softmax(reg_output, 1)) + + # cls and reg + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.reshape([-1, self.num_classes, l])) + reg_dist_list.append(reg_output.reshape([-1, 4, l])) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + reg_dist_list = paddle.concat(reg_dist_list, axis=-1) + + return cls_score_list, reg_dist_list, anchor_points, stride_tensor + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + @staticmethod + def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + loss = F.binary_cross_entropy( + pred_score, gt_score, weight=weight, reduction='sum') + return loss + + def _bbox_decode(self, anchor_points, pred_dist): + ### diff with PPYOLOEHead + if self.use_dfl: + b, l, _ = get_static_shape(pred_dist) + pred_dist = F.softmax( + pred_dist.reshape([b, l, 4, self.reg_max + 1])).matmul( + self.proj) + return batch_distance2bbox(anchor_points, pred_dist) + + def _bbox2distance(self, points, bbox): + x1y1, x2y2 = paddle.split(bbox, 2, -1) + lt = points - x1y1 + rb = x2y2 - points + return paddle.concat([lt, rb], -1).clip(0, self.reg_max - 0.01) + + def _df_loss(self, pred_dist, target): + target_left = paddle.cast(target, 'int64') + target_right = target_left + 1 + weight_left = target_right.astype('float32') - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist, target_left, reduction='none') * weight_left + loss_right = F.cross_entropy( + pred_dist, target_right, reduction='none') * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + + def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, + assigned_bboxes, assigned_scores, assigned_scores_sum): + # select positive samples mask + mask_positive = (assigned_labels != self.num_classes) + num_pos = mask_positive.sum() + # pos/neg loss + if num_pos > 0: + # iou loss + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + assigned_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = paddle.masked_select( + assigned_scores.sum(-1), mask_positive).unsqueeze(-1) + loss_iou = self.iou_loss(pred_bboxes_pos, + assigned_bboxes_pos) * bbox_weight + loss_iou = loss_iou.sum() / assigned_scores_sum + + # l1 loss just see the 
convergence, same in PPYOLOEHead + loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) + + # dfl loss ### diff with PPYOLOEHead + if self.use_dfl: + dist_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = paddle.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + assigned_ltrb = self._bbox2distance(anchor_points, + assigned_bboxes) + assigned_ltrb_pos = paddle.masked_select( + assigned_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + assigned_ltrb_pos) * bbox_weight + loss_dfl = loss_dfl.sum() / assigned_scores_sum + else: + loss_dfl = pred_dist.sum() * 0. + else: + loss_l1 = paddle.zeros([1]) + loss_iou = paddle.zeros([1]) + loss_dfl = pred_dist.sum() * 0. + return loss_l1, loss_iou, loss_dfl + + def get_loss(self, head_outs, gt_meta): + pred_scores, pred_distri, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + + # cls loss: varifocal_loss + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + one_hot_label) + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum = paddle.clip( + assigned_scores_sum / paddle.distributed.get_world_size(), + min=1) + loss_cls /= assigned_scores_sum + + # bbox loss + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + + if self.use_dfl: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + 'loss_dfl': loss_dfl, + } + else: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + } + + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1}) + return out_dict + + def post_process(self, head_outs, im_shape, scale_factor): + pred_scores, pred_dist, anchor_points, stride_tensor = head_outs + pred_bboxes = batch_distance2bbox(anchor_points, + pred_dist.transpose([0, 2, 1])) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 
2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num + + +@register +class EffiDeHead_distill_ns(EffiDeHead): + # add reg_preds_lrtb + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', + 'exclude_post_process', 'self_distill' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + + def __init__( + self, + in_channels=[128, 256, 512], + num_classes=80, + fpn_strides=[8, 16, 32], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + anchors=1, + reg_max=16, # reg_max=0 if use_dfl is False + use_dfl=True, # False in n/s version, True in m/l version + static_assigner_epoch=4, # warmup_epoch + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + eval_size=None, + iou_type='giou', # 'siou' in n version + loss_weight={ + 'cls': 1.0, + 'iou': 2.5, + 'dfl': 0.5, # used in m/l version + 'cwd': 10.0, # used when self_distill=True, in m/l version + }, + nms='MultiClassNMS', + trt=False, + exclude_nms=False, + exclude_post_process=False, + self_distill=False, + distill_weight={ + 'cls': 1.0, + 'dfl': 1.0, + }, + print_l1_loss=True): + super(EffiDeHead_distill_ns, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + self.use_dfl = use_dfl + + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.anchors = anchors + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.eval_size = eval_size + self.iou_loss = GIoULoss() + assert iou_type in ['giou', 'siou'], "only support giou and siou loss." 
+ if iou_type == 'siou': + self.iou_loss = SIoULoss() + self.loss_weight = loss_weight + + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.print_l1_loss = print_l1_loss + + # for self-distillation + self.self_distill = self_distill + self.distill_weight = distill_weight + + # Init decouple head + self.stems = nn.LayerList() + self.cls_convs = nn.LayerList() + self.cls_preds = nn.LayerList() + self.reg_convs = nn.LayerList() + self.reg_preds = nn.LayerList() + self.reg_preds_lrtb = nn.LayerList() + + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + reg_ch = self.reg_max + self.na + for in_c in self.in_channels: + self.stems.append(BaseConv(in_c, in_c, 1, 1)) + + self.cls_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.cls_preds.append( + nn.Conv2D( + in_c, self.num_classes, 1, bias_attr=bias_attr)) + + self.reg_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.reg_preds.append( + nn.Conv2D( + in_c, 4 * reg_ch, 1, bias_attr=bias_attr)) + + self.reg_preds_lrtb.append( + nn.Conv2D( + in_c, 4 * self.na, 1, bias_attr=bias_attr)) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + + self.print_l1_loss = print_l1_loss + self._initialize_biases() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _initialize_biases(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.cls_preds, self.reg_preds): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + + if self.eval_size: + anchor_points, stride_tensor = self._generate_anchors() + self.anchor_points = anchor_points + self.stride_tensor = stride_tensor + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list, reg_lrtb_list = [], [], [] + for i, feat in enumerate(feats): + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + reg_output_lrtb = self.reg_preds_lrtb[i](reg_feat) + # cls and reg + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).permute((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).permute((0, 2, 1))) + reg_lrtb_list.append(reg_output_lrtb.flatten(2).permute((0, 2, 1))) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + reg_lrtb_list = paddle.concat(reg_lrtb_list, axis=1) + + return self.get_loss([ + cls_score_list, reg_distri_list, reg_lrtb_list, anchors, 
+ anchor_points, num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_lrtb_list = [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + reg_output_lrtb = self.reg_preds_lrtb[i](reg_feat) + # cls and reg_lrtb + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.reshape([-1, self.num_classes, l])) + reg_lrtb_list.append(reg_output_lrtb.reshape([-1, 4, l])) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + reg_lrtb_list = paddle.concat(reg_lrtb_list, axis=-1) + + return cls_score_list, reg_lrtb_list, anchor_points, stride_tensor + + def get_loss(self, head_outs, gt_meta): + pred_scores, pred_distri, pred_ltbrs, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + + # cls loss: varifocal_loss + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + one_hot_label) + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum = paddle.clip( + assigned_scores_sum / paddle.distributed.get_world_size(), + min=1) + loss_cls /= assigned_scores_sum + + # bbox loss + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + + if self.use_dfl: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + 'loss_dfl': loss_dfl, + } + else: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + } + + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1}) + return out_dict + + +@register +class EffiDeHead_fuseab(EffiDeHead): + # add cls_preds_af/reg_preds_af and cls_preds_ab/reg_preds_ab + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', + 'exclude_post_process', 'self_distill' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + + def __init__( + self, + in_channels=[128, 
256, 512], + num_classes=80, + fpn_strides=[8, 16, 32], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + anchors=1, + reg_max=16, # reg_max=0 if use_dfl is False + use_dfl=True, # False in n/s version, True in m/l version + static_assigner_epoch=4, # warmup_epoch + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + eval_size=None, + iou_type='giou', # 'siou' in n version + loss_weight={ + 'cls': 1.0, + 'iou': 2.5, + 'dfl': 0.5, # used in m/l version + 'cwd': 10.0, # used when self_distill=True, in m/l version + }, + nms='MultiClassNMS', + trt=False, + exclude_nms=False, + exclude_post_process=False, + self_distill=False, + distill_weight={ + 'cls': 1.0, + 'dfl': 1.0, + }, + print_l1_loss=True): + super(EffiDeHead_fuseab, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + self.use_dfl = use_dfl + + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.anchors = anchors + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.eval_size = eval_size + self.iou_loss = GIoULoss() + assert iou_type in ['giou', 'siou'], "only support giou and siou loss." + if iou_type == 'siou': + self.iou_loss = SIoULoss() + self.loss_weight = loss_weight + + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.print_l1_loss = print_l1_loss + + # for self-distillation + self.self_distill = self_distill + self.distill_weight = distill_weight + + # Init decouple head + self.stems = nn.LayerList() + self.cls_convs = nn.LayerList() + self.cls_preds = nn.LayerList() + self.reg_convs = nn.LayerList() + self.reg_preds = nn.LayerList() + self.reg_preds_lrtb = nn.LayerList() + + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + reg_ch = self.reg_max + self.na + for in_c in self.in_channels: + self.stems.append(BaseConv(in_c, in_c, 1, 1)) + + self.cls_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.cls_preds.append( + nn.Conv2D( + in_c, self.num_classes, 1, bias_attr=bias_attr)) + + self.reg_convs.append(BaseConv(in_c, in_c, 3, 1)) + self.reg_preds.append( + nn.Conv2D( + in_c, 4 * reg_ch, 1, bias_attr=bias_attr)) + + self.reg_preds_lrtb.append( + nn.Conv2D( + in_c, 4 * self.na, 1, bias_attr=bias_attr)) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + self.print_l1_loss = print_l1_loss + self._initialize_biases() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _initialize_biases(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.cls_preds, self.reg_preds): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + 
self.proj_conv.weight.stop_gradient = True + + if self.eval_size: + anchor_points, stride_tensor = self._generate_anchors() + self.anchor_points = anchor_points + self.stride_tensor = stride_tensor + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list, reg_lrtb_list = [], [], [] + for i, feat in enumerate(feats): + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + reg_output_lrtb = self.reg_preds_lrtb[i](reg_feat) + # cls and reg + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).permute((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).permute((0, 2, 1))) + reg_lrtb_list.append(reg_output_lrtb.flatten(2).permute((0, 2, 1))) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + reg_lrtb_list = paddle.concat(reg_lrtb_list, axis=1) + + return self.get_loss([ + cls_score_list, reg_distri_list, reg_lrtb_list, anchors, + anchor_points, num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_lrtb_list = [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + # reg_output = self.reg_preds[i](reg_feat) + reg_output_lrtb = self.reg_preds_lrtb[i](reg_feat) + # cls and reg_lrtb + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.reshape([-1, self.num_classes, l])) + reg_lrtb_list.append(reg_output_lrtb.reshape([-1, 4, l])) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + reg_lrtb_list = paddle.concat(reg_lrtb_list, axis=-1) + + return cls_score_list, reg_lrtb_list, anchor_points, stride_tensor + + def get_loss(self, head_outs, gt_meta): + pred_scores, pred_distri, pred_ltbrs, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + + # cls loss: varifocal_loss + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + 
one_hot_label) + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum = paddle.clip( + assigned_scores_sum / paddle.distributed.get_world_size(), + min=1) + loss_cls /= assigned_scores_sum + + # bbox loss + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + + if self.use_dfl: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + 'loss_dfl': loss_dfl, + } + else: + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls, + 'loss_iou': loss_iou, + } + + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1}) + return out_dict + + +@register +class Lite_EffideHead(nn.Layer): + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process' + ] + __inject__ = ['static_assigner', 'assigner', 'nms'] + '''Efficient Decoupled Head + With hardware-aware degisn, the decoupled head is optimized with + hybridchannels methods. + ''' + + def __init__( + self, + in_channels=[96, 96, 96, 96], + num_classes=80, + fpn_strides=[8, 16, 32, 64], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + anchors=1, + reg_max=0, + use_dfl=False, + static_assigner_epoch=4, # warmup_epoch + static_assigner='ATSSAssigner', + assigner='TaskAlignedAssigner', + eval_size=None, + iou_type='siou', + loss_weight={ + 'cls': 1.0, + 'iou': 2.5, + }, + nms='MultiClassNMS', + trt=False, + exclude_nms=False, + exclude_post_process=False, + print_l1_loss=True): + super().__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + self.use_dfl = use_dfl + + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.anchors = anchors + + self.static_assigner_epoch = static_assigner_epoch + self.static_assigner = static_assigner + self.assigner = assigner + self.eval_size = eval_size + self.iou_loss = SIoULoss() + self.loss_weight = loss_weight + + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + + self.grid = [paddle.zeros([1])] * len(fpn_strides) + self.prior_prob = 1e-2 + stride = [8, 16, 32] if len(fpn_strides) == 3 else [ + 8, 16, 32, 64 + ] # strides computed during build + self.stride = paddle.to_tensor(stride) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + + # Init decouple head + self.stems = nn.LayerList() + self.cls_convs = nn.LayerList() + self.reg_convs = nn.LayerList() + self.cls_preds = nn.LayerList() + self.reg_preds = nn.LayerList() + + # Efficient decoupled head layers + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + reg_ch = self.reg_max + self.na + cls_ch = self.num_classes * self.na + for in_c in self.in_channels: + self.stems.append(DPBlock(in_c, in_c, 5, 1)) + + self.cls_convs.append(DPBlock(in_c, in_c, 
5, 1)) + self.cls_preds.append( + nn.Conv2D( + in_c, cls_ch, 1, bias_attr=bias_attr)) + + self.reg_convs.append(DPBlock(in_c, in_c, 5, 1)) + self.reg_preds.append( + nn.Conv2D( + in_c, 4 * reg_ch, 1, bias_attr=bias_attr)) + + self.proj_conv = nn.Conv2D(self.reg_max + 1, 1, 1, bias_attr=False) + self.proj_conv.skip_quant = True + + self.proj = paddle.linspace(0, self.reg_max, self.reg_max + 1) + self.proj_conv.weight.set_value( + self.proj.reshape([1, self.reg_max + 1, 1, 1])) + self.proj_conv.weight.stop_gradient = True + self.print_l1_loss = print_l1_loss + self._initialize_biases() + + def _initialize_biases(self): + bias_cls = bias_init_with_prob(0.01) + for cls_, reg_ in zip(self.cls_preds, self.reg_preds): + constant_(cls_.weight) + constant_(cls_.bias, bias_cls) + constant_(reg_.weight) + constant_(reg_.bias, 1.0) + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_score_list, reg_distri_list = [], [] + for i, feat in enumerate(feats): + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).transpose((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).transpose((0, 2, 1))) + + cls_score_list = paddle.concat(cls_score_list, axis=1) + reg_distri_list = paddle.concat(reg_distri_list, axis=1) + + return self.get_loss([ + cls_score_list, reg_distri_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + cls_score_list, reg_dist_list = [], [] + for i, feat in enumerate(feats): + b, _, h, w = feat.shape + l = h * w + feat = self.stems[i](feat) + cls_x = feat + reg_x = feat + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + cls_output = F.sigmoid(cls_output) + cls_score_list.append(cls_output.reshape([b, self.num_classes, l])) + reg_dist_list.append(reg_output.reshape([b, 4, l])) + + cls_score_list = paddle.concat(cls_score_list, axis=-1) + reg_dist_list = paddle.concat(reg_dist_list, axis=-1) + + return cls_score_list, reg_dist_list, anchor_points, stride_tensor + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def 
get_loss(self, head_outs, gt_meta): + pred_scores, pred_distri, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] + pad_gt_mask = gt_meta['pad_gt_mask'] + # label assignment + if gt_meta['epoch_id'] < self.static_assigner_epoch: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.static_assigner( + anchors, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes, + pred_bboxes=pred_bboxes.detach() * stride_tensor) + else: + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + + # cls loss: varifocal_loss + one_hot_label = F.one_hot(assigned_labels, + self.num_classes + 1)[..., :-1] + loss_cls = self._varifocal_loss(pred_scores, assigned_scores, + one_hot_label) + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum = paddle.clip( + assigned_scores_sum / paddle.distributed.get_world_size(), + min=1) + loss_cls /= assigned_scores_sum + + # bbox loss, no need loss_dfl + loss_l1, loss_iou, loss_dfl = \ + self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, + assigned_labels, assigned_bboxes, assigned_scores, + assigned_scores_sum) + + loss = self.loss_weight['cls'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + num_gpus = gt_meta.get('num_gpus', 8) + out_dict = { + 'loss': loss * num_gpus, + 'loss_cls': loss_cls * num_gpus, + 'loss_iou': loss_iou * num_gpus + } + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1 * num_gpus}) + return out_dict + + def post_process(self, head_outs, im_shape, scale_factor): + pred_scores, pred_dist, anchor_points, stride_tensor = head_outs + pred_bboxes = batch_distance2bbox(anchor_points, + pred_dist.transpose([0, 2, 1])) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolov7_head.py b/ppdet/modeling/heads/yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d63dbf74bfbfea75956ec16a6a46271c98e414e2 --- /dev/null +++ b/ppdet/modeling/heads/yolov7_head.py @@ -0,0 +1,641 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
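A note on the rescaling idiom used in every post_process in these heads: assuming the PaddleDetection convention that scale_factor holds [scale_y, scale_x] per image, flip(-1) reorders it to [scale_x, scale_y] and tile([1, 2]) expands it to [sx, sy, sx, sy], matching the x1, y1, x2, y2 box layout before division. A minimal, illustrative Paddle sketch (values invented; not part of the patch):

import paddle

# Batch of 2 images; rows are assumed to be [scale_y, scale_x].
scale_factor = paddle.to_tensor([[0.5, 0.25], [1.0, 2.0]])    # [B, 2]
pred_bboxes = paddle.rand([2, 100, 4])                        # [B, N, 4], network-input pixels

sf = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1)          # [B, 1, 4] = [sx, sy, sx, sy]
origin_bboxes = pred_bboxes / sf                              # back to original image coordinates
print(sf.shape, origin_bboxes.shape)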
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from ..initializer import constant_ +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register +from ppdet.modeling.backbones.yolov7_elannet import ImplicitA, ImplicitM +from ppdet.modeling.layers import MultiClassNMS + +from ppdet.modeling.bbox_utils import batch_distance2bbox +from ppdet.modeling.bbox_utils import bbox_iou +from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell +from ppdet.modeling.backbones.csp_darknet import BaseConv +from ppdet.modeling.layers import MultiClassNMS + +__all__ = ['YOLOv7Head', 'YOLOv7uHead'] + + +@register +class YOLOv7Head(nn.Layer): + __shared__ = [ + 'num_classes', 'data_format', 'use_aux', 'use_implicit', 'trt', + 'exclude_nms', 'exclude_post_process' + ] + __inject__ = ['loss', 'nms'] + + def __init__(self, + num_classes=80, + in_channels=[256, 512, 1024], + anchors=[[12, 16], [19, 36], [40, 28], [36, 75], [76, 55], + [72, 146], [142, 110], [192, 243], [459, 401]], + anchor_masks=[[0, 1, 2], [3, 4, 5], [6, 7, 8]], + stride=[8, 16, 32], + use_aux=False, + use_implicit=False, + loss='YOLOv7Loss', + data_format='NCHW', + nms='MultiClassNMS', + trt=False, + exclude_post_process=False, + exclude_nms=False): + """ + Head for YOLOv7 + + Args: + num_classes (int): number of foreground classes + in_channels (int): channels of input features + anchors (list): anchors + anchor_masks (list): anchor masks + stride (list): strides + use_aux (bool): whether to use Aux Head, only in P6 models + use_implicit (bool): whether to use ImplicitA and ImplicitM + loss (object): YOLOv7Loss instance + data_format (str): nms format, NCHW or NHWC + nms (object): MultiClassNMS instance + trt (bool): whether to use trt infer + exclude_nms (bool): whether to use exclude_nms for speed test + """ + super(YOLOv7Head, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.num_classes = num_classes + self.in_channels = in_channels + + self.parse_anchor(anchors, anchor_masks) + self.anchors = paddle.to_tensor(self.anchors, dtype='int32') + self.anchor_levels = len(self.anchors) + + self.stride = stride + self.use_aux = use_aux + self.use_implicit = use_implicit + self.loss = loss + self.data_format = data_format + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + + self.num_anchor = len(self.anchors[0]) # self.na + self.num_out_ch = self.num_classes + 5 # self.no + + self.yolo_outputs = [] + if self.use_aux: + self.yolo_outputs_aux = [] + if self.use_implicit: + self.ia, self.im = [], [] + self.num_levels = len(self.anchors) + for i in range(self.num_levels): + num_filters = self.num_anchor * self.num_out_ch + name = 'yolo_output.{}'.format(i) + conv = nn.Conv2D( + in_channels=self.in_channels[i], + out_channels=num_filters, + kernel_size=1, + stride=1, + padding=0, + data_format=data_format, + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + conv.skip_quant = True + yolo_output = self.add_sublayer(name, conv) + self.yolo_outputs.append(yolo_output) + + if self.use_aux: + name_aux = 'yolo_output_aux.{}'.format(i) + conv_aux = nn.Conv2D( + in_channels=self.in_channels[i + self.num_levels], + out_channels=num_filters, + kernel_size=1, + stride=1, + padding=0, + 
data_format=data_format, + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + conv_aux.skip_quant = True + yolo_output_aux = self.add_sublayer(name_aux, conv_aux) + self.yolo_outputs_aux.append(yolo_output_aux) + + if self.use_implicit: + ia = ImplicitA(self.in_channels[i]) + yolo_output_ia = self.add_sublayer( + 'yolo_output_ia.{}'.format(i), ia) + self.ia.append(yolo_output_ia) + + im = ImplicitM(num_filters) + yolo_output_im = self.add_sublayer( + 'yolo_output_im.{}'.format(i), im) + self.im.append(yolo_output_im) + + self._initialize_biases() + + def fuse(self): + if self.use_implicit: + # fuse ImplicitA and Convolution + for i in range(len(self.yolo_outputs)): + c1, c2, _, _ = self.yolo_outputs[ + i].weight.shape # [255, 256, 1, 1] + c1_, c2_, _, _ = self.ia[i].ia.shape # [1, 256, 1, 1] + cc = paddle.matmul(self.yolo_outputs[i].weight.reshape( + [c1, c2]), self.ia[i].ia.reshape([c2_, c1_])).squeeze(1) + self.yolo_outputs[i].bias.set_value(self.yolo_outputs[i].bias + + cc) + + # fuse ImplicitM and Convolution + for i in range(len(self.yolo_outputs)): + c1, c2, _, _ = self.im[i].im.shape # [1, 255, 1, 1] + self.yolo_outputs[i].bias.set_value(self.yolo_outputs[i].bias * + self.im[i].im.reshape([c2])) + self.yolo_outputs[i].weight.set_value( + self.yolo_outputs[i].weight * paddle.transpose( + self.im[i].im, [1, 0, 2, 3])) + + def _initialize_biases(self): + # initialize biases, see https://arxiv.org/abs/1708.02002 section 3.3 + for i, conv in enumerate(self.yolo_outputs): + b = conv.bias.numpy().reshape([3, -1]) # [255] to [3,85] + b[:, 4] += math.log(8 / (640 / self.stride[i])**2) + b[:, 5:self.num_classes + 5] += math.log(0.6 / (self.num_classes - 0.999999)) + conv.bias.set_value(b.reshape([-1])) + + if self.use_aux: + for i, conv in enumerate(self.yolo_outputs_aux): + b = conv.bias.numpy().reshape([3, -1]) # [255] to [3,85] + b[:, 4] += math.log(8 / (640 / self.stride[i])**2) + b[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999)) + conv.bias.set_value(b.reshape([-1])) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def parse_anchor(self, anchors, anchor_masks): + self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] + self.mask_anchors = [] + anchor_num = len(anchors) + for masks in anchor_masks: + self.mask_anchors.append([]) + for mask in masks: + assert mask < anchor_num, "anchor mask index overflow" + self.mask_anchors[-1].extend(anchors[mask]) + + def forward(self, feats, targets=None): + yolo_outputs = [] + if self.training and self.use_aux: + yolo_outputs_aux = [] + for i in range(self.num_levels): + if self.training and self.use_implicit: + yolo_output = self.im[i](self.yolo_outputs[i](self.ia[i](feats[ + i]))) + else: + yolo_output = self.yolo_outputs[i](feats[i]) + if self.data_format == 'NHWC': + yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) + yolo_outputs.append(yolo_output) + + if self.training and self.use_aux: + yolo_output_aux = self.yolo_outputs_aux[i](feats[ + i + self.num_levels]) + yolo_outputs_aux.append(yolo_output_aux) + + if self.training: + if self.use_aux: + return self.loss(yolo_outputs + yolo_outputs_aux, targets, + self.anchors) + else: + return self.loss(yolo_outputs, targets, self.anchors) + else: + return yolo_outputs + + def make_grid(self, nx, ny, anchor): + yv, xv = paddle.meshgrid([ + paddle.arange( + ny, dtype='int32'), paddle.arange( + nx, dtype='int32') + ]) + grid = paddle.stack((xv, yv), axis=2).reshape([1, 1, ny, nx, 2]) + anchor_grid = 
anchor.reshape([1, self.num_anchor, 1, 1, 2]) + return grid, anchor_grid + + def postprocessing_by_level(self, head_out, stride, anchor, ny, nx): + grid, anchor_grid = self.make_grid(nx, ny, anchor) + out = F.sigmoid(head_out) + xy = (out[..., 0:2] * 2. - 0.5 + grid) * stride + wh = (out[..., 2:4] * 2)**2 * anchor_grid + lt_xy = (xy - wh / 2.) + rb_xy = (xy + wh / 2.) + bboxes = paddle.concat((lt_xy, rb_xy), axis=-1) + scores = out[..., 5:] * out[..., 4].unsqueeze(-1) + return bboxes, scores + + def post_process(self, head_outs, img_shape, scale_factor): + bbox_list, score_list = [], [] + + for i, head_out in enumerate(head_outs): + _, _, ny, nx = head_out.shape + head_out = head_out.reshape( + [-1, self.num_anchor, self.num_out_ch, ny, nx]).transpose( + [0, 1, 3, 4, 2]) + # head_out.shape [bs, self.num_anchor, ny, nx, self.num_out_ch] + + bbox, score = self.postprocessing_by_level(head_out, self.stride[i], + self.anchors[i], ny, nx) + bbox = bbox.reshape([-1, self.num_anchor * ny * nx, 4]) + score = score.reshape( + [-1, self.num_anchor * ny * nx, self.num_classes]).transpose( + [0, 2, 1]) + bbox_list.append(bbox) + score_list.append(score) + pred_bboxes = paddle.concat(bbox_list, axis=1) + pred_scores = paddle.concat(score_list, axis=-1) + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1) + else: + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num + + +@register +class YOLOv7uHead(nn.Layer): + # YOLOv7 Anchor-Free Head = YOLOv8Head + use_implicit + __shared__ = [ + 'num_classes', 'eval_size', 'use_implicit', 'trt', 'exclude_nms', + 'exclude_post_process' + ] + __inject__ = ['assigner', 'nms'] + + def __init__(self, + in_channels=[256, 512, 1024], + num_classes=80, + act='silu', + fpn_strides=[8, 16, 32], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + reg_max=16, + reg_range=None, + use_varifocal_loss=False, + assigner='TaskAlignedAssigner', + nms='MultiClassNMS', + eval_size=None, + use_implicit=True, + loss_weight={ + 'class': 0.5, + 'iou': 7.5, + 'dfl': 1.5, + }, + trt=False, + exclude_nms=False, + exclude_post_process=False, + print_l1_loss=False): + super(YOLOv7uHead, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + if reg_range: + self.reg_range = reg_range + else: + self.reg_range = (0, reg_max) # not reg_max+1 + self.reg_channels = self.reg_range[1] - self.reg_range[0] + self.use_varifocal_loss = use_varifocal_loss + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.eval_size = eval_size + self.loss_weight = loss_weight + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.print_l1_loss = print_l1_loss + self.use_implicit = use_implicit + + # cls loss + self.bce = nn.BCEWithLogitsLoss(reduction='none') + + # pred head + c2 = max((16, in_channels[0] // 4, self.reg_max * 4)) + c3 = max(in_channels[0], self.num_classes) + self.conv_reg = nn.LayerList() + self.conv_cls = nn.LayerList() + for in_c in 
self.in_channels: + self.conv_reg.append( + nn.Sequential(* [ + BaseConv( + in_c, c2, 3, 1, act=act), + BaseConv( + c2, c2, 3, 1, act=act), + nn.Conv2D( + c2, + self.reg_max * 4, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + ])) + self.conv_cls.append( + nn.Sequential(* [ + BaseConv( + in_c, c3, 3, 1, act=act), + BaseConv( + c3, c3, 3, 1, act=act), + nn.Conv2D( + c3, + self.num_classes, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + ])) + self.proj = paddle.arange(self.reg_max).astype('float32') + if self.use_implicit: + self.ia2 = nn.LayerList() + self.ia3 = nn.LayerList() + self.im2 = nn.LayerList() + self.im3 = nn.LayerList() + for in_c in self.in_channels: + self.ia2.append(ImplicitA(in_c)) + self.ia3.append(ImplicitA(in_c)) + self.im2.append(ImplicitM(self.reg_max * 4)) + self.im3.append(ImplicitM(self.num_classes)) + self._initialize_biases() + + def fuse(self): + pass + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _initialize_biases(self): + for a, b, s in zip(self.conv_reg, self.conv_cls, self.fpn_strides): + constant_(a[-1].weight) + constant_(a[-1].bias, 1.0) + constant_(b[-1].weight) + constant_(b[-1].bias, math.log(5 / self.num_classes / (640 / s)**2)) + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_logits_list, bbox_preds_list, bbox_dist_preds_list = [], [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + bbox_dist_preds = self.im2[i](self.conv_reg[i](self.ia2[i](feat))) + cls_logit = self.im3[i]((self.conv_cls[i](self.ia3[i](feat)))) + bbox_dist_preds = bbox_dist_preds.reshape([-1, 4, self.reg_max, l]).transpose([0, 3, 1, 2]) + bbox_preds = F.softmax(bbox_dist_preds, axis=3).matmul(self.proj.reshape([-1, 1])).squeeze(-1) + + cls_logits_list.append(cls_logit) + bbox_preds_list.append(bbox_preds.transpose([0, 2, 1]).reshape([-1, 4, h, w])) + bbox_dist_preds_list.append(bbox_dist_preds) + + return self.get_loss([ + cls_logits_list, bbox_preds_list, bbox_dist_preds_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + + cls_logits_list, bbox_preds_list = [], [] + feats_shapes = [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + bbox_dist_preds = self.im2[i](self.conv_reg[i](self.ia2[i](feat))) + cls_logit = self.im3[i]((self.conv_cls[i](self.ia3[i](feat)))) + + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, l]).transpose([0, 3, 1, 2]) + bbox_preds = F.softmax(bbox_dist_preds, axis=3).matmul(self.proj.reshape([-1, 1])).squeeze(-1) + cls_logits_list.append(cls_logit) + bbox_preds_list.append(bbox_preds.transpose([0, 2, 1]).reshape([-1, 4, h, w])) + feats_shapes.append(l) + + pred_scores = [ + cls_score.transpose([0, 2, 3, 1]).reshape([-1, size, self.num_classes]) + for size, cls_score in zip(feats_shapes, cls_logits_list) + ] + pred_dists = [ + bbox_pred.transpose([0, 2, 3, 1]).reshape([-1, size, 4]) + for size, bbox_pred in zip(feats_shapes, bbox_preds_list) + ] + pred_scores = F.sigmoid(paddle.concat(pred_scores, 1)) + pred_bboxes = paddle.concat(pred_dists, 1) + 
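+ # `pred_scores` are sigmoid class probabilities of shape
+ # [bs, num_anchors, num_classes]; `pred_bboxes` at this point are still the
+ # expected l/t/r/b distances from the DFL bins (softmax over the reg_max bins
+ # projected with `self.proj`) and are decoded into boxes against
+ # `anchor_points` and `stride_tensor` later in `post_process`.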
+ return pred_scores, pred_bboxes, anchor_points, stride_tensor + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def _bbox2distance(self, points, bbox, reg_max=15, eps=0.01): + x1y1, x2y2 = paddle.split(bbox, 2, -1) + lt = points - x1y1 + rb = x2y2 - points + return paddle.concat([lt, rb], -1).clip(0, reg_max - eps) + + def _df_loss(self, pred_dist, target, lower_bound=0): + target_left = paddle.cast(target.floor(), 'int64') + target_right = target_left + 1 + weight_left = target_right.astype('float32') - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist, target_left - lower_bound, + reduction='none') * weight_left + loss_right = F.cross_entropy( + pred_dist, target_right - lower_bound, + reduction='none') * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + + def get_loss(self, head_outs, gt_meta): + cls_scores, bbox_preds, bbox_dist_preds, anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + + bs = cls_scores[0].shape[0] + flatten_cls_preds = [ + cls_pred.transpose([0, 2, 3, 1]).reshape([bs, -1, self.num_classes]) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.transpose([0, 2, 3, 1]).reshape([bs, -1, 4]) + for bbox_pred in bbox_preds + ] + flatten_pred_dists = [ + bbox_pred_org.reshape([bs, -1, self.reg_max * 4]) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = paddle.concat(flatten_pred_dists, 1) + pred_scores = paddle.concat(flatten_cls_preds, 1) + pred_distri = paddle.concat(flatten_pred_bboxes, 1) + + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = batch_distance2bbox(anchor_points_s, pred_distri) # xyxy + pred_bboxes = pred_bboxes * stride_tensor + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] # xyxy + pad_gt_mask = gt_meta['pad_gt_mask'] + + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + F.sigmoid(pred_scores.detach()), + pred_bboxes.detach(), + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, # xyxy + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + pred_bboxes /= stride_tensor + + # cls loss + loss_cls = self.bce(pred_scores, assigned_scores).sum() + + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum /= paddle.distributed.get_world_size() + assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) 
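+ # Normalize the classification loss by the total assigned target score:
+ # under multi-GPU training the sum is all-reduced and averaged over ranks,
+ # then clipped to at least 1 to avoid dividing by zero when no anchor is
+ # assigned to a foreground object.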
+ loss_cls /= assigned_scores_sum + + # select positive samples mask + mask_positive = (assigned_labels != self.num_classes) + num_pos = mask_positive.sum() + # pos/neg loss + if num_pos > 0: + # ciou loss + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select( + pred_bboxes, bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + assigned_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = paddle.masked_select( + assigned_scores.sum(-1), mask_positive).unsqueeze(-1) + iou = bbox_iou( + pred_bboxes_pos.split(4, axis=-1), + assigned_bboxes_pos.split(4, axis=-1), + x1y1x2y2=True, # xyxy + ciou=True, + eps=1e-7) + loss_iou = ((1.0 - iou) * bbox_weight).sum() / assigned_scores_sum + + if self.print_l1_loss: + loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) + else: + loss_l1 = paddle.zeros([1]) + + # dfl loss + dist_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, self.reg_max * 4]) + pred_dist_pos = paddle.masked_select( + flatten_dist_preds, dist_mask).reshape([-1, 4, self.reg_max]) + assigned_ltrb = self._bbox2distance( + anchor_points_s, + assigned_bboxes, + reg_max=self.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = paddle.masked_select( + assigned_ltrb, bbox_mask).reshape([-1, 4]) + + loss_dfl = self._df_loss(pred_dist_pos, + assigned_ltrb_pos) * bbox_weight + loss_dfl = loss_dfl.sum() / assigned_scores_sum + else: + loss_iou = flatten_dist_preds.sum() * 0. + loss_dfl = flatten_dist_preds.sum() * 0. + loss_l1 = flatten_dist_preds.sum() * 0. + + loss_cls *= self.loss_weight['class'] + loss_iou *= self.loss_weight['iou'] + loss_dfl *= self.loss_weight['dfl'] + loss_total = loss_cls + loss_iou + loss_dfl + + num_gpus = gt_meta.get('num_gpus', 8) + total_bs = bs * num_gpus + + out_dict = { + 'loss': loss_total * total_bs, + 'loss_cls': loss_cls * total_bs, + 'loss_iou': loss_iou * total_bs, + 'loss_dfl': loss_dfl * total_bs, + } + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1 * total_bs}) + return out_dict + + def post_process(self, head_outs, im_shape, scale_factor): + pred_scores, pred_bboxes, anchor_points, stride_tensor = head_outs + + pred_bboxes = batch_distance2bbox(anchor_points, pred_bboxes) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores], axis=-1), None + else: + pred_scores = pred_scores.transpose([0, 2, 1]) + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num diff --git a/ppdet/modeling/heads/yolov8_head.py b/ppdet/modeling/heads/yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3f2f9d7d63769d4fcb549e90fd99fa71287f4f55 --- /dev/null +++ b/ppdet/modeling/heads/yolov8_head.py @@ -0,0 +1,378 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ..initializer import constant_ +from ppdet.core.workspace import register + +from ..bbox_utils import batch_distance2bbox +from ..bbox_utils import bbox_iou +from ..assigners.utils import generate_anchors_for_grid_cell +from ppdet.modeling.backbones.csp_darknet import BaseConv +from ppdet.modeling.layers import MultiClassNMS + +__all__ = ['YOLOv8Head'] + + +@register +class YOLOv8Head(nn.Layer): + __shared__ = [ + 'num_classes', 'eval_size', 'trt', 'exclude_nms', + 'exclude_post_process' + ] + __inject__ = ['assigner', 'nms'] + + def __init__(self, + in_channels=[256, 512, 1024], + num_classes=80, + act='silu', + fpn_strides=[8, 16, 32], + grid_cell_scale=5.0, + grid_cell_offset=0.5, + reg_max=16, + reg_range=None, + use_varifocal_loss=False, + assigner='TaskAlignedAssigner', + nms='MultiClassNMS', + eval_size=None, + loss_weight={ + 'class': 0.5, + 'iou': 7.5, + 'dfl': 1.5, + }, + trt=False, + exclude_nms=False, + exclude_post_process=False, + print_l1_loss=True): + super(YOLOv8Head, self).__init__() + assert len(in_channels) > 0, "len(in_channels) should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.fpn_strides = fpn_strides + self.grid_cell_scale = grid_cell_scale + self.grid_cell_offset = grid_cell_offset + self.reg_max = reg_max + if reg_range: + self.reg_range = reg_range + else: + self.reg_range = (0, reg_max) # not reg_max+1 + self.reg_channels = self.reg_range[1] - self.reg_range[0] + self.use_varifocal_loss = use_varifocal_loss + self.assigner = assigner + self.nms = nms + if isinstance(self.nms, MultiClassNMS) and trt: + self.nms.trt = trt + self.eval_size = eval_size + self.loss_weight = loss_weight + self.exclude_nms = exclude_nms + self.exclude_post_process = exclude_post_process + self.print_l1_loss = print_l1_loss + + # cls loss + self.bce = nn.BCEWithLogitsLoss(reduction='none') + + # pred head + c2 = max((16, in_channels[0] // 4, self.reg_max * 4)) + c3 = max(in_channels[0], self.num_classes) + self.conv_reg = nn.LayerList() + self.conv_cls = nn.LayerList() + for in_c in self.in_channels: + self.conv_reg.append( + nn.Sequential(* [ + BaseConv( + in_c, c2, 3, 1, act=act), + BaseConv( + c2, c2, 3, 1, act=act), + nn.Conv2D( + c2, + self.reg_max * 4, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + ])) + self.conv_cls.append( + nn.Sequential(* [ + BaseConv( + in_c, c3, 3, 1, act=act), + BaseConv( + c3, c3, 3, 1, act=act), + nn.Conv2D( + c3, + self.num_classes, + 1, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + ])) + self.proj = paddle.arange(self.reg_max).astype('float32') + self._initialize_biases() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def _initialize_biases(self): + for a, b, s in zip(self.conv_reg, self.conv_cls, self.fpn_strides): + constant_(a[-1].weight) + constant_(a[-1].bias, 1.0) + constant_(b[-1].weight) + constant_(b[-1].bias, math.log(5 / self.num_classes / (640 / s)**2)) + + def forward(self, feats, targets=None): + if self.training: + return self.forward_train(feats, targets) + else: + return self.forward_eval(feats) + + def forward_train(self, feats, targets): + anchors, anchor_points, num_anchors_list, stride_tensor = \ + generate_anchors_for_grid_cell( + feats, 
self.fpn_strides, self.grid_cell_scale, + self.grid_cell_offset) + + cls_logits_list, bbox_preds_list, bbox_dist_preds_list = [], [], [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + bbox_dist_preds = self.conv_reg[i](feat) + cls_logit = self.conv_cls[i](feat) + bbox_dist_preds = bbox_dist_preds.reshape([-1, 4, self.reg_max, l]).transpose([0, 3, 1, 2]) + bbox_preds = F.softmax(bbox_dist_preds, axis=3).matmul(self.proj.reshape([-1, 1])).squeeze(-1) + + cls_logits_list.append(cls_logit) + bbox_preds_list.append(bbox_preds.transpose([0, 2, 1]).reshape([-1, 4, h, w])) + bbox_dist_preds_list.append(bbox_dist_preds) + + return self.get_loss([ + cls_logits_list, bbox_preds_list, bbox_dist_preds_list, anchors, anchor_points, + num_anchors_list, stride_tensor + ], targets) + + def forward_eval(self, feats): + anchor_points, stride_tensor = self._generate_anchors(feats) + + cls_logits_list, bbox_preds_list = [], [] + feats_shapes = [] + for i, feat in enumerate(feats): + _, _, h, w = feat.shape + l = h * w + bbox_dist_preds = self.conv_reg[i](feat) + cls_logit = self.conv_cls[i](feat) + + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, l]).transpose([0, 3, 1, 2]) + bbox_preds = F.softmax(bbox_dist_preds, axis=3).matmul(self.proj.reshape([-1, 1])).squeeze(-1) + cls_logits_list.append(cls_logit) + bbox_preds_list.append(bbox_preds.transpose([0, 2, 1]).reshape([-1, 4, h, w])) + feats_shapes.append(l) + + pred_scores = [ + cls_score.transpose([0, 2, 3, 1]).reshape([-1, size, self.num_classes]) + for size, cls_score in zip(feats_shapes, cls_logits_list) + ] + pred_dists = [ + bbox_pred.transpose([0, 2, 3, 1]).reshape([-1, size, 4]) + for size, bbox_pred in zip(feats_shapes, bbox_preds_list) + ] + pred_scores = F.sigmoid(paddle.concat(pred_scores, 1)) + pred_bboxes = paddle.concat(pred_dists, 1) + + return pred_scores, pred_bboxes, anchor_points, stride_tensor + + def _generate_anchors(self, feats=None, dtype='float32'): + # just use in eval time + anchor_points = [] + stride_tensor = [] + for i, stride in enumerate(self.fpn_strides): + if feats is not None: + _, _, h, w = feats[i].shape + else: + h = int(self.eval_size[0] / stride) + w = int(self.eval_size[1] / stride) + shift_x = paddle.arange(end=w) + self.grid_cell_offset + shift_y = paddle.arange(end=h) + self.grid_cell_offset + shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) + anchor_point = paddle.cast( + paddle.stack( + [shift_x, shift_y], axis=-1), dtype=dtype) + anchor_points.append(anchor_point.reshape([-1, 2])) + stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) + anchor_points = paddle.concat(anchor_points) + stride_tensor = paddle.concat(stride_tensor) + return anchor_points, stride_tensor + + def _bbox2distance(self, points, bbox, reg_max=15, eps=0.01): + x1y1, x2y2 = paddle.split(bbox, 2, -1) + lt = points - x1y1 + rb = x2y2 - points + return paddle.concat([lt, rb], -1).clip(0, reg_max - eps) + + def _df_loss(self, pred_dist, target, lower_bound=0): + target_left = paddle.cast(target.floor(), 'int64') + target_right = target_left + 1 + weight_left = target_right.astype('float32') - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist, target_left - lower_bound, + reduction='none') * weight_left + loss_right = F.cross_entropy( + pred_dist, target_right - lower_bound, + reduction='none') * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + + def get_loss(self, head_outs, gt_meta): + cls_scores, bbox_preds, bbox_dist_preds, 
anchors,\ + anchor_points, num_anchors_list, stride_tensor = head_outs + + + bs = cls_scores[0].shape[0] + flatten_cls_preds = [ + cls_pred.transpose([0, 2, 3, 1]).reshape([bs, -1, self.num_classes]) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.transpose([0, 2, 3, 1]).reshape([bs, -1, 4]) + for bbox_pred in bbox_preds + ] + flatten_pred_dists = [ + bbox_pred_org.reshape([bs, -1, self.reg_max * 4]) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = paddle.concat(flatten_pred_dists, 1) + pred_scores = paddle.concat(flatten_cls_preds, 1) + pred_distri = paddle.concat(flatten_pred_bboxes, 1) + + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = batch_distance2bbox(anchor_points_s, pred_distri) # xyxy + pred_bboxes = pred_bboxes * stride_tensor + + gt_labels = gt_meta['gt_class'] + gt_bboxes = gt_meta['gt_bbox'] # xyxy + pad_gt_mask = gt_meta['pad_gt_mask'] + + assigned_labels, assigned_bboxes, assigned_scores = \ + self.assigner( + F.sigmoid(pred_scores.detach()), + pred_bboxes.detach(), + anchor_points, + num_anchors_list, + gt_labels, + gt_bboxes, # xyxy + pad_gt_mask, + bg_index=self.num_classes) + # rescale bbox + assigned_bboxes /= stride_tensor + pred_bboxes /= stride_tensor + + # cls loss + loss_cls = self.bce(pred_scores, assigned_scores).sum() + + assigned_scores_sum = assigned_scores.sum() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(assigned_scores_sum) + assigned_scores_sum /= paddle.distributed.get_world_size() + assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) + loss_cls /= assigned_scores_sum + + # select positive samples mask + mask_positive = (assigned_labels != self.num_classes) + num_pos = mask_positive.sum() + # pos/neg loss + if num_pos > 0: + # ciou loss + bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) + pred_bboxes_pos = paddle.masked_select( + pred_bboxes, bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = paddle.masked_select( + assigned_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = paddle.masked_select( + assigned_scores.sum(-1), mask_positive).unsqueeze(-1) + iou = bbox_iou( + pred_bboxes_pos.split(4, axis=-1), + assigned_bboxes_pos.split(4, axis=-1), + x1y1x2y2=True, # xyxy + ciou=True, + eps=1e-7) + loss_iou = ((1.0 - iou) * bbox_weight).sum() / assigned_scores_sum + + if self.print_l1_loss: + loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) + else: + loss_l1 = paddle.zeros([1]) + + # dfl loss + dist_mask = mask_positive.unsqueeze(-1).tile( + [1, 1, self.reg_max * 4]) + pred_dist_pos = paddle.masked_select( + flatten_dist_preds, dist_mask).reshape([-1, 4, self.reg_max]) + assigned_ltrb = self._bbox2distance( + anchor_points_s, + assigned_bboxes, + reg_max=self.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = paddle.masked_select( + assigned_ltrb, bbox_mask).reshape([-1, 4]) + + loss_dfl = self._df_loss(pred_dist_pos, + assigned_ltrb_pos) * bbox_weight + loss_dfl = loss_dfl.sum() / assigned_scores_sum + else: + loss_iou = flatten_dist_preds.sum() * 0. + loss_dfl = flatten_dist_preds.sum() * 0. + loss_l1 = flatten_dist_preds.sum() * 0. 
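+ # When no positives are assigned, the iou/dfl/l1 terms above are built as
+ # `flatten_dist_preds.sum() * 0.` so they stay connected to the graph with a
+ # zero gradient instead of failing on an empty foreground batch.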
+ + loss_cls *= self.loss_weight['class'] + loss_iou *= self.loss_weight['iou'] + loss_dfl *= self.loss_weight['dfl'] + loss_total = loss_cls + loss_iou + loss_dfl + + num_gpus = gt_meta.get('num_gpus', 8) + total_bs = bs * num_gpus + + out_dict = { + 'loss': loss_total * total_bs, + 'loss_cls': loss_cls * total_bs, + 'loss_iou': loss_iou * total_bs, + 'loss_dfl': loss_dfl * total_bs, + } + if self.print_l1_loss: + # just see convergence + out_dict.update({'loss_l1': loss_l1 * total_bs}) + return out_dict + + def post_process(self, head_outs, im_shape, scale_factor): + pred_scores, pred_bboxes, anchor_points, stride_tensor = head_outs + + pred_bboxes = batch_distance2bbox(anchor_points, pred_bboxes) + pred_bboxes *= stride_tensor + + if self.exclude_post_process: + return paddle.concat( + [pred_bboxes, pred_scores], axis=-1), None + else: + pred_scores = pred_scores.transpose([0, 2, 1]) + # scale bbox to origin + scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) + pred_bboxes /= scale_factor + if self.exclude_nms: + # `exclude_nms=True` just use in benchmark + return pred_bboxes, pred_scores + else: + bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) + return bbox_pred, bbox_num + diff --git a/ppdet/modeling/initializer.py b/ppdet/modeling/initializer.py new file mode 100644 index 0000000000000000000000000000000000000000..308c51baf89c1b673319e6864c6ea4c91dd1c956 --- /dev/null +++ b/ppdet/modeling/initializer.py @@ -0,0 +1,325 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" + +import math +import numpy as np + +import paddle +import paddle.nn as nn + +__all__ = [ + 'uniform_', + 'normal_', + 'constant_', + 'ones_', + 'zeros_', + 'xavier_uniform_', + 'xavier_normal_', + 'kaiming_uniform_', + 'kaiming_normal_', + 'linear_init_', + 'conv_init_', + 'reset_initialized_parameter', +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0., std=1.): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0., std=1.): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. 
+ Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. + Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == 'fan_in' else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format( + param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + if hasattr(module, "bias") and module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1. / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0., std=1.) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_fill_(m.bias, 0) diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..1dff963817113be474ec5b6fa00784435280c1bc --- /dev/null +++ b/ppdet/modeling/layers.py @@ -0,0 +1,662 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from . 
import ops +from .initializer import xavier_uniform_, constant_ +from paddle.vision.ops import DeformConv2D + + +def _to_list(l): + if isinstance(l, (list, tuple)): + return list(l) + return [l] + + +class DeformableConvV2(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None, + skip_quant=False, + dcn_bias_regularizer=L2Decay(0.), + dcn_bias_lr_scale=2.): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 + self.mask_channel = kernel_size**2 + + if lr_scale == 1 and regularizer is None: + offset_bias_attr = ParamAttr(initializer=Constant(0.)) + else: + offset_bias_attr = ParamAttr( + initializer=Constant(0.), + learning_rate=lr_scale, + regularizer=regularizer) + self.conv_offset = nn.Conv2D( + in_channels, + 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.0)), + bias_attr=offset_bias_attr) + if skip_quant: + self.conv_offset.skip_quant = True + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = ParamAttr( + initializer=Constant(value=0), + regularizer=dcn_bias_regularizer, + learning_rate=dcn_bias_lr_scale) + else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = DeformConv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=dcn_bias_attr) + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + use_dcn=False, + bias_on=False, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01), + skip_quant=False, + dcn_lr_scale=2., + dcn_regularizer=L2Decay(0.)): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn', None] + + if bias_on: + bias_attr = ParamAttr( + initializer=Constant(value=0.), learning_rate=lr_scale) + else: + bias_attr = False + + if not use_dcn: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=bias_attr) + if skip_quant: + self.conv.skip_quant = True + else: + # in FCOS-DCN head, specifically need learning_rate and regularizer + self.conv = DeformableConvV2( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=True, + lr_scale=dcn_lr_scale, + regularizer=dcn_regularizer, + dcn_bias_regularizer=dcn_regularizer, + dcn_bias_lr_scale=dcn_lr_scale, + skip_quant=skip_quant) + + norm_lr = 0. if freeze_norm else 1. 
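+ # freeze_norm=True zeroes the learning rate of the norm layer's scale and
+ # bias (the parameters are still created, just never updated), while
+ # norm_decay sets the L2 weight decay applied to those affine parameters.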
+ param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + if norm_type in ['bn', 'sync_bn']: + self.norm = nn.BatchNorm2D( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + else: + self.norm = None + + def forward(self, inputs): + out = self.conv(inputs) + if self.norm is not None: + out = self.norm(out) + return out + + +class DropBlock(nn.Layer): + def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): + """ + DropBlock layer, see https://arxiv.org/abs/1810.12890 + + Args: + block_size (int): block size + keep_prob (int): keep probability + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(DropBlock, self).__init__() + self.block_size = block_size + self.keep_prob = keep_prob + self.name = name + self.data_format = data_format + + def forward(self, x): + if not self.training or self.keep_prob == 1: + return x + else: + gamma = (1. - self.keep_prob) / (self.block_size**2) + if self.data_format == 'NCHW': + shape = x.shape[2:] + else: + shape = x.shape[1:3] + for s in shape: + gamma *= s / (s - self.block_size + 1) + + matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) + mask_inv = F.max_pool2d( + matrix, + self.block_size, + stride=1, + padding=self.block_size // 2, + data_format=self.data_format) + mask = 1. - mask_inv + y = x * mask * (mask.numel() / mask.sum()) + return y + + +@register +@serializable +class MultiClassNMS(object): + def __init__(self, + score_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + nms_threshold=.5, + normalized=True, + nms_eta=1.0, + return_index=False, + return_rois_num=True, + trt=False): + super(MultiClassNMS, self).__init__() + self.score_threshold = score_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.nms_threshold = nms_threshold + self.normalized = normalized + self.nms_eta = nms_eta + self.return_index = return_index + self.return_rois_num = return_rois_num + self.trt = trt + + def __call__(self, bboxes, score, background_label=-1): + """ + bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape + [N, M, 4], N is the batch size and M + is the number of bboxes + 2. (List[Tensor]) bboxes and bbox_num, + bboxes have shape of [M, C, 4], C + is the class number and bbox_num means + the number of bboxes of each batch with + shape [N,] + score (Tensor): Predicted scores with shape [N, C, M] or [M, C] + background_label (int): Ignore the background label; For example, RCNN + is num_classes and YOLO is -1. 
+ """ + kwargs = self.__dict__.copy() + if isinstance(bboxes, tuple): + bboxes, bbox_num = bboxes + kwargs.update({'rois_num': bbox_num}) + if background_label > -1: + kwargs.update({'background_label': background_label}) + kwargs.pop('trt') + # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt + if self.trt and (int(paddle.version.major) == 0 or + (int(paddle.version.major) >= 2 and + int(paddle.version.minor) >= 3)): + # TODO(wangxinxin08): tricky switch to run nms on tensorrt + kwargs.update({'nms_eta': 1.1}) + bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) + bbox = bbox.reshape([1, -1, 6]) + idx = paddle.nonzero(bbox[..., 0] != -1) + bbox = paddle.gather_nd(bbox, idx) + return bbox, bbox_num, None + else: + return ops.multiclass_nms(bboxes, score, **kwargs) + + +@register +@serializable +class MatrixNMS(object): + __append_doc__ = True + + def __init__(self, + score_threshold=.05, + post_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + use_gaussian=False, + gaussian_sigma=2., + normalized=False, + background_label=0): + super(MatrixNMS, self).__init__() + self.score_threshold = score_threshold + self.post_threshold = post_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.normalized = normalized + self.use_gaussian = use_gaussian + self.gaussian_sigma = gaussian_sigma + self.background_label = background_label + + def __call__(self, bbox, score, *args): + return ops.matrix_nms( + bboxes=bbox, + scores=score, + score_threshold=self.score_threshold, + post_threshold=self.post_threshold, + nms_top_k=self.nms_top_k, + keep_top_k=self.keep_top_k, + use_gaussian=self.use_gaussian, + gaussian_sigma=self.gaussian_sigma, + background_label=self.background_label, + normalized=self.normalized) + + +@register +@serializable +class YOLOBox(object): + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + conf_thresh=0.005, + downsample_ratio=32, + clip_bbox=True, + scale_x_y=1.): + self.num_classes = num_classes + self.conf_thresh = conf_thresh + self.downsample_ratio = downsample_ratio + self.clip_bbox = clip_bbox + self.scale_x_y = scale_x_y + + def __call__(self, + yolo_head_out, + anchors, + im_shape, + scale_factor, + var_weight=None): + boxes_list = [] + scores_list = [] + origin_shape = im_shape / scale_factor + origin_shape = paddle.cast(origin_shape, 'int32') + for i, head_out in enumerate(yolo_head_out): + boxes, scores = paddle.vision.ops.yolo_box( + head_out, + origin_shape, + anchors[i], + self.num_classes, + self.conf_thresh, + self.downsample_ratio // 2**i, + self.clip_bbox, + scale_x_y=self.scale_x_y) + boxes_list.append(boxes) + scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) + yolo_boxes = paddle.concat(boxes_list, axis=1) + yolo_scores = paddle.concat(scores_list, axis=2) + return yolo_boxes, yolo_scores + + +def Conv2d(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + weight_init=Normal(std=0.001), + bias_init=Constant(0.)): + weight_attr = paddle.framework.ParamAttr(initializer=weight_init) + if bias: + bias_attr = paddle.framework.ParamAttr(initializer=bias_init) + else: + bias_attr = False + conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + weight_attr=weight_attr, + bias_attr=bias_attr) + return conv + + +def ConvTranspose2d(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + 
dilation=1, + weight_init=Normal(std=0.001), + bias_init=Constant(0.)): + weight_attr = paddle.framework.ParamAttr(initializer=weight_init) + if bias: + bias_attr = paddle.framework.ParamAttr(initializer=bias_init) + else: + bias_attr = False + conv = nn.Conv2DTranspose( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + dilation, + groups, + weight_attr=weight_attr, + bias_attr=bias_attr) + return conv + + +def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True): + if not affine: + weight_attr = False + bias_attr = False + else: + weight_attr = None + bias_attr = None + batchnorm = nn.BatchNorm2D( + num_features, + momentum, + eps, + weight_attr=weight_attr, + bias_attr=bias_attr) + return batchnorm + + +def ReLU(): + return nn.ReLU() + + +def Upsample(scale_factor=None, mode='nearest', align_corners=False): + return nn.Upsample(None, scale_factor, mode, align_corners) + + +def MaxPool(kernel_size, stride, padding, ceil_mode=False): + return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode) + + +class Concat(nn.Layer): + def __init__(self, dim=0): + super(Concat, self).__init__() + self.dim = dim + + def forward(self, inputs): + return paddle.concat(inputs, axis=self.dim) + + def extra_repr(self): + return 'dim={}'.format(self.dim) + + +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. + """ + return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) + + +@register +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim: + self.in_proj_weight = self.create_parameter( + shape=[embed_dim, 3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=False) + self.in_proj_bias = self.create_parameter( + shape=[3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=True) + else: + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + self._type_list = ('q_proj', 'k_proj', 'v_proj') + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + constant_(p) + + def compute_qkv(self, tensor, index): + if self._qkv_same_embed_dim: + tensor = F.linear( + x=tensor, + weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) + * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * + self.embed_dim] + if self.in_proj_bias is not None else None) + else: + tensor = getattr(self, self._type_list[index])(tensor) + tensor = tensor.reshape( + [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + return tensor + + def forward(self, query, key=None, value=None, attn_mask=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. 
When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = (self.compute_qkv(t, i) + for i, t in enumerate([query, key, value])) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + scaling = float(self.head_dim)**-0.5 + product = product * scaling + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train") + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + return out if len(outs) == 1 else tuple(outs) diff --git a/ppdet/modeling/losses/__init__.py b/ppdet/modeling/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..01f8f6caa0c31d35e4664ac430cfcdab99d8b0ae --- /dev/null +++ b/ppdet/modeling/losses/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import yolo_loss +from . import iou_aware_loss +from . import iou_loss +from . import gfocal_loss +from . import focal_loss +from . import smooth_l1_loss +from . import yolov5_loss +from . import yolov7_loss +from . import detr_loss + +from .yolo_loss import * +from .iou_aware_loss import * +from .iou_loss import * +from .gfocal_loss import * +from .focal_loss import * +from .smooth_l1_loss import * +from .yolov5_loss import * +from .yolov7_loss import * +from . import detr_loss diff --git a/ppdet/modeling/losses/detr_loss.py b/ppdet/modeling/losses/detr_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..24f14c3d4893826f3d660a2765e5e4a5236e44a5 --- /dev/null +++ b/ppdet/modeling/losses/detr_loss.py @@ -0,0 +1,578 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from .iou_loss import GIoULoss +from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits +from ..bbox_utils import bbox_iou + +__all__ = ['DETRLoss', 'DINOLoss'] + + +@register +class DETRLoss(nn.Layer): + __shared__ = ['num_classes', 'use_focal_loss'] + __inject__ = ['matcher'] + + def __init__(self, + num_classes=80, + matcher='HungarianMatcher', + loss_coeff={ + 'class': 1, + 'bbox': 5, + 'giou': 2, + 'no_object': 0.1, + 'mask': 1, + 'dice': 1 + }, + aux_loss=True, + use_focal_loss=False, + use_vfl=False, + use_uni_match=False, + uni_match_ind=0): + r""" + Args: + num_classes (int): The number of classes. + matcher (HungarianMatcher): It computes an assignment between the targets + and the predictions of the network. + loss_coeff (dict): The coefficient of loss. + aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. + use_focal_loss (bool): Use focal loss or not. + """ + super(DETRLoss, self).__init__() + + self.num_classes = num_classes + self.matcher = matcher + self.loss_coeff = loss_coeff + self.aux_loss = aux_loss + self.use_focal_loss = use_focal_loss + self.use_vfl = use_vfl + self.use_uni_match = use_uni_match + self.uni_match_ind = uni_match_ind + + if not self.use_focal_loss: + self.loss_coeff['class'] = paddle.full([num_classes + 1], + loss_coeff['class']) + self.loss_coeff['class'][-1] = loss_coeff['no_object'] + self.giou_loss = GIoULoss() + + def _get_loss_class(self, + logits, + gt_class, + match_indices, + bg_index, + num_gts, + postfix="", + iou_score=None): + # logits: [b, query, num_classes], gt_class: list[[n, 1]] + name_class = "loss_class" + postfix + + target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') + bs, num_query_objects = target_label.shape + num_gt = sum(len(a) for a in gt_class) + if num_gt > 0: + index, updates = self._get_index_updates(num_query_objects, + gt_class, match_indices) + target_label = paddle.scatter( + target_label.reshape([-1, 1]), index, updates.astype('int64')) + target_label = target_label.reshape([bs, num_query_objects]) + if self.use_focal_loss: + target_label = F.one_hot(target_label, + self.num_classes + 1)[..., :-1] + if iou_score is not None and self.use_vfl: + target_score = paddle.zeros([bs, num_query_objects]) + if num_gt > 0: + target_score = paddle.scatter( + target_score.reshape([-1, 1]), index, iou_score) + target_score = target_score.reshape( + [bs, num_query_objects, 1]) * target_label + loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits( + logits, target_score, target_label, + num_gts / num_query_objects) + else: + loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( + logits, target_label, num_gts / num_query_objects) + else: + 
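+                # Non-focal branch: matched queries were scattered to their
+                # gt labels above and the rest keep bg_index, so a plain
+                # softmax cross entropy is used, with the per-class weight
+                # vector built in __init__ giving the trailing no-object
+                # class its own loss_coeff['no_object'] coefficient.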
loss_ = F.cross_entropy( + logits, target_label, weight=self.loss_coeff['class']) + return {name_class: loss_} + + def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, + postfix=""): + # boxes: [b, query, 4], gt_bbox: list[[n, 4]] + name_bbox = "loss_bbox" + postfix + name_giou = "loss_giou" + postfix + + loss = dict() + if sum(len(a) for a in gt_bbox) == 0: + loss[name_bbox] = paddle.to_tensor([0.]) + loss[name_giou] = paddle.to_tensor([0.]) + return loss + + src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, + match_indices) + loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( + src_bbox, target_bbox, reduction='sum') / num_gts + loss[name_giou] = self.giou_loss( + bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) + loss[name_giou] = loss[name_giou].sum() / num_gts + loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] + return loss + + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, + postfix=""): + # masks: [b, query, h, w], gt_mask: list[[n, H, W]] + name_mask = "loss_mask" + postfix + name_dice = "loss_dice" + postfix + + loss = dict() + if sum(len(a) for a in gt_mask) == 0: + loss[name_mask] = paddle.to_tensor([0.]) + loss[name_dice] = paddle.to_tensor([0.]) + return loss + + src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, + match_indices) + src_masks = F.interpolate( + src_masks.unsqueeze(0), + size=target_masks.shape[-2:], + mode="bilinear")[0] + loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( + src_masks, + target_masks, + paddle.to_tensor( + [num_gts], dtype='float32')) + loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( + src_masks, target_masks, num_gts) + return loss + + def _dice_loss(self, inputs, targets, num_gts): + inputs = F.sigmoid(inputs) + inputs = inputs.flatten(1) + targets = targets.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_gts + + def _get_loss_aux(self, + boxes, + logits, + gt_bbox, + gt_class, + bg_index, + num_gts, + dn_match_indices=None, + postfix="", + masks=None, + gt_mask=None): + loss_class = [] + loss_bbox, loss_giou = [], [] + loss_mask, loss_dice = [], [] + if dn_match_indices is not None: + match_indices = dn_match_indices + elif self.use_uni_match: + match_indices = self.matcher( + boxes[self.uni_match_ind], + logits[self.uni_match_ind], + gt_bbox, + gt_class, + masks=masks[self.uni_match_ind] if masks is not None else None, + gt_mask=gt_mask) + for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): + aux_masks = masks[i] if masks is not None else None + if not self.use_uni_match and dn_match_indices is None: + match_indices = self.matcher( + aux_boxes, + aux_logits, + gt_bbox, + gt_class, + masks=aux_masks, + gt_mask=gt_mask) + if self.use_vfl: + if sum(len(a) for a in gt_bbox) > 0: + src_bbox, target_bbox = self._get_src_target_assign( + aux_boxes.detach(), gt_bbox, match_indices) + iou_score = bbox_iou( + bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) + else: + iou_score = None + else: + iou_score = None + loss_class.append( + self._get_loss_class(aux_logits, gt_class, match_indices, + bg_index, num_gts, postfix, iou_score)[ + 'loss_class' + postfix]) + loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, + num_gts, postfix) + loss_bbox.append(loss_['loss_bbox' + postfix]) + loss_giou.append(loss_['loss_giou' + postfix]) + if 
masks is not None and gt_mask is not None: + loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, + num_gts, postfix) + loss_mask.append(loss_['loss_mask' + postfix]) + loss_dice.append(loss_['loss_dice' + postfix]) + loss = { + "loss_class_aux" + postfix: paddle.add_n(loss_class), + "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), + "loss_giou_aux" + postfix: paddle.add_n(loss_giou) + } + if masks is not None and gt_mask is not None: + loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) + loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) + return loss + + def _get_index_updates(self, num_query_objects, target, match_indices): + batch_idx = paddle.concat([ + paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) + ]) + src_idx = paddle.concat([src for (src, _) in match_indices]) + src_idx += (batch_idx * num_query_objects) + target_assign = paddle.concat([ + paddle.gather( + t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) + ]) + return src_idx, target_assign + + def _get_src_target_assign(self, src, target, match_indices): + src_assign = paddle.concat([ + paddle.gather( + t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) + for t, (I, _) in zip(src, match_indices) + ]) + target_assign = paddle.concat([ + paddle.gather( + t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]]) + for t, (_, J) in zip(target, match_indices) + ]) + return src_assign, target_assign + + def _get_num_gts(self, targets, dtype="float32"): + num_gts = sum(len(a) for a in targets) + num_gts = paddle.to_tensor([num_gts], dtype=dtype) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(num_gts) + num_gts /= paddle.distributed.get_world_size() + num_gts = paddle.clip(num_gts, min=1.) 
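+        # Averaging the gt count over all ranks keeps the per-rank loss
+        # normalization consistent under distributed training; the clip
+        # above guards against batches with no ground-truth objects.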
+ return num_gts + + def _get_prediction_loss(self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_match_indices=None, + num_gts=1): + if dn_match_indices is None: + match_indices = self.matcher( + boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask) + else: + match_indices = dn_match_indices + + if self.use_vfl: + if sum(len(a) for a in gt_bbox) > 0: + src_bbox, target_bbox = self._get_src_target_assign( + boxes.detach(), gt_bbox, match_indices) + iou_score = bbox_iou( + bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) + else: + iou_score = None + else: + iou_score = None + + loss = dict() + loss.update( + self._get_loss_class(logits, gt_class, match_indices, + self.num_classes, num_gts, postfix, iou_score)) + loss.update( + self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, + postfix)) + if masks is not None and gt_mask is not None: + loss.update( + self._get_loss_mask(masks, gt_mask, match_indices, num_gts, + postfix)) + return loss + + def forward(self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + **kwargs): + r""" + Args: + boxes (Tensor): [l, b, query, 4] + logits (Tensor): [l, b, query, num_classes] + gt_bbox (List(Tensor)): list[[n, 4]] + gt_class (List(Tensor)): list[[n, 1]] + masks (Tensor, optional): [l, b, query, h, w] + gt_mask (List(Tensor), optional): list[[n, H, W]] + postfix (str): postfix of loss name + """ + + dn_match_indices = kwargs.get("dn_match_indices", None) + num_gts = kwargs.get("num_gts", None) + if num_gts is None: + num_gts = self._get_num_gts(gt_class) + + total_loss = self._get_prediction_loss( + boxes[-1], + logits[-1], + gt_bbox, + gt_class, + masks=masks[-1] if masks is not None else None, + gt_mask=gt_mask, + postfix=postfix, + dn_match_indices=dn_match_indices, + num_gts=num_gts) + + if self.aux_loss: + total_loss.update( + self._get_loss_aux( + boxes[:-1], + logits[:-1], + gt_bbox, + gt_class, + self.num_classes, + num_gts, + dn_match_indices, + postfix, + masks=masks[:-1] if masks is not None else None, + gt_mask=gt_mask)) + + return total_loss + + +@register +class DINOLoss(DETRLoss): + def forward(self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_meta=None, + **kwargs): + num_gts = self._get_num_gts(gt_class) + total_loss = super(DINOLoss, self).forward( + boxes, logits, gt_bbox, gt_class, num_gts=num_gts) + + if dn_meta is not None: + dn_positive_idx, dn_num_group = \ + dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + assert len(gt_class) == len(dn_positive_idx) + + # denoising match indices + dn_match_indices = self.get_dn_match_indices( + gt_class, dn_positive_idx, dn_num_group) + + # compute denoising training loss + num_gts *= dn_num_group + dn_loss = super(DINOLoss, self).forward( + dn_out_bboxes, + dn_out_logits, + gt_bbox, + gt_class, + postfix="_dn", + dn_match_indices=dn_match_indices, + num_gts=num_gts) + total_loss.update(dn_loss) + else: + total_loss.update( + {k + '_dn': paddle.to_tensor([0.]) + for k in total_loss.keys()}) + + return total_loss + + @staticmethod + def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): + dn_match_indices = [] + for i in range(len(labels)): + num_gt = len(labels[i]) + if num_gt > 0: + gt_idx = paddle.arange(end=num_gt, dtype="int64") + gt_idx = gt_idx.tile([dn_num_group]) + assert len(dn_positive_idx[i]) == len(gt_idx) + 
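+                # Every denoising group repeats the same gt indices, so the
+                # tiled gt_idx pairs one-to-one with the positive denoising
+                # queries and no Hungarian matching is needed for them.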
dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append((paddle.zeros( + [0], dtype="int64"), paddle.zeros( + [0], dtype="int64"))) + return dn_match_indices + + +@register +class MaskDINOLoss(DETRLoss): + __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] + __inject__ = ['matcher'] + + def __init__(self, + num_classes=80, + matcher='HungarianMatcher', + loss_coeff={ + 'class': 4, + 'bbox': 5, + 'giou': 2, + 'mask': 5, + 'dice': 5 + }, + aux_loss=True, + use_focal_loss=False, + num_sample_points=12544, + oversample_ratio=3.0, + important_sample_ratio=0.75): + super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, + aux_loss, use_focal_loss) + assert oversample_ratio >= 1 + assert important_sample_ratio <= 1 and important_sample_ratio >= 0 + + self.num_sample_points = num_sample_points + self.oversample_ratio = oversample_ratio + self.important_sample_ratio = important_sample_ratio + self.num_oversample_points = int(num_sample_points * oversample_ratio) + self.num_important_points = int(num_sample_points * + important_sample_ratio) + self.num_random_points = num_sample_points - self.num_important_points + + def forward(self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_out_masks=None, + dn_meta=None, + **kwargs): + num_gts = self._get_num_gts(gt_class) + total_loss = super(MaskDINOLoss, self).forward( + boxes, + logits, + gt_bbox, + gt_class, + masks=masks, + gt_mask=gt_mask, + num_gts=num_gts) + + if dn_meta is not None: + dn_positive_idx, dn_num_group = \ + dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + assert len(gt_class) == len(dn_positive_idx) + + # denoising match indices + dn_match_indices = DINOLoss.get_dn_match_indices( + gt_class, dn_positive_idx, dn_num_group) + + # compute denoising training loss + num_gts *= dn_num_group + dn_loss = super(MaskDINOLoss, self).forward( + dn_out_bboxes, + dn_out_logits, + gt_bbox, + gt_class, + masks=dn_out_masks, + gt_mask=gt_mask, + postfix="_dn", + dn_match_indices=dn_match_indices, + num_gts=num_gts) + total_loss.update(dn_loss) + else: + total_loss.update( + {k + '_dn': paddle.to_tensor([0.]) + for k in total_loss.keys()}) + + return total_loss + + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, + postfix=""): + # masks: [b, query, h, w], gt_mask: list[[n, H, W]] + name_mask = "loss_mask" + postfix + name_dice = "loss_dice" + postfix + + loss = dict() + if sum(len(a) for a in gt_mask) == 0: + loss[name_mask] = paddle.to_tensor([0.]) + loss[name_dice] = paddle.to_tensor([0.]) + return loss + + src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, + match_indices) + # sample points + sample_points = self._get_point_coords_by_uncertainty(src_masks) + sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0 + + src_masks = F.grid_sample( + src_masks.unsqueeze(1), sample_points, + align_corners=False).squeeze([1, 2]) + + target_masks = F.grid_sample( + target_masks.unsqueeze(1), sample_points, + align_corners=False).squeeze([1, 2]).detach() + + loss[name_mask] = self.loss_coeff[ + 'mask'] * F.binary_cross_entropy_with_logits( + src_masks, target_masks, + reduction='none').mean(1).sum() / num_gts + loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( + src_masks, target_masks, num_gts) + return loss + + def _get_point_coords_by_uncertainty(self, masks): + # Sample points based on their uncertainty. 
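+        # PointRend-style importance sampling: oversample random points,
+        # score uncertainty as -|mask logit| (largest near the decision
+        # boundary), keep the top num_important_points of them, then pad
+        # with num_random_points uniformly random points.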
+ masks = masks.detach() + num_masks = masks.shape[0] + sample_points = paddle.rand( + [num_masks, 1, self.num_oversample_points, 2]) + + out_mask = F.grid_sample( + masks.unsqueeze(1), 2.0 * sample_points - 1.0, + align_corners=False).squeeze([1, 2]) + out_mask = -paddle.abs(out_mask) + + _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1) + batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + + sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind) + if self.num_random_points > 0: + sample_points = paddle.concat( + [ + sample_points, + paddle.rand([num_masks, self.num_random_points, 2]) + ], + axis=1) + return sample_points diff --git a/ppdet/modeling/losses/focal_loss.py b/ppdet/modeling/losses/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b9a64e1bc22d7e69256b311639ceb450c1381798 --- /dev/null +++ b/ppdet/modeling/losses/focal_loss.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from ppdet.core.workspace import register + +__all__ = ['FocalLoss', 'Weighted_FocalLoss'] + +@register +class FocalLoss(nn.Layer): + """A wrapper around paddle.nn.functional.sigmoid_focal_loss. + Args: + use_sigmoid (bool): currently only support use_sigmoid=True + alpha (float): parameter alpha in Focal Loss + gamma (float): parameter gamma in Focal Loss + loss_weight (float): final loss will be multiplied by this + """ + def __init__(self, + use_sigmoid=True, + alpha=0.25, + gamma=2.0, + loss_weight=1.0): + super(FocalLoss, self).__init__() + assert use_sigmoid == True, \ + 'Focal Loss only supports sigmoid at the moment' + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.loss_weight = loss_weight + + def forward(self, pred, target, reduction='none'): + """forward function. + Args: + pred (Tensor): logits of class prediction, of shape (N, num_classes) + target (Tensor): target class label, of shape (N, ) + reduction (str): the way to reduce loss, one of (none, sum, mean) + """ + num_classes = pred.shape[1] + target = F.one_hot(target, num_classes+1).cast(pred.dtype) + target = target[:, :-1].detach() + loss = F.sigmoid_focal_loss( + pred, target, alpha=self.alpha, gamma=self.gamma, + reduction=reduction) + return loss * self.loss_weight + + +@register +class Weighted_FocalLoss(FocalLoss): + """A wrapper around paddle.nn.functional.sigmoid_focal_loss. 
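+    Extends FocalLoss with an optional per-prediction weight, an avg_factor
+    based normalization and a configurable default reduction.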
+ Args: + use_sigmoid (bool): currently only support use_sigmoid=True + alpha (float): parameter alpha in Focal Loss + gamma (float): parameter gamma in Focal Loss + loss_weight (float): final loss will be multiplied by this + """ + def __init__(self, + use_sigmoid=True, + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + reduction="mean"): + super(FocalLoss, self).__init__() + assert use_sigmoid == True, \ + 'Focal Loss only supports sigmoid at the moment' + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.loss_weight = loss_weight + self.reduction = reduction + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): + """forward function. + Args: + pred (Tensor): logits of class prediction, of shape (N, num_classes) + target (Tensor): target class label, of shape (N, ) + reduction (str): the way to reduce loss, one of (none, sum, mean) + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + num_classes = pred.shape[1] + target = F.one_hot(target, num_classes + 1).astype(pred.dtype) + target = target[:, :-1].detach() + loss = F.sigmoid_focal_loss( + pred, target, alpha=self.alpha, gamma=self.gamma, + reduction='none') + + if weight is not None: + if weight.shape != loss.shape: + if weight.shape[0] == loss.shape[0]: + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.reshape((-1, 1)) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.reshape((loss.shape[0], -1)) + assert weight.ndim == loss.ndim + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + if reduction == 'mean': + loss = loss.mean() + elif reduction == 'sum': + loss = loss.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = 1e-10 + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + + return loss * self.loss_weight diff --git a/ppdet/modeling/losses/gfocal_loss.py b/ppdet/modeling/losses/gfocal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..37e27f084e624491cc8226420548ea498f86d863 --- /dev/null +++ b/ppdet/modeling/losses/gfocal_loss.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
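+# Quality Focal Loss in brief: negatives are pushed toward a 0 quality score
+# with a BCE term scaled by sigmoid(pred)**beta, while positives are pushed
+# toward their IoU-based quality score with a |score - sigmoid(pred)|**beta
+# weight. A rough usage sketch (tensor names and num_pos are assumptions,
+# not taken from a config):
+#
+#   qfl = QualityFocalLoss(beta=2.0, reduction='mean')
+#   # pred: [N, num_classes] logits; label: [N] ints with background
+#   # encoded as num_classes; score: [N] IoU quality targets in [0, 1]
+#   loss = qfl(pred, (label, score), avg_factor=num_pos)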
+ +# The code is based on: +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling import ops + +__all__ = ['QualityFocalLoss', 'DistributionFocalLoss'] + + +def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True): + """ + Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + Args: + pred (Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + Returns: + Tensor: Loss tensor with shape (N,). + """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + if use_sigmoid: + func = F.binary_cross_entropy_with_logits + else: + func = F.binary_cross_entropy + + # negatives are supervised by 0 quality score + pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred + scale_factor = pred_sigmoid + zerolabel = paddle.zeros(pred.shape, dtype='float32') + loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.shape[1] + pos = paddle.logical_and((label >= 0), + (label < bg_class_ind)).nonzero().squeeze(1) + if pos.shape[0] == 0: + return loss.sum(axis=1) + pos_label = paddle.gather(label, pos, axis=0) + pos_mask = np.zeros(pred.shape, dtype=np.int32) + pos_mask[pos.numpy(), pos_label.numpy()] = 1 + pos_mask = paddle.to_tensor(pos_mask, dtype='bool') + score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32') + # positives are supervised by bbox quality (IoU) score + scale_factor_new = score - pred_sigmoid + + loss_pos = func( + pred, score, reduction='none') * scale_factor_new.abs().pow(beta) + loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask + loss = loss.sum(axis=1) + return loss + + +def distribution_focal_loss(pred, label): + """Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + Args: + pred (Tensor): Predicted general distribution of bounding boxes + (before softmax) with shape (N, n+1), n is the max value of the + integral set `{0, ..., n}` in paper. + label (Tensor): Target distance label for bounding boxes with + shape (N,). + Returns: + Tensor: Loss tensor with shape (N,). 
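+        For example, a target label of 2.4 contributes
+        0.6 * CE(pred, 2) + 0.4 * CE(pred, 3).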
+ """ + dis_left = label.cast('int64') + dis_right = dis_left + 1 + weight_left = dis_right.cast('float32') - label + weight_right = label - dis_left.cast('float32') + loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + + F.cross_entropy(pred, dis_right, reduction='none') * weight_right + return loss + + +@register +@serializable +class QualityFocalLoss(nn.Layer): + r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + Args: + use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. + Defaults to True. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, + use_sigmoid=True, + beta=2.0, + reduction='mean', + loss_weight=1.0): + super(QualityFocalLoss, self).__init__() + self.use_sigmoid = use_sigmoid + self.beta = beta + assert reduction in ('none', 'mean', 'sum') + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None): + """Forward function. + Args: + pred (Tensor): Predicted joint representation of + classification and quality (IoU) estimation with shape (N, C), + C is the number of classes. + target (tuple([Tensor])): Target category label with shape + (N,) and target quality label with shape (N,). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + loss = self.loss_weight * quality_focal_loss( + pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid) + + if weight is not None: + loss = loss * weight + if avg_factor is None: + if self.reduction == 'none': + return loss + elif self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if self.reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif self.reduction != 'none': + raise ValueError( + 'avg_factor can not be used with reduction="sum"') + return loss + + +@register +@serializable +class DistributionFocalLoss(nn.Layer): + """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(DistributionFocalLoss, self).__init__() + assert reduction in ('none', 'mean', 'sum') + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None): + """Forward function. + Args: + pred (Tensor): Predicted general distribution of bounding + boxes (before softmax) with shape (N, n+1), n is the max value + of the integral set `{0, ..., n}` in paper. + target (Tensor): Target distance label for bounding boxes + with shape (N,). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
+ """ + loss = self.loss_weight * distribution_focal_loss(pred, target) + if weight is not None: + loss = loss * weight + if avg_factor is None: + if self.reduction == 'none': + return loss + elif self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if self.reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif self.reduction != 'none': + raise ValueError( + 'avg_factor can not be used with reduction="sum"') + return loss diff --git a/ppdet/modeling/losses/iou_aware_loss.py b/ppdet/modeling/losses/iou_aware_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9e904dd8266c606f61e35bb52121865476997e --- /dev/null +++ b/ppdet/modeling/losses/iou_aware_loss.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from .iou_loss import IouLoss +from ..bbox_utils import bbox_iou + + +@register +@serializable +class IouAwareLoss(IouLoss): + """ + iou aware loss, see https://arxiv.org/abs/1912.05992 + Args: + loss_weight (float): iou aware loss weight, default is 1.0 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + """ + + def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): + super(IouAwareLoss, self).__init__( + loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) + + def __call__(self, ioup, pbox, gbox): + iou = bbox_iou( + pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) + iou.stop_gradient = True + loss_iou_aware = F.binary_cross_entropy_with_logits( + ioup, iou, reduction='none') + loss_iou_aware = loss_iou_aware * self.loss_weight + return loss_iou_aware diff --git a/ppdet/modeling/losses/iou_loss.py b/ppdet/modeling/losses/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b5cac22e342e633b5c413805623ba4015073b3b1 --- /dev/null +++ b/ppdet/modeling/losses/iou_loss.py @@ -0,0 +1,295 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
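+# GIoU in brief: giou = iou - (enclosing_area - union) / enclosing_area and
+# the loss is 1 - giou. A minimal sketch of calling GIoULoss below, assuming
+# corner-format (x1, y1, x2, y2) boxes:
+#
+#   import paddle
+#   giou_loss = GIoULoss(reduction='mean')
+#   pbox = paddle.to_tensor([[0., 0., 10., 10.]])
+#   gbox = paddle.to_tensor([[2., 2., 12., 12.]])
+#   loss = giou_loss(pbox, gbox)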
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import math +import paddle + +from ppdet.core.workspace import register, serializable +from ..bbox_utils import bbox_iou + +__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] + + +@register +@serializable +class IouLoss(object): + """ + iou loss, see https://arxiv.org/abs/1908.03851 + loss = 1.0 - iou * iou + Args: + loss_weight (float): iou loss weight, default is 2.5 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + ciou_term (bool): whether to add ciou_term + loss_square (bool): whether to square the iou term + """ + + def __init__(self, + loss_weight=2.5, + giou=False, + diou=False, + ciou=False, + loss_square=True): + self.loss_weight = loss_weight + self.giou = giou + self.diou = diou + self.ciou = ciou + self.loss_square = loss_square + + def __call__(self, pbox, gbox): + iou = bbox_iou( + pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) + if self.loss_square: + loss_iou = 1 - iou * iou + else: + loss_iou = 1 - iou + + loss_iou = loss_iou * self.loss_weight + return loss_iou + + +@register +@serializable +class GIoULoss(object): + """ + Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + reduction (string): Options are "none", "mean" and "sum". default as none + """ + + def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): + self.loss_weight = loss_weight + self.eps = eps + assert reduction in ('none', 'mean', 'sum') + self.reduction = reduction + + def bbox_overlap(self, box1, box2, eps=1e-10): + """calculate the iou of box1 and box2 + Args: + box1 (Tensor): box1 with the shape (..., 4) + box2 (Tensor): box1 with the shape (..., 4) + eps (float): epsilon to avoid divide by zero + Return: + iou (Tensor): iou of box1 and box2 + overlap (Tensor): overlap of box1 and box2 + union (Tensor): union of box1 and box2 + """ + x1, y1, x2, y2 = box1 + x1g, y1g, x2g, y2g = box2 + + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + w_inter = (xkis2 - xkis1).clip(0) + h_inter = (ykis2 - ykis1).clip(0) + overlap = w_inter * h_inter + + area1 = (x2 - x1) * (y2 - y1) + area2 = (x2g - x1g) * (y2g - y1g) + union = area1 + area2 - overlap + eps + iou = overlap / union + + return iou, overlap, union + + def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps + miou = iou - ((area_c - union) / area_c) + if loc_reweight is not None: + loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) + loc_thresh = 0.9 + giou = 1 - (1 - loc_thresh + ) * miou - loc_thresh * miou * loc_reweight + else: + giou = 1 - miou + if self.reduction == 'none': + loss = giou + elif self.reduction == 'sum': + loss = paddle.sum(giou * iou_weight) + else: + loss = paddle.mean(giou * iou_weight) + return loss * 
self.loss_weight + + +@register +@serializable +class DIouLoss(GIoULoss): + """ + Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + use_complete_iou_loss (bool): whether to use complete iou loss + """ + + def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): + super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) + self.use_complete_iou_loss = use_complete_iou_loss + + def __call__(self, pbox, gbox, iou_weight=1.): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = x2 - x1 + h = y2 - y1 + + cxg = (x1g + x2g) / 2 + cyg = (y1g + y2g) / 2 + wg = x2g - x1g + hg = y2g - y1g + + x2 = paddle.maximum(x1, x2) + y2 = paddle.maximum(y1, y2) + + # A and B + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + + # A or B + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) + intsctk = intsctk * paddle.greater_than( + xkis2, xkis1) * paddle.greater_than(ykis2, ykis1) + unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g + ) - intsctk + self.eps + iouk = intsctk / unionk + + # DIOU term + dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) + dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) + diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) + + # CIOU term + ciou_term = 0 + if self.use_complete_iou_loss: + ar_gt = wg / hg + ar_pred = w / h + arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) + ar_loss = 4. / np.pi / np.pi * arctan * arctan + alpha = ar_loss / (1 - iouk + ar_loss + self.eps) + alpha.stop_gradient = True + ciou_term = alpha * ar_loss + + diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) + + return diou * self.loss_weight + + +@register +@serializable +class SIoULoss(GIoULoss): + """ + see https://arxiv.org/pdf/2205.12740.pdf + Args: + loss_weight (float): siou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + theta (float): default as 4 + reduction (str): Options are "none", "mean" and "sum". 
default as none + """ + + def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): + super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) + self.loss_weight = loss_weight + self.eps = eps + self.theta = theta + self.reduction = reduction + + def __call__(self, pbox, gbox): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou = bbox_iou(box1, box2) + + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = x2 - x1 + self.eps + h = y2 - y1 + self.eps + + cxg = (x1g + x2g) / 2 + cyg = (y1g + y2g) / 2 + wg = x2g - x1g + self.eps + hg = y2g - y1g + self.eps + + x2 = paddle.maximum(x1, x2) + y2 = paddle.maximum(y1, y2) + + # A or B + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + cw_out = xc2 - xc1 + ch_out = yc2 - yc1 + + ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) + cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) + + # angle cost + dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) + sin_angle_alpha = ch / dist_intersection + sin_angle_beta = cw / dist_intersection + thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 + thred.stop_gradient = True + sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, + sin_angle_alpha) + angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) + + # distance cost + gamma = 2 - angle_cost + # gamma.stop_gradient = True + beta_x = ((cxg - cx) / cw_out)**2 + beta_y = ((cyg - cy) / ch_out)**2 + dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * + beta_y) + + # shape cost + omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) + omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) + omega = (1 - paddle.exp(-omega_w))**self.theta + ( + 1 - paddle.exp(-omega_h))**self.theta + siou_loss = 1 - iou + (omega + dist_cost) / 2 + + if self.reduction == 'mean': + siou_loss = paddle.mean(siou_loss) + elif self.reduction == 'sum': + siou_loss = paddle.sum(siou_loss) + + return siou_loss * self.loss_weight diff --git a/ppdet/modeling/losses/smooth_l1_loss.py b/ppdet/modeling/losses/smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..dab666e06a5c6879ba5074ec99aa790e8d526dc3 --- /dev/null +++ b/ppdet/modeling/losses/smooth_l1_loss.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +__all__ = ['SmoothL1Loss'] + + +@register +class SmoothL1Loss(nn.Layer): + """Smooth L1 Loss. 
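+    Computes 0.5 * x**2 / beta when |x| < beta and |x| - 0.5 * beta
+    otherwise, with x = pred - target (the fvcore definition).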
+ Args: + beta (float): controls smooth region, it becomes L1 Loss when beta=0.0 + loss_weight (float): the final loss will be multiplied by this + """ + + def __init__(self, beta=1.0, loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + assert beta >= 0 + self.beta = beta + self.loss_weight = loss_weight + + def forward(self, pred, target, reduction='none'): + """forward function, based on fvcore. + Args: + pred (Tensor): prediction tensor + target (Tensor): target tensor, pred.shape must be the same as target.shape + reduction (str): the way to reduce loss, one of (none, sum, mean) + """ + assert reduction in ('none', 'sum', 'mean') + target = target.detach() + if self.beta < 1e-5: + loss = paddle.abs(pred - target) + else: + n = paddle.abs(pred - target) + cond = n < self.beta + loss = paddle.where(cond, 0.5 * n**2 / self.beta, + n - 0.5 * self.beta) + if reduction == 'mean': + loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum() + elif reduction == 'sum': + loss = loss.sum() + return loss * self.loss_weight diff --git a/ppdet/modeling/losses/varifocal_loss.py b/ppdet/modeling/losses/varifocal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..42d18a659e824f8e1aacbe2b4bdd1b0c9b6bbf04 --- /dev/null +++ b/ppdet/modeling/losses/varifocal_loss.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling import ops + +__all__ = ['VarifocalLoss'] + + +def varifocal_loss(pred, + target, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + use_sigmoid=True): + """`Varifocal Loss `_ + + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes + target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal Loss. + Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive example with the iou target. Defaults to True. 
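+        use_sigmoid (bool, optional): Whether `pred` holds raw logits and
+            sigmoid is applied internally. Defaults to True.
+    Returns:
+        Tensor: Loss tensor with shape (N,), summed over the class axis.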
+ """ + # pred and target should be of the same size + assert pred.shape == target.shape + if use_sigmoid: + pred_new = F.sigmoid(pred) + else: + pred_new = pred + target = target.cast(pred.dtype) + if iou_weighted: + focal_weight = target * (target > 0.0).cast('float32') + \ + alpha * (pred_new - target).abs().pow(gamma) * \ + (target <= 0.0).cast('float32') + else: + focal_weight = (target > 0.0).cast('float32') + \ + alpha * (pred_new - target).abs().pow(gamma) * \ + (target <= 0.0).cast('float32') + + if use_sigmoid: + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + else: + loss = F.binary_cross_entropy( + pred, target, reduction='none') * focal_weight + loss = loss.sum(axis=1) + return loss + + +@register +@serializable +class VarifocalLoss(nn.Layer): + def __init__(self, + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='mean', + loss_weight=1.0): + """`Varifocal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether the prediction is + used for sigmoid or softmax. Defaults to True. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal + Loss. Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive examples with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + super(VarifocalLoss, self).__init__() + assert alpha >= 0.0 + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.iou_weighted = iou_weighted + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None): + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + Returns: + Tensor: The calculated loss + """ + loss = self.loss_weight * varifocal_loss( + pred, + target, + alpha=self.alpha, + gamma=self.gamma, + iou_weighted=self.iou_weighted, + use_sigmoid=self.use_sigmoid) + + if weight is not None: + loss = loss * weight + if avg_factor is None: + if self.reduction == 'none': + return loss + elif self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if self.reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif self.reduction != 'none': + raise ValueError( + 'avg_factor can not be used with reduction="sum"') + return loss diff --git a/ppdet/modeling/losses/yolo_loss.py b/ppdet/modeling/losses/yolo_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..59df7cb9a3afdf33ef5930cf2e01c08754dda3fd --- /dev/null +++ b/ppdet/modeling/losses/yolo_loss.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity + +__all__ = ['YOLOv3Loss'] + +# YOLOv3,PP-YOLO,PP-YOLOv2 use 'YOLOv3Loss' + + +def bbox_transform(pbox, anchor, downsample): + pbox = decode_yolo(pbox, anchor, downsample) + pbox = xywh2xyxy(pbox) + return pbox + + +@register +class YOLOv3Loss(nn.Layer): + + __inject__ = ['iou_loss', 'iou_aware_loss'] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + ignore_thresh=0.7, + label_smooth=False, + downsample=[32, 16, 8], + scale_x_y=1., + iou_loss=None, + iou_aware_loss=None): + """ + YOLOv3Loss layer + + Args: + num_calsses (int): number of foreground classes + ignore_thresh (float): threshold to ignore confidence loss + label_smooth (bool): whether to use label smoothing + downsample (list): downsample ratio for each detection block + scale_x_y (float): scale_x_y factor + iou_loss (object): IoULoss instance + iou_aware_loss (object): IouAwareLoss instance + """ + super(YOLOv3Loss, self).__init__() + self.num_classes = num_classes + self.ignore_thresh = ignore_thresh + self.label_smooth = label_smooth + self.downsample = downsample + self.scale_x_y = scale_x_y + self.iou_loss = iou_loss + self.iou_aware_loss = iou_aware_loss + self.distill_pairs = [] + + def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): + # pbox + pbox = decode_yolo(pbox, anchor, downsample) + pbox = xywh2xyxy(pbox) + pbox = paddle.concat(pbox, axis=-1) + b = pbox.shape[0] + pbox = pbox.reshape((b, -1, 4)) + # gbox + gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 + gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 + gbox = paddle.concat([gxy, gwh], axis=-1) + + iou = batch_iou_similarity(pbox, gbox) + iou.stop_gradient = True + iou_max = iou.max(2) # [N, M1] + iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) + iou_mask.stop_gradient = True + + pobj = pobj.reshape((b, -1)) + tobj = tobj.reshape((b, -1)) + obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) + obj_mask.stop_gradient = True + + loss_obj = F.binary_cross_entropy_with_logits( + pobj, obj_mask, reduction='none') + loss_obj_pos = (loss_obj * tobj) + loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) + return loss_obj_pos + loss_obj_neg + + def cls_loss(self, pcls, tcls): + if self.label_smooth: + delta = min(1. / self.num_classes, 1. 
/ 40) + pos, neg = 1 - delta, delta + # 1 for positive, 0 for negative + tcls = pos * paddle.cast( + tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( + tcls <= 0., dtype=tcls.dtype) + + loss_cls = F.binary_cross_entropy_with_logits( + pcls, tcls, reduction='none') + return loss_cls + + def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., + eps=1e-10): + na = len(anchor) + b, c, h, w = p.shape + if self.iou_aware_loss: + ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] + ioup = ioup.unsqueeze(-1) + p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) + x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] + w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] + obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] + self.distill_pairs.append([x, y, w, h, obj, pcls]) + + t = t.transpose((0, 1, 3, 4, 2)) + tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] + tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] + tscale = t[:, :, :, :, 4:5] + tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] + + tscale_obj = tscale * tobj + loss = dict() + + x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) + y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) + + if abs(scale - 1.) < eps: + loss_x = F.binary_cross_entropy(x, tx, reduction='none') + loss_y = F.binary_cross_entropy(y, ty, reduction='none') + loss_xy = tscale_obj * (loss_x + loss_y) + else: + loss_x = paddle.abs(x - tx) + loss_y = paddle.abs(y - ty) + loss_xy = tscale_obj * (loss_x + loss_y) + + loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() + + loss_w = paddle.abs(w - tw) + loss_h = paddle.abs(h - th) + loss_wh = tscale_obj * (loss_w + loss_h) + loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() + + loss['loss_xy'] = loss_xy + loss['loss_wh'] = loss_wh + + if self.iou_loss is not None: + # warn: do not modify x, y, w, h in place + box, tbox = [x, y, w, h], [tx, ty, tw, th] + pbox = bbox_transform(box, anchor, downsample) + gbox = bbox_transform(tbox, anchor, downsample) + loss_iou = self.iou_loss(pbox, gbox) + loss_iou = loss_iou * tscale_obj + loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() + loss['loss_iou'] = loss_iou + + if self.iou_aware_loss is not None: + box, tbox = [x, y, w, h], [tx, ty, tw, th] + pbox = bbox_transform(box, anchor, downsample) + gbox = bbox_transform(tbox, anchor, downsample) + loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) + loss_iou_aware = loss_iou_aware * tobj + loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() + loss['loss_iou_aware'] = loss_iou_aware + + box = [x, y, w, h] + loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) + loss_obj = loss_obj.sum(-1).mean() + loss['loss_obj'] = loss_obj + loss_cls = self.cls_loss(pcls, tcls) * tobj + loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() + loss['loss_cls'] = loss_cls + return loss + + def forward(self, inputs, targets, anchors): + np = len(inputs) + gt_targets = [targets['target{}'.format(i)] for i in range(np)] + gt_box = targets['gt_bbox'] + yolo_losses = dict() + self.distill_pairs.clear() + for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, + self.downsample): + yolo_loss = self.yolov3_loss(x, t, gt_box, anchor, downsample, + self.scale_x_y) + for k, v in yolo_loss.items(): + if k in yolo_losses: + yolo_losses[k] += v + else: + yolo_losses[k] = v + + loss = 0 + for k, v in yolo_losses.items(): + loss += v + + yolo_losses['loss'] = loss + return yolo_losses diff --git a/ppdet/modeling/losses/yolov5_loss.py b/ppdet/modeling/losses/yolov5_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..641c1714e24d716efb127655697fddeb8aa40e30 
--- /dev/null +++ b/ppdet/modeling/losses/yolov5_loss.py @@ -0,0 +1,367 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ..bbox_utils import bbox_iou + +__all__ = ['YOLOv5Loss'] + + +@register +class YOLOv5Loss(nn.Layer): + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + downsample_ratios=[8, 16, 32], + balance=[4.0, 1.0, 0.4], + box_weight=0.05, + obj_weight=1.0, + cls_weght=0.5, + bias=0.5, + anchor_t=4.0, + label_smooth_eps=0.): + super(YOLOv5Loss, self).__init__() + self.num_classes = num_classes + self.balance = balance + self.na = 3 # not len(anchors) + self.gr = 1.0 + + self.BCEcls = nn.BCEWithLogitsLoss(reduction="mean") + self.BCEobj = nn.BCEWithLogitsLoss(reduction="mean") + + self.loss_weights = { + 'box': box_weight, + 'obj': obj_weight, + 'cls': cls_weght, + } + + eps = label_smooth_eps if label_smooth_eps > 0 else 0. + self.cls_pos_label = 1.0 - 0.5 * eps + self.cls_neg_label = 0.5 * eps + + self.downsample_ratios = downsample_ratios + self.bias = bias # named 'g' in torch yolov5 + self.off = np.array( + [ + [0, 0], + [1, 0], + [0, 1], + [-1, 0], + [0, -1], # j,k,l,m + ], + dtype=np.float32) * bias # offsets + self.anchor_t = anchor_t + self.to_static = False + + def build_targets(self, outputs, targets, anchors): + if 0: + # collate_batch True + # targets['gt_class'] [bs, max_gt_nums, 1] + # targets['gt_bbox'] [bs, max_gt_nums, 4] + # targets['pad_gt_mask'] [bs, max_gt_nums, 1] + gt_nums = targets['pad_gt_mask'].sum(1).squeeze(-1).numpy() + nt = int(sum(gt_nums)) + anchors = anchors.numpy() + na = anchors.shape[1] # not len(anchors) + tcls, tbox, indices, anch = [], [], [], [] + + gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain + ai = np.tile( + np.arange( + na, dtype=np.float32).reshape(na, 1), [1, nt]) + + batch_size = outputs[0].shape[0] + gt_labels = [] + for idx in range(batch_size): + gt_num = int(gt_nums[idx]) + if gt_num == 0: + continue + gt_bbox = targets['gt_bbox'][idx][:gt_num].numpy() + gt_class = targets['gt_class'][idx][:gt_num].numpy() * 1.0 + img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) + gt_labels.append( + np.concatenate((img_idx, gt_class, gt_bbox), -1)) + else: + gt_nums = [len(bbox) for bbox in targets['gt_bbox']] + nt = int(sum(gt_nums)) + anchors = anchors.numpy() + na = anchors.shape[1] # not len(anchors) + tcls, tbox, indices, anch = [], [], [], [] + + gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain + ai = np.tile( + np.arange( + na, dtype=np.float32).reshape(na, 1), [1, nt]) + + batch_size = outputs[0].shape[0] + gt_labels = [] + for idx in range(batch_size): + gt_num = gt_nums[idx] + if gt_num == 0: + continue + gt_bbox = targets['gt_bbox'][idx][:gt_num] + 
gt_class = targets['gt_class'][idx][:gt_num] * 1.0 + img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) + gt_labels.append( + np.concatenate((img_idx, gt_class, gt_bbox), -1)) + + if (len(gt_labels)): + gt_labels = np.concatenate(gt_labels) + else: + gt_labels = np.zeros([0, 6]) + + targets_labels = np.concatenate((np.tile( + np.expand_dims(gt_labels, 0), [na, 1, 1]), ai[:, :, None]), 2) + g = self.bias # 0.5 + + for i in range(len(anchors)): + anchor = np.array(anchors[i]) / self.downsample_ratios[i] + gain[2:6] = np.array( + outputs[i].shape, dtype=np.float32)[[3, 2, 3, 2]] # xyxy gain + + # Match targets_labels to + t = targets_labels * gain + if nt: + # Matches + r = t[:, :, 4:6] / anchor[:, None] + j = np.maximum(r, 1 / r).max(2) < self.anchor_t + t = t[j] # filter + + # Offsets + gxy = t[:, 2:4] # grid xy + gxi = gain[[2, 3]] - gxy # inverse + j, k = ((gxy % 1 < g) & (gxy > 1)).T + l, m = ((gxi % 1 < g) & (gxi > 1)).T + j = np.stack((np.ones_like(j), j, k, l, m)) + t = np.tile(t, [5, 1, 1])[j] + offsets = (np.zeros_like(gxy)[None] + self.off[:, None])[j] + else: + t = targets_labels[0] + offsets = 0 + + # Define + b, c = t[:, :2].astype(np.int64).T # image, class + gxy = t[:, 2:4] # grid xy + gwh = t[:, 4:6] # grid wh + gij = (gxy - offsets).astype(np.int64) + gi, gj = gij.T # grid xy indices + + # Append + a = t[:, 6].astype(np.int64) # anchor indices + gj, gi = gj.clip(0, gain[3] - 1), gi.clip(0, gain[2] - 1) + indices.append( + (paddle.to_tensor(b), paddle.to_tensor(a), + paddle.to_tensor(gj, 'int64'), paddle.to_tensor(gi, 'int64'))) + tbox.append( + paddle.to_tensor( + np.concatenate((gxy - gij, gwh), 1), dtype=paddle.float32)) + anch.append(paddle.to_tensor(anchor[a])) + tcls.append(paddle.to_tensor(c)) + return tcls, tbox, indices, anch + + def yolov5_loss(self, pi, t_cls, t_box, t_indices, t_anchor, balance): + loss = dict() + b, a, gj, gi = t_indices # image, anchor, gridy, gridx + n = b.shape[0] # number of targets + tobj = paddle.zeros_like(pi[:, :, :, :, 4]) + loss_box = paddle.to_tensor([0.]) + loss_cls = paddle.to_tensor([0.]) + if n: + mask = paddle.stack([b, a, gj, gi], 1) + ps = pi.gather_nd(mask) + # Regression + pxy = F.sigmoid(ps[:, :2]) * 2 - 0.5 + pwh = (F.sigmoid(ps[:, 2:4]) * 2)**2 * t_anchor + pbox = paddle.concat((pxy, pwh), 1) + iou = bbox_iou(pbox.T, t_box.T, x1y1x2y2=False, ciou=True) + loss_box = (1.0 - iou).mean() + + # Objectness + score_iou = paddle.cast(iou.detach().clip(0), tobj.dtype) + # with paddle.no_grad(): + # x = paddle.gather_nd(tobj, mask) + # tobj = paddle.scatter_nd_add( + # tobj, mask, (1.0 - self.gr) + self.gr * score_iou - x) + with paddle.no_grad(): + tobj[b, a, gj, gi] = (1.0 - self.gr + ) + self.gr * score_iou # iou ratio + + # Classification + if self.num_classes > 1: # cls loss (only if multiple classes) + # t = paddle.full_like(ps[:, 5:], self.cls_neg_label) + # t[range(n), t_cls] = self.cls_pos_label + # loss_cls = self.BCEcls(ps[:, 5:], t) + + t = paddle.full_like(ps[:, 5:], self.cls_neg_label) + if not self.to_static: + t = paddle.put_along_axis( + t, + t_cls.unsqueeze(-1), + values=self.cls_pos_label, + axis=1) + else: + for i in range(n): + t[i, t_cls[i]] = self.cls_pos_label + + loss_cls = self.BCEcls(ps[:, 5:], t) + + obji = self.BCEobj(pi[:, :, :, :, 4], tobj) # [bs, 3, h, w] + + loss_obj = obji * balance + + loss['loss_box'] = loss_box * self.loss_weights['box'] + loss['loss_obj'] = loss_obj * self.loss_weights['obj'] + loss['loss_cls'] = loss_cls * self.loss_weights['cls'] + return loss + + def forward(self, inputs, 
targets, anchors): + yolo_losses = dict() + if not self.to_static: + tcls, tbox, indices, anch = self.build_targets(inputs, targets, + anchors) + else: + tcls, tbox, indices, anch = self.build_targets_paddle( + inputs, targets, anchors) + + for i, (p_det, balance) in enumerate(zip(inputs, self.balance)): + t_cls = tcls[i] + t_box = tbox[i] + t_anchor = anch[i] + t_indices = indices[i] + + bs, ch, h, w = p_det.shape + pi = p_det.reshape( + (bs, self.na, int(ch / self.na), h, w)).transpose( + (0, 1, 3, 4, 2)) + + yolo_loss = self.yolov5_loss(pi, t_cls, t_box, t_indices, t_anchor, + balance) + + for k, v in yolo_loss.items(): + if k in yolo_losses: + yolo_losses[k] += v + else: + yolo_losses[k] = v + + batch_size = inputs[0].shape[0] + num_gpus = targets.get('num_gpus', 8) + loss = 0 + for k, v in yolo_losses.items(): + yolo_losses[k] = v * batch_size * num_gpus + loss += yolo_losses[k] + yolo_losses['loss'] = loss + return yolo_losses + + def build_targets_paddle(self, outputs, targets, anchors): + # targets['gt_class'] [bs, max_gt_nums, 1] + # targets['gt_bbox'] [bs, max_gt_nums, 4] + # targets['pad_gt_mask'] [bs, max_gt_nums, 1] + gt_nums = [len(bbox) for bbox in targets['gt_bbox']] + nt = int(sum(gt_nums)) + anchors = anchors + na = anchors.shape[1] # not len(anchors) + tcls, tbox, indices, anch = [], [], [], [] + + gain = paddle.ones( + [7], dtype=paddle.float32) # normalized to gridspace gain + ai = paddle.tile( + paddle.arange( + na, dtype=paddle.float32).reshape([na, 1]), [1, nt]) + + batch_size = outputs[0].shape[0] + gt_labels = [] + for i, ( + gt_num, gt_bboxs, gt_classes + ) in enumerate(zip(gt_nums, targets['gt_bbox'], targets['gt_class'])): + if gt_num == 0: + continue + gt_bbox = gt_bboxs[:gt_num].astype('float32') + gt_class = (gt_classes[:gt_num] * 1.0).astype('float32') + img_idx = paddle.repeat_interleave( + paddle.to_tensor([i]), gt_num, + axis=0)[None, :].astype('float32').T + + gt_labels.append( + paddle.concat( + (img_idx, gt_class, gt_bbox), axis=-1)) + + if (len(gt_labels)): + gt_labels = paddle.concat(gt_labels) + else: + gt_labels = paddle.zeros([0, 6], dtype=paddle.float32) + + targets_labels = paddle.concat((paddle.tile( + paddle.unsqueeze(gt_labels, 0), [na, 1, 1]), ai[:, :, None]), 2) + g = self.bias # 0.5 + + for i in range(len(anchors)): + anchor = anchors[i] / self.downsample_ratios[i] + gain[2:6] = paddle.to_tensor( + outputs[i].shape, + dtype=paddle.float32)[[3, 2, 3, 2]] # xyxy gain + + # Match targets_labels to + t = targets_labels * gain + if nt: + # Matches + r = t[:, :, 4:6] / anchor[:, None] + j = paddle.maximum(r, 1 / r).max(2) < self.anchor_t + t = paddle.flatten(t, 0, 1) + j = paddle.flatten(j.astype(paddle.int32), 0, + 1).astype(paddle.bool) + t = t[j] # filter + + # Offsets + gxy = t[:, 2:4] # grid xy + gxi = gain[[2, 3]] - gxy # inverse + j, k = ((gxy % 1 < g) & (gxy > 1)).T.astype(paddle.int64) + l, m = ((gxi % 1 < g) & (gxi > 1)).T.astype(paddle.int64) + j = paddle.flatten( + paddle.stack((paddle.ones_like(j), j, k, l, m)), 0, + 1).astype(paddle.bool) + t = paddle.flatten(paddle.tile(t, [5, 1, 1]), 0, 1) + t = t[j] + offsets = paddle.zeros_like(gxy)[None, :] + paddle.to_tensor( + self.off)[:, None] + offsets = paddle.flatten(offsets, 0, 1)[j] + else: + t = targets_labels[0] + offsets = 0 + + # Define + b, c = t[:, :2].astype(paddle.int64).T # image, class + gxy = t[:, 2:4] # grid xy + gwh = t[:, 4:6] # grid wh + gij = (gxy - offsets).astype(paddle.int64) + gi, gj = gij.T # grid xy indices + + # Append + a = t[:, 6].astype(paddle.int64) # 
anchor indices + gj, gi = gj.clip(0, gain[3] - 1), gi.clip(0, gain[2] - 1) + indices.append( + (b, a, gj.astype(paddle.int64), gi.astype(paddle.int64))) + tbox.append( + paddle.concat((gxy - gij, gwh), 1).astype(paddle.float32)) + anch.append(anchor[a]) + tcls.append(c) + return tcls, tbox, indices, anch diff --git a/ppdet/modeling/losses/yolov7_loss.py b/ppdet/modeling/losses/yolov7_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7ceefaaa5c71fb6160cd1f0bb9090c99145fe150 --- /dev/null +++ b/ppdet/modeling/losses/yolov7_loss.py @@ -0,0 +1,720 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ..bbox_utils import bbox_iou + +__all__ = ['YOLOv7Loss'] + + +@register +class YOLOv7Loss(nn.Layer): + """ + this code is based on https://github.com/WongKinYiu/yolov7 + """ + __shared__ = ['num_classes', 'use_aux'] + + def __init__(self, + num_classes=80, + downsample_ratios=[8, 16, 32], + balance=[4.0, 1.0, 0.4], + box_weight=0.05, + cls_weght=0.3, + obj_weight=0.7, + bias=0.5, + anchor_t=4.0, + label_smooth_eps=0., + use_aux=False): + super(YOLOv7Loss, self).__init__() + self.num_classes = num_classes + self.balance = balance + self.use_aux = use_aux + if self.use_aux: + self.balance = balance * 2 + self.na = 3 # len(anchors[0]) not len(anchors) + self.gr = 1.0 + + self.BCEcls = nn.BCEWithLogitsLoss( + pos_weight=paddle.to_tensor([1.0]), reduction="mean") + self.BCEobj = nn.BCEWithLogitsLoss( + pos_weight=paddle.to_tensor([1.0]), reduction="mean") + + self.loss_weights = { + 'box': box_weight, + 'obj': obj_weight, + 'cls': cls_weght, + } + + eps = label_smooth_eps if label_smooth_eps > 0 else 0. 
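Both YOLOv5Loss above and YOLOv7Loss smooth the one-hot class target symmetrically through the cls_pos_label / cls_neg_label pair derived from eps just below; a minimal NumPy sketch, with purely illustrative values, shows what the resulting target row looks like:

import numpy as np

# Illustrative values only: eps, num_classes and the GT class index are assumptions.
label_smooth_eps = 0.1
cls_pos_label = 1.0 - 0.5 * label_smooth_eps   # 0.95 at the ground-truth class
cls_neg_label = 0.5 * label_smooth_eps         # 0.05 at every other class
num_classes, gt_class = 5, 2
t = np.full([num_classes], cls_neg_label, dtype=np.float32)
t[gt_class] = cls_pos_label
print(t)  # [0.05 0.05 0.95 0.05 0.05]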
+ self.cls_pos_label = 1.0 - 0.5 * eps + self.cls_neg_label = 0.5 * eps + + self.downsample_ratios = downsample_ratios + if self.use_aux: + self.downsample_ratios = downsample_ratios * 2 + self.bias = bias # named 'g' in torch yolov5/yolov7 + self.off = np.array( + [ + [0, 0], + [1, 0], + [0, 1], + [-1, 0], + [0, -1], # j,k,l,m + ], + dtype=np.float32) * bias # offsets + self.anchor_t = anchor_t + + def forward(self, head_outs, gt_targets, anchors): + self.nl = len(anchors) + + # 1.split head_outs feature from [b,c,h,w] to [b,na,c//na,h,w] + inputs = [] + for i in range(self.nl): + pi = head_outs[i] + bs, _, h, w = pi.shape + pi = pi.reshape((bs, self.na, -1, h, w)).transpose((0, 1, 3, 4, 2)) + inputs.append(pi) + if self.use_aux: + for i in range(self.nl): + pi = head_outs[i + self.nl] + bs, _, h, w = pi.shape + pi = pi.reshape((bs, self.na, -1, h, w)).transpose( + (0, 1, 3, 4, 2)) + inputs.append(pi) + + # 2.generate targets_labels [nt, 6] from gt_targets(dict) + anchors = anchors.numpy() + if 0: + # collate_batch True + # gt_targets['gt_class'] [bs, max_gt_nums, 1] + # gt_targets['gt_bbox'] [bs, max_gt_nums, 4] + # gt_targets['pad_gt_mask'] [bs, max_gt_nums, 1] + gt_nums = gt_targets['pad_gt_mask'].sum(1).squeeze(-1).numpy() + batch_size = head_outs[0].shape[0] + targets_labels = [] # [nt, 6] + for idx in range(batch_size): + gt_num = int(gt_nums[idx]) + if gt_num == 0: + continue + gt_bbox = gt_targets['gt_bbox'][idx][:gt_num].reshape( + [-1, 4]).numpy() + gt_class = gt_targets['gt_class'][idx][:gt_num].reshape( + [-1, 1]).numpy() * 1.0 + img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) + targets_labels.append( + np.concatenate((img_idx, gt_class, gt_bbox), -1)) + else: + gt_nums = [len(bbox) for bbox in gt_targets['gt_bbox']] + batch_size = head_outs[0].shape[0] + targets_labels = [] # [nt, 6] + for idx in range(batch_size): + gt_num = int(gt_nums[idx]) + if gt_num == 0: + continue + gt_bbox = gt_targets['gt_bbox'][idx][:gt_num].reshape([-1, 4]) + gt_class = gt_targets['gt_class'][idx][:gt_num].reshape( + [-1, 1]) * 1.0 + img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) + targets_labels.append( + np.concatenate((img_idx, gt_class, gt_bbox), -1)) + + if (len(targets_labels)): + targets_labels = np.concatenate(targets_labels) + else: + targets_labels = np.zeros([0, 6]) + + # 3.build targets + batch_images = gt_targets['image'] # just get shape + if not self.use_aux: + bs, as_, gjs, gis, targets, anchors = self.build_targets( + inputs, targets_labels, anchors, batch_images) + pre_gen_gains = [ + paddle.to_tensor(pp.shape, 'float32')[[3, 2, 3, 2]] + for pp in inputs + ] + else: + bs_aux, as_aux_, gjs_aux, gis_aux, targets_aux, anchors_aux = self.build_targets2( + inputs[:self.nl], targets_labels, anchors, batch_images) + bs, as_, gjs, gis, targets, anchors = self.build_targets( + inputs[:self.nl], targets_labels, anchors, batch_images) + pre_gen_gains_aux = [ + paddle.to_tensor(pp.shape, 'float32')[[3, 2, 3, 2]] + for pp in inputs[:self.nl] + ] + pre_gen_gains = [ + paddle.to_tensor(pp.shape, 'float32')[[3, 2, 3, 2]] + for pp in inputs[:self.nl] + ] + + # Losses + lcls, lbox = paddle.zeros([1]), paddle.zeros([1]) + lobj = paddle.zeros([1]) # single class will always be tensor([0.]) + for i in range(self.nl): + pi = inputs[i] + b, a, gj, gi = bs[i], as_[i], gjs[i], gis[i] + tobj = paddle.zeros_like(pi[..., 0]) + n = b.shape[0] # number of targets + if n: + ps = pi[b, a, gj, gi] # numpy index + if len(ps.shape) == 1: # Note: when only one sample + ps = ps.unsqueeze(0) + + # 
Regression + tensor_grid = paddle.to_tensor(np.stack([gi, gj], 1), 'float32') + tensor_anch = paddle.to_tensor(anchors[i], 'float32') + tensor_box = paddle.to_tensor(targets[i][:, 2:6], 'float32') + pxy = F.sigmoid(ps[:, :2]) * 2. - 0.5 + pwh = (F.sigmoid(ps[:, 2:4]) * 2)**2 * tensor_anch + pbox = paddle.concat([pxy, pwh], 1) # predicted box + selected_tbox = tensor_box * pre_gen_gains[i] + selected_tbox[:, :2] -= tensor_grid + iou = bbox_iou( + pbox.T, + selected_tbox.T, + x1y1x2y2=False, + ciou=True, + eps=1e-7) + lbox += (1.0 - iou).mean() + + # Objectness + score_iou = paddle.cast(iou.detach().clip(0), tobj.dtype) + with paddle.no_grad(): + # numpy index + tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * score_iou + + # Classification + selected_tcls = targets[i][:, 1].astype(np.int64) + if self.num_classes > 1: # cls loss (only if multiple classes) + t = paddle.full_like(ps[:, 5:], self.cls_neg_label) + t[range(n), selected_tcls] = self.cls_pos_label + lcls += self.BCEcls(ps[:, 5:], t) + + if self.use_aux: + pi_aux = inputs[i + self.nl] + b_aux, a_aux, gj_aux, gi_aux = bs_aux[i], as_aux_[i], gjs_aux[ + i], gis_aux[i] + tobj_aux = paddle.zeros_like(pi_aux[..., 0]) + + n_aux = b_aux.shape[0] # number of targets + if n_aux: + ps_aux = pi_aux[b_aux, a_aux, gj_aux, gi_aux] # numpy index + if len(ps_aux.shape) == 1: # Note: when only one sample + ps_aux = ps_aux.unsqueeze(0) + + # Regression + tensor_grid_aux = paddle.to_tensor( + np.stack([gi_aux, gj_aux], 1), 'float32') + tensor_anch_aux = paddle.to_tensor(anchors_aux[i], + 'float32') + tensor_box_aux = paddle.to_tensor(targets_aux[i][:, 2:6], + 'float32') + pxy_aux = F.sigmoid(ps_aux[:, :2]) * 2. - 0.5 + pwh_aux = (F.sigmoid(ps_aux[:, 2:4]) * + 2)**2 * tensor_anch_aux + pbox_aux = paddle.concat((pxy_aux, pwh_aux), 1) + selected_tbox_aux = tensor_box_aux * pre_gen_gains_aux[i] + selected_tbox_aux[:, :2] -= tensor_grid_aux + iou_aux = bbox_iou( + pbox_aux.T, + selected_tbox_aux.T, + x1y1x2y2=False, + ciou=True) + lbox += 0.25 * (1.0 - iou_aux).mean() + + # Objectness + score_iou_aux = paddle.cast(iou_aux.detach().clip(0), + tobj_aux.dtype) + with paddle.no_grad(): + tobj_aux[b_aux, a_aux, gj_aux, gi_aux] = ( + 1.0 - self.gr) + self.gr * score_iou_aux + + # Classification + selected_tcls_aux = targets_aux[i][:, 1].astype(np.int64) + if self.num_classes > 1: # cls loss (only if multiple classes) + t_aux = paddle.full_like(ps_aux[:, 5:], + self.cls_neg_label) + t_aux[range(n_aux), + selected_tcls_aux] = self.cls_pos_label + lcls += 0.25 * self.BCEcls(ps_aux[:, 5:], t_aux) + + obji = self.BCEobj(pi[:, :, :, :, 4], tobj) + lobj += obji * self.balance[i] # obj loss + if self.use_aux: + obji_aux = self.BCEobj(pi_aux[:, :, :, :, 4], tobj_aux) + lobj += 0.25 * obji_aux * self.balance[i] # obj_aux loss + + batch_size = head_outs[0].shape[0] + num_gpus = gt_targets.get('num_gpus', 8) + yolo_losses = dict() + yolo_losses['loss_box'] = lbox * self.loss_weights[ + 'box'] * batch_size * num_gpus + yolo_losses['loss_cls'] = lcls * self.loss_weights[ + 'cls'] * batch_size * num_gpus + yolo_losses['loss_obj'] = lobj * self.loss_weights[ + 'obj'] * batch_size * num_gpus + loss_all = yolo_losses['loss_box'] + yolo_losses[ + 'loss_obj'] + yolo_losses['loss_cls'] + yolo_losses['loss'] = loss_all + return yolo_losses + + def build_targets(self, p, targets, anchors, batch_images): + indices, anch = self.find_3_positive(p, targets, anchors) + # numpy indices,anch for fast assign + + matching_bs = [[] for pp in p] + matching_as = [[] for pp in p] + matching_gjs = 
[[] for pp in p] + matching_gis = [[] for pp in p] + matching_targets = [[] for pp in p] + matching_anchs = [[] for pp in p] + + nl = len(p) + for batch_idx in range(p[0].shape[0]): + b_idx = targets[:, 0] == batch_idx + if b_idx.sum() == 0: + continue + this_target = targets[b_idx] + txywh = this_target[:, 2:6] * batch_images[batch_idx].shape[1] + # this_target[:, 2:6] * 640 + txyxy = xywh2xyxy(paddle.to_tensor(txywh, 'float32')) # tensor op + + pxyxys, p_cls, p_obj = [], [], [] + from_which_layer = [] + all_b, all_a, all_gj, all_gi = [], [], [], [] + all_anch = [] + + empty_feats_num = 0 + for i, pi in enumerate(p): + b, a, gj, gi = indices[i] + idx = (b == batch_idx) + if idx.sum() == 0: + empty_feats_num += 1 + continue + b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx] + all_b.append(b) + all_a.append(a) + all_gj.append(gj) + all_gi.append(gi) + all_anch.append(anch[i][idx]) + from_which_layer.append(np.ones([len(b)]) * i) + + fg_pred = pi[b, a, gj, gi] # numpy index + if len(fg_pred.shape) == 1: # Note: when only one sample + fg_pred = fg_pred.unsqueeze(0) + p_obj.append(fg_pred[:, 4:5]) + p_cls.append(fg_pred[:, 5:]) + + tensor_grid = paddle.to_tensor(np.stack([gi, gj], 1), 'float32') + pxy = (F.sigmoid(fg_pred[:, :2]) * 2. - 0.5 + tensor_grid + ) * self.downsample_ratios[i] + tensor_anch = paddle.to_tensor(anch[i][idx], 'float32') + pwh = (F.sigmoid(fg_pred[:, 2:4]) * + 2)**2 * tensor_anch * self.downsample_ratios[i] + pxywh = paddle.concat([pxy, pwh], -1) + pxyxy = xywh2xyxy(pxywh) # tensor op + pxyxys.append(pxyxy) + + if empty_feats_num == len(p) or len(pxyxys) == 0: # Note: empty + continue + pxyxys = paddle.concat(pxyxys, 0) + + p_obj = paddle.concat(p_obj, 0) + p_cls = paddle.concat(p_cls, 0) + + from_which_layer = np.concatenate(from_which_layer, 0) + all_b = np.concatenate(all_b, 0) + all_a = np.concatenate(all_a, 0) + all_gj = np.concatenate(all_gj, 0) + all_gi = np.concatenate(all_gi, 0) + all_anch = np.concatenate(all_anch, 0) + + #pairwise_ious = box_iou(txyxy, pxyxys) # tensor op + _, h, w = batch_images[batch_idx].shape + pairwise_ious = box_iou_normalization(txyxy, pxyxys, h, + w) # tensor op + # [N, 4] [M, 4] to get [N, M] ious + + pairwise_iou_loss = -paddle.log(pairwise_ious + 1e-5) + + min_topk = 10 + topk_ious, _ = paddle.topk(pairwise_ious, + min(min_topk, pairwise_ious.shape[1]), 1) + dynamic_ks = paddle.clip(topk_ious.sum(1).cast('int'), min=1) + + gt_cls_per_image = (paddle.tile( + F.one_hot( + paddle.to_tensor(this_target[:, 1], 'int64'), + self.num_classes).unsqueeze(1), [1, pxyxys.shape[0], 1])) + + num_gt = this_target.shape[0] + cls_preds_ = ( + F.sigmoid(paddle.tile(p_cls.unsqueeze(0), [num_gt, 1, 1])) * + F.sigmoid(paddle.tile(p_obj.unsqueeze(0), [num_gt, 1, 1]))) + + y = cls_preds_.sqrt_() + pairwise_cls_loss = F.binary_cross_entropy_with_logits( + paddle.log(y / (1 - y) + 1e-5), + gt_cls_per_image, + reduction="none").sum(-1) + del cls_preds_ + + cost = (pairwise_cls_loss + 3.0 * pairwise_iou_loss) + + matching_matrix = np.zeros(cost.shape) + for gt_idx in range(num_gt): + _, pos_idx = paddle.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[gt_idx, pos_idx.numpy()] = 1.0 + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + cost_argmin = np.argmin(cost.numpy()[:, anchor_matching_gt > 1], + 0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = 
matching_matrix.sum(0) > 0.0 + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + + from_which_layer = from_which_layer[fg_mask_inboxes] + all_b = all_b[fg_mask_inboxes] + all_a = all_a[fg_mask_inboxes] + all_gj = all_gj[fg_mask_inboxes] + all_gi = all_gi[fg_mask_inboxes] + all_anch = all_anch[fg_mask_inboxes] + + this_target = this_target[matched_gt_inds] + + for i in range(nl): + layer_idx = from_which_layer == i + matching_bs[i].append(all_b[layer_idx]) + matching_as[i].append(all_a[layer_idx]) + matching_gjs[i].append(all_gj[layer_idx]) + matching_gis[i].append(all_gi[layer_idx]) + matching_targets[i].append( + this_target[layer_idx]) # this_ not all_ + matching_anchs[i].append(all_anch[layer_idx]) + + for i in range(nl): + if matching_targets[i] != []: + matching_bs[i] = np.concatenate(matching_bs[i], 0) + matching_as[i] = np.concatenate(matching_as[i], 0) + matching_gjs[i] = np.concatenate(matching_gjs[i], 0) + matching_gis[i] = np.concatenate(matching_gis[i], 0) + matching_targets[i] = np.concatenate(matching_targets[i], 0) + matching_anchs[i] = np.concatenate(matching_anchs[i], 0) + + return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs + + def find_3_positive(self, outputs, targets, all_anchors): + na, nt = self.na, targets.shape[0] # number of anchors, targets + indices, anch = [], [] + gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain + ai = np.tile(np.arange(na, dtype=np.float32).reshape(na, 1), [1, nt]) + targets_labels = np.concatenate((np.tile( + np.expand_dims(targets, 0), [na, 1, 1]), ai[:, :, None]), 2) + g = self.bias # 0.5 + + for i in range(len(all_anchors)): + anchors = np.array(all_anchors[i]) / self.downsample_ratios[i] + gain[2:6] = np.array( + outputs[i].shape, dtype=np.float32)[[3, 2, 3, 2]] # xyxy gain + + # Match targets_labels to anchors + t = targets_labels * gain + if nt: + # Matches + r = t[:, :, 4:6] / anchors[:, None] # wh ratio + j = np.maximum(r, 1. / r).max(2) < self.anchor_t + t = t[j] # filter + + # Offsets + gxy = t[:, 2:4] # grid xy + gxi = gain[[2, 3]] - gxy # inverse + j, k = ((gxy % 1. < g) & (gxy > 1.)).T + l, m = ((gxi % 1. 
< g) & (gxi > 1.)).T + j = np.stack([np.ones_like(j), j, k, l, m]) + t = np.tile(t, [5, 1, 1])[j] + offsets = (np.zeros_like(gxy)[None] + self.off[:, None])[j] + else: + t = targets_labels[0] + offsets = 0 + + # Define + b, c = t[:, :2].astype(np.int64).T + gxy = t[:, 2:4] # grid xy + gij = (gxy - offsets).astype(np.int64) + gi, gj = gij.T # grid xy indices + + # Append + a = t[:, 6].astype(np.int64) # anchor indices + gj, gi = gj.clip(0, gain[3] - 1).astype(np.int64), gi.clip( + 0, gain[2] - 1).astype(np.int64) + indices.append((b, a, gj, gi)) + anch.append(anchors[a]) # anchors + # return numpy rather than tensor + return indices, anch + + def build_targets2(self, p, targets, anchors, batch_images): + indices, anch = self.find_5_positive(p, targets, anchors) + # numpy indices,anch for fast assign + + matching_bs = [[] for pp in p] + matching_as = [[] for pp in p] + matching_gjs = [[] for pp in p] + matching_gis = [[] for pp in p] + matching_targets = [[] for pp in p] + matching_anchs = [[] for pp in p] + + nl = len(p) + for batch_idx in range(p[0].shape[0]): + b_idx = targets[:, 0] == batch_idx + if b_idx.sum() == 0: + continue + this_target = targets[b_idx] + txywh = this_target[:, 2:6] * batch_images[batch_idx].shape[1] + # this_target[:, 2:6] * 1280 + txyxy = xywh2xyxy(paddle.to_tensor(txywh, 'float32')) # tensor op + + pxyxys, p_cls, p_obj = [], [], [] + from_which_layer = [] + all_b, all_a, all_gj, all_gi = [], [], [], [] + all_anch = [] + + empty_feats_num = 0 + for i, pi in enumerate(p): + b, a, gj, gi = indices[i] + idx = (b == batch_idx) + if idx.sum() == 0: + empty_feats_num += 1 + continue + b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx] + all_b.append(b) + all_a.append(a) + all_gj.append(gj) + all_gi.append(gi) + all_anch.append(anch[i][idx]) + from_which_layer.append(np.ones([len(b)]) * i) + + fg_pred = pi[b, a, gj, gi] # numpy index + if len(fg_pred.shape) == 1: # Note: when only one sample + fg_pred = fg_pred.unsqueeze(0) + p_obj.append(fg_pred[:, 4:5]) + p_cls.append(fg_pred[:, 5:]) + + tensor_grid = paddle.to_tensor(np.stack([gi, gj], 1), 'float32') + pxy = (F.sigmoid(fg_pred[:, :2]) * 2. 
- 0.5 + tensor_grid + ) * self.downsample_ratios[i] + tensor_anch = paddle.to_tensor(anch[i][idx], 'float32') + pwh = (F.sigmoid(fg_pred[:, 2:4]) * + 2)**2 * tensor_anch * self.downsample_ratios[i] + pxywh = paddle.concat([pxy, pwh], -1) + pxyxy = xywh2xyxy(pxywh) # tensor op + pxyxys.append(pxyxy) + + if empty_feats_num == len(p) or len(pxyxys) == 0: # Note: empty + continue + pxyxys = paddle.concat(pxyxys, 0) + + p_obj = paddle.concat(p_obj, 0) + p_cls = paddle.concat(p_cls, 0) + + from_which_layer = np.concatenate(from_which_layer, 0) + all_b = np.concatenate(all_b, 0) + all_a = np.concatenate(all_a, 0) + all_gj = np.concatenate(all_gj, 0) + all_gi = np.concatenate(all_gi, 0) + all_anch = np.concatenate(all_anch, 0) + + pairwise_ious = box_iou(txyxy, pxyxys) # tensor op + # [N, 4] [M, 4] to get [N, M] ious + + pairwise_iou_loss = -paddle.log(pairwise_ious + 1e-8) + + min_topk = 20 # diff, 10 in build_targets() + topk_ious, _ = paddle.topk(pairwise_ious, + min(min_topk, pairwise_ious.shape[1]), 1) + dynamic_ks = paddle.clip(topk_ious.sum(1).cast('int'), min=1) + + gt_cls_per_image = (paddle.tile( + F.one_hot( + paddle.to_tensor(this_target[:, 1], 'int64'), + self.num_classes).unsqueeze(1), [1, pxyxys.shape[0], 1])) + + num_gt = this_target.shape[0] + cls_preds_ = ( + F.sigmoid(paddle.tile(p_cls.unsqueeze(0), [num_gt, 1, 1])) * + F.sigmoid(paddle.tile(p_obj.unsqueeze(0), [num_gt, 1, 1]))) + + y = cls_preds_.sqrt_() + pairwise_cls_loss = F.binary_cross_entropy_with_logits( + paddle.log(y / (1 - y) + 1e-5), + gt_cls_per_image, + reduction="none").sum(-1) + del cls_preds_ + + cost = (pairwise_cls_loss + 3.0 * pairwise_iou_loss) + + matching_matrix = np.zeros(cost.shape) + for gt_idx in range(num_gt): + _, pos_idx = paddle.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[gt_idx, pos_idx.numpy()] = 1.0 + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + cost_argmin = np.argmin(cost.numpy()[:, anchor_matching_gt > 1], + 0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + + from_which_layer = from_which_layer[fg_mask_inboxes] + all_b = all_b[fg_mask_inboxes] + all_a = all_a[fg_mask_inboxes] + all_gj = all_gj[fg_mask_inboxes] + all_gi = all_gi[fg_mask_inboxes] + all_anch = all_anch[fg_mask_inboxes] + + this_target = this_target[matched_gt_inds] + + for i in range(nl): + layer_idx = from_which_layer == i + matching_bs[i].append(all_b[layer_idx]) + matching_as[i].append(all_a[layer_idx]) + matching_gjs[i].append(all_gj[layer_idx]) + matching_gis[i].append(all_gi[layer_idx]) + matching_targets[i].append( + this_target[layer_idx]) # this_ not all_ + matching_anchs[i].append(all_anch[layer_idx]) + + for i in range(nl): + if matching_targets[i] != []: + matching_bs[i] = np.concatenate(matching_bs[i], 0) + matching_as[i] = np.concatenate(matching_as[i], 0) + matching_gjs[i] = np.concatenate(matching_gjs[i], 0) + matching_gis[i] = np.concatenate(matching_gis[i], 0) + matching_targets[i] = np.concatenate(matching_targets[i], 0) + matching_anchs[i] = np.concatenate(matching_anchs[i], 0) + + return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs + + def find_5_positive(self, outputs, targets, all_anchors): + na, nt = self.na, targets.shape[0] # number of anchors, targets + indices, 
anch = [], [] + gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain + ai = np.tile(np.arange(na, dtype=np.float32).reshape(na, 1), [1, nt]) + targets_labels = np.concatenate((np.tile( + np.expand_dims(targets, 0), [na, 1, 1]), ai[:, :, None]), 2) + g = 1.0 # Note: diff, not self.bias(0.5) in find_3_positive() + + for i in range(len(all_anchors)): + anchors = np.array(all_anchors[i]) / self.downsample_ratios[i] + gain[2:6] = np.array( + outputs[i].shape, dtype=np.float32)[[3, 2, 3, 2]] # xyxy gain + + # Match targets_labels to anchors + t = targets_labels * gain + if nt: + # Matches + r = t[:, :, 4:6] / anchors[:, None] # wh ratio + j = np.maximum(r, 1. / r).max(2) < self.anchor_t + t = t[j] # filter + + # Offsets + gxy = t[:, 2:4] # grid xy + gxi = gain[[2, 3]] - gxy # inverse + j, k = ((gxy % 1. < g) & (gxy > 1.)).T + l, m = ((gxi % 1. < g) & (gxi > 1.)).T + j = np.stack([np.ones_like(j), j, k, l, m]) + t = np.tile(t, [5, 1, 1])[j] + offsets = (np.zeros_like(gxy)[None] + self.off[:, None])[j] + else: + t = targets_labels[0] + offsets = 0 + + # Define + b, c = t[:, :2].astype(np.int64).T + gxy = t[:, 2:4] # grid xy + gij = (gxy - offsets).astype(np.int64) + gi, gj = gij.T # grid xy indices + + # Append + a = t[:, 6].astype(np.int64) # anchor indices + gj, gi = gj.clip(0, gain[3] - 1).astype(np.int64), gi.clip( + 0, gain[2] - 1).astype(np.int64) + indices.append((b, a, gj, gi)) + anch.append(anchors[a]) # anchors + # return numpy rather than tensor + return indices, anch + + +def xywh2xyxy(x): + """ + [x, y, w, h] to [x1, y1, x2, y2], paddle Tensor op + """ + y = x.clone() + y[:, 0] = x[:, 0] - x[:, 2] / 2 + y[:, 1] = x[:, 1] - x[:, 3] / 2 + y[:, 2] = x[:, 0] + x[:, 2] / 2 + y[:, 3] = x[:, 1] + x[:, 3] / 2 + return y + + +def box_iou(box1, box2): + """ + [N, 4] [M, 4] to get [N, M] ious, boxes in [x1, y1, x2, y2] format. paddle Tensor op + """ + + def box_area(box): + return (box[2] - box[0]) * (box[3] - box[1]) + + area1 = box_area(box1.T) + area2 = box_area(box2.T) + inter = (paddle.minimum(box1[:, None, 2:], box2[:, 2:]) - paddle.maximum( + box1[:, None, :2], box2[:, :2])).clip(0).prod(2) + return inter / (area1[:, None] + area2 - inter) + + +def box_iou_normalization(box1, box2, h, w): + """ + [N, 4] [M, 4] to get [N, M] ious, boxes in [x1, y1, x2, y2] format. paddle Tensor op + """ + + def box_area(box): + return (box[2] - box[0]) / h * (box[3] - box[1]) / w + + area1 = box_area(box1.T) + area2 = box_area(box2.T) + + xy_max = paddle.minimum(paddle.unsqueeze(box1, 1)[:, :, 2:], box2[:, 2:]) + xy_min = paddle.maximum(paddle.unsqueeze(box1, 1)[:, :, :2], box2[:, :2]) + width_height = xy_max - xy_min + + width_height = width_height.clip(min=0) + width_height[:, :, 0] = width_height[:, :, 0] / h + width_height[:, :, 1] = width_height[:, :, 1] / w + inter = width_height.prod(2) + + return inter / (area1[:, None] + area2 - inter) diff --git a/ppdet/modeling/necks/__init__.py b/ppdet/modeling/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a67c94617f2671653c51894bedbdd370b9c9d6a --- /dev/null +++ b/ppdet/modeling/necks/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
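A minimal usage sketch of the xywh2xyxy and box_iou helpers defined above in yolov7_loss.py (an illustration assuming paddlepaddle is installed and this repository is importable; all box values are made up):

import paddle
from ppdet.modeling.losses.yolov7_loss import xywh2xyxy, box_iou

# two GT boxes and three predictions in [cx, cy, w, h] pixel units (dummy values)
gt_xywh = paddle.to_tensor([[50., 50., 20., 20.], [80., 80., 10., 10.]])
pd_xywh = paddle.to_tensor([[50., 50., 20., 20.], [55., 55., 20., 20.], [10., 10., 5., 5.]])

gt_xyxy = xywh2xyxy(gt_xywh)          # convert to [x1, y1, x2, y2]
pd_xyxy = xywh2xyxy(pd_xywh)
ious = box_iou(gt_xyxy, pd_xyxy)      # pairwise IoU matrix, shape [2, 3]
print(ious.shape, float(ious[0, 0]))  # [2, 3] 1.0 for the identical pair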
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import fpn +from . import yolo_fpn +from . import custom_pan +from . import yolov6_pafpn +from . import yolov7_pafpn +from . import rtmdet_pafpn +from . import yolov8_pafpn +from . import dilated_encoder + +from .fpn import * +from .yolo_fpn import * +from .custom_pan import * +from .yolov6_pafpn import * +from .yolov7_pafpn import * +from .rtmdet_pafpn import * +from .yolov8_pafpn import * +from .dilated_encoder import * diff --git a/ppdet/modeling/necks/custom_pan.py b/ppdet/modeling/necks/custom_pan.py new file mode 100644 index 0000000000000000000000000000000000000000..23fbb7cde680dea2ba3a8909a953042e80008fdc --- /dev/null +++ b/ppdet/modeling/necks/custom_pan.py @@ -0,0 +1,397 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling.layers import DropBlock, MultiHeadAttention +from ppdet.modeling.ops import get_act_fn +from ..backbones.cspresnet import ConvBNLayer, BasicBlock +from ..shape_spec import ShapeSpec +from ..initializer import linear_init_ + +__all__ = ['CustomCSPPAN'] + + +def _get_clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +class SPP(nn.Layer): + def __init__(self, + ch_in, + ch_out, + k, + pool_size, + act='swish', + data_format='NCHW'): + super(SPP, self).__init__() + self.pool = [] + self.data_format = data_format + for i, size in enumerate(pool_size): + pool = self.add_sublayer( + 'pool{}'.format(i), + nn.MaxPool2D( + kernel_size=size, + stride=1, + padding=size // 2, + data_format=data_format, + ceil_mode=False)) + self.pool.append(pool) + self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) + + def forward(self, x): + outs = [x] + for pool in self.pool: + outs.append(pool(x)) + if self.data_format == 'NCHW': + y = paddle.concat(outs, axis=1) + else: + y = paddle.concat(outs, axis=-1) + + y = self.conv(y) + return y + + +class CSPStage(nn.Layer): + def __init__(self, + block_fn, + ch_in, + ch_out, + n, + act='swish', + spp=False, + use_alpha=False): + super(CSPStage, self).__init__() + + ch_mid = int(ch_out // 2) + self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.convs = nn.Sequential() + next_ch_in = ch_mid + for i in range(n): + self.convs.add_sublayer( + str(i), + eval(block_fn)(next_ch_in, + ch_mid, + act=act, + shortcut=False, + use_alpha=use_alpha)) + if i == (n - 1) // 2 and spp: + 
self.convs.add_sublayer( + 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) + next_ch_in = ch_mid + self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) + + def forward(self, x): + y1 = self.conv1(x) + y2 = self.conv2(x) + y2 = self.convs(y2) + y = paddle.concat([y1, y2], axis=1) + y = self.conv3(y) + return y + + +class TransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None): + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +@register +@serializable +class CustomCSPPAN(nn.Layer): + __shared__ = [ + 'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt', + 'eval_size' + ] + + def __init__(self, + in_channels=[256, 512, 1024], + out_channels=[1024, 512, 256], + norm_type='bn', + act='leaky', + stage_fn='CSPStage', + block_fn='BasicBlock', + stage_num=1, + block_num=3, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False, + data_format='NCHW', + width_mult=1.0, + depth_mult=1.0, + use_alpha=False, + trt=False, + dim_feedforward=2048, + dropout=0.1, + activation='gelu', + nhead=4, + num_layers=4, + attn_dropout=None, + act_dropout=None, + normalize_before=False, + use_trans=False, + eval_size=None): + + super(CustomCSPPAN, self).__init__() + out_channels = [max(round(c * width_mult), 1) for c in out_channels] + block_num = max(round(block_num * depth_mult), 1) + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, 
dict)) else act + self.num_blocks = len(in_channels) + self.data_format = data_format + self._out_channels = out_channels + + self.hidden_dim = in_channels[-1] + in_channels = in_channels[::-1] + + self.use_trans = use_trans + self.eval_size = eval_size + if use_trans: + if eval_size is not None: + self.pos_embed = self.build_2d_sincos_position_embedding( + eval_size[1] // 32, + eval_size[0] // 32, + embed_dim=self.hidden_dim) + else: + self.pos_embed = None + + encoder_layer = TransformerEncoderLayer( + self.hidden_dim, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + encoder_norm = nn.LayerNorm( + self.hidden_dim) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_layers, + encoder_norm) + + fpn_stages = [] + fpn_routes = [] + for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): + if i > 0: + ch_in += ch_pre // 2 + + stage = nn.Sequential() + for j in range(stage_num): + stage.add_sublayer( + str(j), + eval(stage_fn)(block_fn, + ch_in if j == 0 else ch_out, + ch_out, + block_num, + act=act, + spp=(spp and i == 0), + use_alpha=use_alpha)) + + if drop_block: + stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) + + fpn_stages.append(stage) + + if i < self.num_blocks - 1: + fpn_routes.append( + ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out // 2, + filter_size=1, + stride=1, + padding=0, + act=act)) + + ch_pre = ch_out + + self.fpn_stages = nn.LayerList(fpn_stages) + self.fpn_routes = nn.LayerList(fpn_routes) + + pan_stages = [] + pan_routes = [] + for i in reversed(range(self.num_blocks - 1)): + pan_routes.append( + ConvBNLayer( + ch_in=out_channels[i + 1], + ch_out=out_channels[i + 1], + filter_size=3, + stride=2, + padding=1, + act=act)) + + ch_in = out_channels[i] + out_channels[i + 1] + ch_out = out_channels[i] + stage = nn.Sequential() + for j in range(stage_num): + stage.add_sublayer( + str(j), + eval(stage_fn)(block_fn, + ch_in if j == 0 else ch_out, + ch_out, + block_num, + act=act, + spp=False, + use_alpha=use_alpha)) + if drop_block: + stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) + + pan_stages.append(stage) + + self.pan_stages = nn.LayerList(pan_stages[::-1]) + self.pan_routes = nn.LayerList(pan_routes[::-1]) + + def build_2d_sincos_position_embedding( + self, + w, + h, + embed_dim=1024, + temperature=10000., ): + grid_w = paddle.arange(int(w), dtype=paddle.float32) + grid_h = paddle.arange(int(h), dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + + out_w = grid_w.flatten()[..., None] @omega[None] + out_h = grid_h.flatten()[..., None] @omega[None] + + pos_emb = paddle.concat( + [ + paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), + paddle.cos(out_h) + ], + axis=1)[None, :, :] + + return pos_emb + + def forward(self, blocks, for_mot=False): + if self.use_trans: + last_feat = blocks[-1] + n, c, h, w = last_feat.shape + + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = last_feat.flatten(2).transpose([0, 2, 1]) + if self.eval_size is not None and not self.training: + pos_embed = self.pos_embed + else: + pos_embed = self.build_2d_sincos_position_embedding( + w=w, h=h, embed_dim=self.hidden_dim) + + memory = self.encoder(src_flatten, pos_embed=pos_embed) + last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w]) + blocks[-1] = last_feat_encode + + blocks = blocks[::-1] + fpn_feats = [] + + for i, block in enumerate(blocks): + if i > 0: + block = paddle.concat([route, block], axis=1) + route = self.fpn_stages[i](block) + fpn_feats.append(route) + + if i < self.num_blocks - 1: + route = self.fpn_routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + pan_feats = [fpn_feats[-1], ] + route = fpn_feats[-1] + for i in reversed(range(self.num_blocks - 1)): + block = fpn_feats[i] + route = self.pan_routes[i](route) + block = paddle.concat([route, block], axis=1) + route = self.pan_stages[i](block) + pan_feats.append(route) + + return pan_feats[::-1] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/necks/dilated_encoder.py b/ppdet/modeling/necks/dilated_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbc7fd1bc895933c4a6175dfa74d3b3d95071b3 --- /dev/null +++ b/ppdet/modeling/necks/dilated_encoder.py @@ -0,0 +1,150 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
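A note on build_2d_sincos_position_embedding in CustomCSPPAN above: for a w x h feature grid it returns a fixed [1, w*h, embed_dim] tensor whose four quarters hold sin/cos of the two grid coordinates at geometrically spaced frequencies. The following NumPy re-statement of the same computation is kept only to make the shapes concrete (the sizes are illustrative assumptions):

import numpy as np

def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.):
    # mirrors CustomCSPPAN.build_2d_sincos_position_embedding; shapes are the point here
    assert embed_dim % 4 == 0
    grid_w, grid_h = np.meshgrid(np.arange(w, dtype=np.float32),
                                 np.arange(h, dtype=np.float32), indexing='ij')
    pos_dim = embed_dim // 4
    omega = 1. / (temperature ** (np.arange(pos_dim, dtype=np.float32) / pos_dim))
    out_w = grid_w.flatten()[:, None] @ omega[None]   # [w*h, pos_dim]
    out_h = grid_h.flatten()[:, None] @ omega[None]
    return np.concatenate([np.sin(out_w), np.cos(out_w),
                           np.sin(out_h), np.cos(out_h)], axis=1)[None]

print(sincos_pos_embed_2d(20, 20, embed_dim=256).shape)  # (1, 400, 256)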
+ +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingUniform, Constant, Normal +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['DilatedEncoder'] + + +class Bottleneck(nn.Layer): + def __init__(self, in_channels, mid_channels, dilation): + super(Bottleneck, self).__init__() + self.conv1 = nn.Sequential(* [ + nn.Conv2D( + in_channels, + mid_channels, + 1, + padding=0, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(0.0))), + nn.BatchNorm2D( + mid_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + nn.ReLU(), + ]) + self.conv2 = nn.Sequential(* [ + nn.Conv2D( + mid_channels, + mid_channels, + 3, + padding=dilation, + dilation=dilation, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(0.0))), + nn.BatchNorm2D( + mid_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + nn.ReLU(), + ]) + self.conv3 = nn.Sequential(* [ + nn.Conv2D( + mid_channels, + in_channels, + 1, + padding=0, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=0.01)), + bias_attr=ParamAttr(initializer=Constant(0.0))), + nn.BatchNorm2D( + in_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))), + nn.ReLU(), + ]) + + def forward(self, x): + identity = x + y = self.conv3(self.conv2(self.conv1(x))) + return y + identity + + +@register +class DilatedEncoder(nn.Layer): + """ + DilatedEncoder used in YOLOF + """ + + def __init__(self, + in_channels=[2048], + out_channels=[512], + block_mid_channels=128, + num_residual_blocks=4, + block_dilations=[2, 4, 6, 8]): + super(DilatedEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + assert len(self.in_channels) == 1, "YOLOF only has one level feature." + assert len(self.out_channels) == 1, "YOLOF only has one level feature." 
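For reference, a minimal usage sketch of the complete DilatedEncoder assembled in this file (an illustration assuming paddlepaddle and this repository are importable; the feature-map size is a dummy value): the single C5 level goes in as a one-element list and one same-resolution map with out_channels[0] channels comes out.

import paddle
from ppdet.modeling.necks.dilated_encoder import DilatedEncoder

neck = DilatedEncoder(in_channels=[2048], out_channels=[512],
                      block_mid_channels=128, num_residual_blocks=4,
                      block_dilations=[2, 4, 6, 8])
c5 = paddle.randn([1, 2048, 20, 20])   # dummy C5 feature at stride 32
outs = neck([c5])                      # forward expects a list with one level
print(len(outs), outs[0].shape)        # 1 [1, 512, 20, 20]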
+ + self.block_mid_channels = block_mid_channels + self.num_residual_blocks = num_residual_blocks + self.block_dilations = block_dilations + + out_ch = self.out_channels[0] + self.lateral_conv = nn.Conv2D( + self.in_channels[0], + out_ch, + 1, + weight_attr=ParamAttr(initializer=KaimingUniform( + negative_slope=1, nonlinearity='leaky_relu')), + bias_attr=ParamAttr(initializer=Constant(value=0.0))) + self.lateral_norm = nn.BatchNorm2D( + out_ch, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + self.fpn_conv = nn.Conv2D( + out_ch, + out_ch, + 3, + padding=1, + weight_attr=ParamAttr(initializer=KaimingUniform( + negative_slope=1, nonlinearity='leaky_relu'))) + self.fpn_norm = nn.BatchNorm2D( + out_ch, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + encoder_blocks = [] + for i in range(self.num_residual_blocks): + encoder_blocks.append( + Bottleneck( + out_ch, + self.block_mid_channels, + dilation=block_dilations[i])) + self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) + + def forward(self, inputs, for_mot=False): + out = self.lateral_norm(self.lateral_conv(inputs[0])) + out = self.fpn_norm(self.fpn_conv(out)) + out = self.dilated_encoder_blocks(out) + return [out] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self.out_channels] diff --git a/ppdet/modeling/necks/fpn.py b/ppdet/modeling/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..d08ca415c7acce7d4495dae5834d53961ea9df57 --- /dev/null +++ b/ppdet/modeling/necks/fpn.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import XavierUniform + +from ppdet.core.workspace import register, serializable +from ppdet.modeling.layers import ConvNormLayer +from ..shape_spec import ShapeSpec + +__all__ = ['FPN'] + + +@register +@serializable +class FPN(nn.Layer): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channel (int): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. 
If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. + default False + + """ + + def __init__(self, + in_channels, + out_channel, + spatial_scales=[0.25, 0.125, 0.0625, 0.03125], + has_extra_convs=False, + extra_stage=1, + use_c5=True, + norm_type=None, + norm_decay=0., + freeze_norm=False, + relu_before_extra_convs=True): + super(FPN, self).__init__() + self.out_channel = out_channel + for s in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] + self.spatial_scales = spatial_scales + self.has_extra_convs = has_extra_convs + self.extra_stage = extra_stage + self.use_c5 = use_c5 + self.relu_before_extra_convs = relu_before_extra_convs + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + + self.lateral_convs = [] + self.fpn_convs = [] + fan = out_channel * 3 * 3 + + # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone + # 0 <= st_stage < ed_stage <= 3 + st_stage = 4 - len(in_channels) + ed_stage = st_stage + len(in_channels) - 1 + for i in range(st_stage, ed_stage + 1): + if i == 3: + lateral_name = 'fpn_inner_res5_sum' + else: + lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) + in_c = in_channels[i - st_stage] + if self.norm_type is not None: + lateral = self.add_sublayer( + lateral_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channel, + filter_size=1, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=in_c))) + else: + lateral = self.add_sublayer( + lateral_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channel, + kernel_size=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=in_c)))) + self.lateral_convs.append(lateral) + + fpn_name = 'fpn_res{}_sum'.format(i + 2) + if self.norm_type is not None: + fpn_conv = self.add_sublayer( + fpn_name, + ConvNormLayer( + ch_in=out_channel, + ch_out=out_channel, + filter_size=3, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + fpn_conv = self.add_sublayer( + fpn_name, + nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(fpn_conv) + + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + if self.has_extra_convs: + for i in range(self.extra_stage): + lvl = ed_stage + 1 + i + if i == 0 and self.use_c5: + in_c = in_channels[-1] + else: + in_c = out_channel + extra_fpn_name = 'fpn_{}'.format(lvl + 2) + if self.norm_type is not None: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channel, + filter_size=3, + stride=2, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channel, + kernel_size=3, + stride=2, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(extra_fpn_conv) + + @classmethod + def 
from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + def forward(self, body_feats): + laterals = [] + num_levels = len(body_feats) + for i in range(num_levels): + laterals.append(self.lateral_convs[i](body_feats[i])) + + for i in range(1, num_levels): + lvl = num_levels - i + upsample = F.interpolate( + laterals[lvl], + scale_factor=2., + mode='nearest', ) + laterals[lvl - 1] += upsample + + fpn_output = [] + for lvl in range(num_levels): + fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) + + if self.extra_stage > 0: + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not self.has_extra_convs: + assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' + fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + else: + if self.use_c5: + extra_source = body_feats[-1] + else: + extra_source = fpn_output[-1] + fpn_output.append(self.fpn_convs[num_levels](extra_source)) + + for i in range(1, self.extra_stage): + if self.relu_before_extra_convs: + fpn_output.append(self.fpn_convs[num_levels + i](F.relu( + fpn_output[-1]))) + else: + fpn_output.append(self.fpn_convs[num_levels + i]( + fpn_output[-1])) + return fpn_output + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.out_channel, stride=1. / s) + for s in self.spatial_scales + ] diff --git a/ppdet/modeling/necks/rtmdet_pafpn.py b/ppdet/modeling/necks/rtmdet_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..3fb427e749b49f00c5349c75e00e582a6a7573df --- /dev/null +++ b/ppdet/modeling/necks/rtmdet_pafpn.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ..backbones.cspnext import CSPNeXtLayer +from ..backbones.csp_darknet import BaseConv, DWConv +from ..shape_spec import ShapeSpec + +__all__ = ['CSPNeXtPAFPN'] + + +@register +@serializable +class CSPNeXtPAFPN(nn.Layer): + """ + CSPNeXtPAFPN of RTMDet. 
+ """ + __shared__ = ['depth_mult', 'width_mult', 'data_format', 'act', 'trt'] + + def __init__(self, + depth_mult=1.0, + width_mult=1.0, + in_channels=[256, 512, 1024], + out_channels=256, + depthwise=False, + data_format='NCHW', + act='silu', + trt=False): + super(CSPNeXtPAFPN, self).__init__() + self.in_channels = in_channels + self._out_channels = [ + int(out_channels * width_mult) for _ in range(len(in_channels)) + ] + Conv = DWConv if depthwise else BaseConv + + self.data_format = data_format + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + # top-down fpn + self.lateral_convs = nn.LayerList() + self.fpn_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + BaseConv( + int(in_channels[idx]), + int(in_channels[idx - 1]), + 1, + 1, + act=act)) + self.fpn_blocks.append( + CSPNeXtLayer( + int(in_channels[idx - 1] * 2), + int(in_channels[idx - 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + # bottom-up pan + self.downsample_convs = nn.LayerList() + self.pan_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsample_convs.append( + Conv( + int(in_channels[idx]), + int(in_channels[idx]), + 3, + stride=2, + act=act)) + self.pan_blocks.append( + CSPNeXtLayer( + int(in_channels[idx] * 2), + int(in_channels[idx + 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + # CSPNeXtPAFPN new added + self.out_convs = nn.LayerList() + for in_ch, out_ch in zip(self.in_channels, self._out_channels): + self.out_convs.append(Conv(in_ch, out_ch, 3, 1, act=act)) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + + # top-down fpn + inner_outs = [feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = F.interpolate( + feat_heigh, + scale_factor=2., + mode="nearest", + data_format=self.data_format) + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat( + [upsample_feat, feat_low], axis=1)) + inner_outs.insert(0, inner_out) + + # bottom-up pan + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](paddle.concat( + [downsample_feat, feat_height], axis=1)) + outs.append(out) + # [4, 96, 80, 80] [4, 192, 40, 40] [4, 384, 20, 20] + + # out convs + for i, conv in enumerate(self.out_convs): + outs[i] = conv(outs[i]) + + return outs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/necks/yolo_fpn.py b/ppdet/modeling/necks/yolo_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..40ebe4dba1160a9969f75630d5e6e7cd58050792 --- /dev/null +++ b/ppdet/modeling/necks/yolo_fpn.py @@ -0,0 +1,1105 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
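A quick shape check of the CSPNeXtPAFPN defined above (a sketch assuming paddlepaddle and this repository are importable; the input sizes are illustrative): three backbone levels at strides 8/16/32 come back at the same strides, each with the shared output channel count after the final out_convs.

import paddle
from ppdet.modeling.necks.rtmdet_pafpn import CSPNeXtPAFPN

neck = CSPNeXtPAFPN(depth_mult=1.0, width_mult=1.0,
                    in_channels=[256, 512, 1024], out_channels=256)
feats = [paddle.randn([1, 256, 80, 80]),    # stride 8
         paddle.randn([1, 512, 40, 40]),    # stride 16
         paddle.randn([1, 1024, 20, 20])]   # stride 32
outs = neck(feats)
print([tuple(o.shape) for o in outs])
# roughly [(1, 256, 80, 80), (1, 256, 40, 40), (1, 256, 20, 20)] with these settings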
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling.layers import DropBlock +from ppdet.modeling.ops import get_act_fn +from ..backbones.darknet import ConvBNLayer +from ..shape_spec import ShapeSpec +from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer + +__all__ = [ + 'YOLOv3FPN', + 'PPYOLOFPN', + 'PPYOLOTinyFPN', + 'PPYOLOPAN', + 'YOLOCSPPAN', +] + +# PP-YOLO use 'PPYOLOFPN' and 'PPYOLOTinyFPN', PP-YOLOv2 use 'PPYOLOPAN' +# YOLOX and YOLOv5 use the same 'YOLOCSPPAN' + + +def add_coord(x, data_format): + b = paddle.shape(x)[0] + if data_format == 'NCHW': + h, w = x.shape[2], x.shape[3] + else: + h, w = x.shape[1], x.shape[2] + + gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype) + gy = paddle.cast(paddle.arange(h) / ((h - 1.) * 2.0) - 1., x.dtype) + + if data_format == 'NCHW': + gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) + gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) + else: + gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) + gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) + + gx.stop_gradient = True + gy.stop_gradient = True + return gx, gy + + +class YoloDetBlock(nn.Layer): + def __init__(self, + ch_in, + channel, + norm_type, + freeze_norm=False, + name='', + data_format='NCHW'): + """ + YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 + + Args: + ch_in (int): input channel + channel (int): base channel + norm_type (str): batch norm type + freeze_norm (bool): whether to freeze norm, default False + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(YoloDetBlock, self).__init__() + self.ch_in = ch_in + self.channel = channel + assert channel % 2 == 0, \ + "channel {} cannot be divided by 2".format(channel) + conv_def = [ + ['conv0', ch_in, channel, 1, '.0.0'], + ['conv1', channel, channel * 2, 3, '.0.1'], + ['conv2', channel * 2, channel, 1, '.1.0'], + ['conv3', channel, channel * 2, 3, '.1.1'], + ['route', channel * 2, channel, 1, '.2'], + ] + + self.conv_module = nn.Sequential() + for idx, (conv_name, ch_in, ch_out, filter_size, + post_name) in enumerate(conv_def): + self.conv_module.add_sublayer( + conv_name, + ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + padding=(filter_size - 1) // 2, + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name + post_name)) + + self.tip = ConvBNLayer( + ch_in=channel, + ch_out=channel * 2, + filter_size=3, + padding=1, + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name + '.tip') + + def forward(self, inputs): + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class SPP(nn.Layer): + def __init__(self, + ch_in, + ch_out, + k, + pool_size, + norm_type='bn', + freeze_norm=False, + name='', + act='leaky', + data_format='NCHW'): + """ + SPP layer, which consist of four pooling layer follwed by conv layer + + Args: + ch_in (int): input channel of conv layer + ch_out (int): output channel of conv layer + k (int): kernel size 
of conv layer + norm_type (str): batch norm type + freeze_norm (bool): whether to freeze norm, default False + name (str): layer name + act (str): activation function + data_format (str): data format, NCHW or NHWC + """ + super(SPP, self).__init__() + self.pool = [] + self.data_format = data_format + for size in pool_size: + pool = self.add_sublayer( + '{}.pool1'.format(name), + nn.MaxPool2D( + kernel_size=size, + stride=1, + padding=size // 2, + data_format=data_format, + ceil_mode=False)) + self.pool.append(pool) + self.conv = ConvBNLayer( + ch_in, + ch_out, + k, + padding=k // 2, + norm_type=norm_type, + freeze_norm=freeze_norm, + name=name, + act=act, + data_format=data_format) + + def forward(self, x): + outs = [x] + for pool in self.pool: + outs.append(pool(x)) + if self.data_format == "NCHW": + y = paddle.concat(outs, axis=1) + else: + y = paddle.concat(outs, axis=-1) + + y = self.conv(y) + return y + + +class CoordConv(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + padding, + norm_type, + freeze_norm=False, + name='', + data_format='NCHW'): + """ + CoordConv layer, see https://arxiv.org/abs/1807.03247 + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + padding (int): padding size, default 0 + norm_type (str): batch norm type, default bn + name (str): layer name + data_format (str): data format, NCHW or NHWC + + """ + super(CoordConv, self).__init__() + self.conv = ConvBNLayer( + ch_in + 2, + ch_out, + filter_size=filter_size, + padding=padding, + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name) + self.data_format = data_format + + def forward(self, x): + gx, gy = add_coord(x, self.data_format) + if self.data_format == 'NCHW': + y = paddle.concat([x, gx, gy], axis=1) + else: + y = paddle.concat([x, gx, gy], axis=-1) + y = self.conv(y) + return y + + +class PPYOLODetBlock(nn.Layer): + def __init__(self, cfg, name, data_format='NCHW'): + """ + PPYOLODetBlock layer + + Args: + cfg (list): layer configs for this block + name (str): block name + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLODetBlock, self).__init__() + self.conv_module = nn.Sequential() + for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): + kwargs.update( + name='{}.{}'.format(name, conv_name), data_format=data_format) + self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) + + conv_name, layer, args, kwargs = cfg[-1] + kwargs.update( + name='{}.{}'.format(name, conv_name), data_format=data_format) + self.tip = layer(*args, **kwargs) + + def forward(self, inputs): + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class PPYOLOTinyDetBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + name, + drop_block=False, + block_size=3, + keep_prob=0.9, + data_format='NCHW'): + """ + PPYOLO Tiny DetBlock layer + Args: + ch_in (list): input channel number + ch_out (list): output channel number + name (str): block name + drop_block: whether user DropBlock + block_size: drop block size + keep_prob: probability to keep block in DropBlock + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLOTinyDetBlock, self).__init__() + self.drop_block_ = drop_block + self.conv_module = nn.Sequential() + + cfgs = [ + # name, in channels, out channels, filter_size, + # stride, padding, groups + ['.0', ch_in, ch_out, 1, 1, 0, 1], + ['.1', ch_out, ch_out, 5, 1, 2, ch_out], + ['.2', ch_out, ch_out, 1, 1, 0, 1], + ['.route', ch_out, 
ch_out, 5, 1, 2, ch_out], + ] + for cfg in cfgs: + conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ + groups = cfg + self.conv_module.add_sublayer( + name + conv_name, + ConvBNLayer( + ch_in=conv_ch_in, + ch_out=conv_ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + name=name + conv_name)) + + self.tip = ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0, + groups=1, + name=name + conv_name) + + if self.drop_block_: + self.drop_block = DropBlock( + block_size=block_size, + keep_prob=keep_prob, + data_format=data_format, + name=name + '.dropblock') + + def forward(self, inputs): + if self.drop_block_: + inputs = self.drop_block(inputs) + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class PPYOLODetBlockCSP(nn.Layer): + def __init__(self, + cfg, + ch_in, + ch_out, + act, + norm_type, + name, + data_format='NCHW'): + """ + PPYOLODetBlockCSP layer + + Args: + cfg (list): layer configs for this block + ch_in (int): input channel + ch_out (int): output channel + act (str): default mish + name (str): block name + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLODetBlockCSP, self).__init__() + self.data_format = data_format + self.conv1 = ConvBNLayer( + ch_in, + ch_out, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name + '.left', + data_format=data_format) + self.conv2 = ConvBNLayer( + ch_in, + ch_out, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name + '.right', + data_format=data_format) + self.conv3 = ConvBNLayer( + ch_out * 2, + ch_out * 2, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name, + data_format=data_format) + self.conv_module = nn.Sequential() + for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): + kwargs.update(name=name + layer_name, data_format=data_format) + self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) + + def forward(self, inputs): + conv_left = self.conv1(inputs) + conv_right = self.conv2(inputs) + conv_left = self.conv_module(conv_left) + if self.data_format == 'NCHW': + conv = paddle.concat([conv_left, conv_right], axis=1) + else: + conv = paddle.concat([conv_left, conv_right], axis=-1) + + conv = self.conv3(conv) + return conv, conv + + +@register +@serializable +class YOLOv3FPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[256, 512, 1024], + norm_type='bn', + freeze_norm=False, + data_format='NCHW'): + """ + YOLOv3FPN layer + + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + + """ + super(YOLOv3FPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + self.data_format = data_format + for i in range(self.num_blocks): + name = 'yolo_block.{}'.format(i) + in_channel = in_channels[-i - 1] + if i > 0: + in_channel += 512 // (2**i) + yolo_block = self.add_sublayer( + name, + YoloDetBlock( + in_channel, + channel=512 // (2**i), + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name)) + self.yolo_blocks.append(yolo_block) + # tip layer output channel doubled + self._out_channels.append(1024 // (2**i)) + + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( 
+ ch_in=512 // (2**i), + ch_out=256 // (2**i), + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks, for_mot=False): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + yolo_feats = [] + + # add embedding features output for multi-object tracking model + if for_mot: + emb_feats = [] + + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if for_mot: + # add embedding features output + emb_feats.append(route) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + if for_mot: + return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} + else: + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOFPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[512, 1024, 2048], + norm_type='bn', + freeze_norm=False, + data_format='NCHW', + coord_conv=False, + conv_block_num=2, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False): + """ + PPYOLOFPN layer + + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + coord_conv (bool): whether use CoordConv or not + conv_block_num (int): conv block num of each pan block + drop_block (bool): whether use DropBlock or not + block_size (int): block size of DropBlock + keep_prob (float): keep probability of DropBlock + spp (bool): whether use spp or not + + """ + super(PPYOLOFPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + # parse kwargs + self.coord_conv = coord_conv + self.drop_block = drop_block + self.block_size = block_size + self.keep_prob = keep_prob + self.spp = spp + self.conv_block_num = conv_block_num + self.data_format = data_format + if self.coord_conv: + ConvLayer = CoordConv + else: + ConvLayer = ConvBNLayer + + if self.drop_block: + dropblock_cfg = [[ + 'dropblock', DropBlock, [self.block_size, self.keep_prob], + dict() + ]] + else: + dropblock_cfg = [] + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + for i, ch_in in enumerate(self.in_channels[::-1]): + if i > 0: + ch_in += 512 // (2**i) + channel = 64 * (2**self.num_blocks) // (2**i) + base_cfg = [] + c_in, c_out = ch_in, channel + for j in range(self.conv_block_num): + base_cfg += [ + [ + 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], + dict( + padding=0, + norm_type=norm_type, + freeze_norm=freeze_norm) + ], + [ + 'conv{}'.format(2 * j + 1), ConvBNLayer, + [c_out, c_out * 2, 3], dict( + padding=1, + norm_type=norm_type, + freeze_norm=freeze_norm) + ], + ] + c_in, c_out = c_out * 2, c_out + + base_cfg += [[ + 'route', ConvLayer, [c_in, c_out, 1], dict( + padding=0, norm_type=norm_type, freeze_norm=freeze_norm) + ], [ + 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( + padding=1, norm_type=norm_type, freeze_norm=freeze_norm) + ]] + + if 
self.conv_block_num == 2: + if i == 0: + if self.spp: + spp_cfg = [[ + 'spp', SPP, [channel * 4, channel, 1], dict( + pool_size=[5, 9, 13], + norm_type=norm_type, + freeze_norm=freeze_norm) + ]] + else: + spp_cfg = [] + cfg = base_cfg[0:3] + spp_cfg + base_cfg[ + 3:4] + dropblock_cfg + base_cfg[4:6] + else: + cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] + elif self.conv_block_num == 0: + if self.spp and i == 0: + spp_cfg = [[ + 'spp', SPP, [c_in * 4, c_in, 1], dict( + pool_size=[5, 9, 13], + norm_type=norm_type, + freeze_norm=freeze_norm) + ]] + else: + spp_cfg = [] + cfg = spp_cfg + dropblock_cfg + base_cfg + name = 'yolo_block.{}'.format(i) + yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) + self.yolo_blocks.append(yolo_block) + self._out_channels.append(channel * 2) + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=channel, + ch_out=256 // (2**i), + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + freeze_norm=freeze_norm, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks, for_mot=False): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + yolo_feats = [] + + # add embedding features output for multi-object tracking model + if for_mot: + emb_feats = [] + + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if for_mot: + # add embedding features output + emb_feats.append(route) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + if for_mot: + return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} + else: + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOTinyFPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[80, 56, 34], + detection_block_channels=[160, 128, 96], + norm_type='bn', + data_format='NCHW', + **kwargs): + """ + PPYOLO Tiny FPN layer + Args: + in_channels (list): input channels for fpn + detection_block_channels (list): channels in fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + kwargs: extra key-value pairs, such as parameter of DropBlock and spp + """ + super(PPYOLOTinyFPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels[::-1] + assert len(detection_block_channels + ) > 0, "detection_block_channelslength should > 0" + self.detection_block_channels = detection_block_channels + self.data_format = data_format + self.num_blocks = len(in_channels) + # parse kwargs + self.drop_block = kwargs.get('drop_block', False) + self.block_size = kwargs.get('block_size', 3) + self.keep_prob = kwargs.get('keep_prob', 0.9) + + self.spp_ = kwargs.get('spp', False) + if self.spp_: + self.spp = SPP(self.in_channels[0] * 4, + self.in_channels[0], + k=1, + pool_size=[5, 9, 13], + norm_type=norm_type, + name='spp') + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + for i, ( + ch_in, ch_out + ) in 
enumerate(zip(self.in_channels, self.detection_block_channels)): + name = 'yolo_block.{}'.format(i) + if i > 0: + ch_in += self.detection_block_channels[i - 1] + yolo_block = self.add_sublayer( + name, + PPYOLOTinyDetBlock( + ch_in, + ch_out, + name, + drop_block=self.drop_block, + block_size=self.block_size, + keep_prob=self.keep_prob)) + self.yolo_blocks.append(yolo_block) + self._out_channels.append(ch_out) + + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks, for_mot=False): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + yolo_feats = [] + + # add embedding features output for multi-object tracking model + if for_mot: + emb_feats = [] + + for i, block in enumerate(blocks): + if i == 0 and self.spp_: + block = self.spp(block) + + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if for_mot: + # add embedding features output + emb_feats.append(route) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + if for_mot: + return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} + else: + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOPAN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[512, 1024, 2048], + norm_type='bn', + data_format='NCHW', + act='mish', + conv_block_num=3, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False): + """ + PPYOLOPAN layer with SPP, DropBlock and CSP connection. 
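+        Builds a top-down FPN followed by a bottom-up PAN, both assembled from
+        PPYOLODetBlockCSP blocks; SPP (deepest level only) and DropBlock are
+        spliced into those blocks when enabled.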
+ + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + act (str): activation function, default mish + conv_block_num (int): conv block num of each pan block + drop_block (bool): whether use DropBlock or not + block_size (int): block size of DropBlock + keep_prob (float): keep probability of DropBlock + spp (bool): whether use spp or not + + """ + super(PPYOLOPAN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + # parse kwargs + self.drop_block = drop_block + self.block_size = block_size + self.keep_prob = keep_prob + self.spp = spp + self.conv_block_num = conv_block_num + self.data_format = data_format + if self.drop_block: + dropblock_cfg = [[ + 'dropblock', DropBlock, [self.block_size, self.keep_prob], + dict() + ]] + else: + dropblock_cfg = [] + + # fpn + self.fpn_blocks = [] + self.fpn_routes = [] + fpn_channels = [] + for i, ch_in in enumerate(self.in_channels[::-1]): + if i > 0: + ch_in += 512 // (2**(i - 1)) + channel = 512 // (2**i) + base_cfg = [] + for j in range(self.conv_block_num): + base_cfg += [ + # name, layer, args + [ + '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], + dict( + padding=0, act=act, norm_type=norm_type) + ], + [ + '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], + dict( + padding=1, act=act, norm_type=norm_type) + ] + ] + + if i == 0 and self.spp: + base_cfg[3] = [ + 'spp', SPP, [channel * 4, channel, 1], dict( + pool_size=[5, 9, 13], act=act, norm_type=norm_type) + ] + + cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] + name = 'fpn.{}'.format(i) + fpn_block = self.add_sublayer( + name, + PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, + data_format)) + self.fpn_blocks.append(fpn_block) + fpn_channels.append(channel * 2) + if i < self.num_blocks - 1: + name = 'fpn_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=channel * 2, + ch_out=channel, + filter_size=1, + stride=1, + padding=0, + act=act, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.fpn_routes.append(route) + # pan + self.pan_blocks = [] + self.pan_routes = [] + self._out_channels = [512 // (2**(self.num_blocks - 2)), ] + for i in reversed(range(self.num_blocks - 1)): + name = 'pan_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=fpn_channels[i + 1], + ch_out=fpn_channels[i + 1], + filter_size=3, + stride=2, + padding=1, + act=act, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.pan_routes = [route, ] + self.pan_routes + base_cfg = [] + ch_in = fpn_channels[i] + fpn_channels[i + 1] + channel = 512 // (2**i) + for j in range(self.conv_block_num): + base_cfg += [ + # name, layer, args + [ + '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], + dict( + padding=0, act=act, norm_type=norm_type) + ], + [ + '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], + dict( + padding=1, act=act, norm_type=norm_type) + ] + ] + + cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] + name = 'pan.{}'.format(i) + pan_block = self.add_sublayer( + name, + PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, + data_format)) + + self.pan_blocks = [pan_block, ] + self.pan_blocks + self._out_channels.append(channel * 2) + + self._out_channels = self._out_channels[::-1] + + def forward(self, blocks, for_mot=False): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + 
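+        # blocks arrive ordered shallow-to-deep (high to low resolution);
+        # reverse them so index 0 is the deepest feature for the top-down pass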
fpn_feats = [] + + # add embedding features output for multi-object tracking model + if for_mot: + emb_feats = [] + + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.fpn_blocks[i](block) + fpn_feats.append(tip) + + if for_mot: + # add embedding features output + emb_feats.append(route) + + if i < self.num_blocks - 1: + route = self.fpn_routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + pan_feats = [fpn_feats[-1], ] + route = fpn_feats[self.num_blocks - 1] + for i in reversed(range(self.num_blocks - 1)): + block = fpn_feats[i] + route = self.pan_routes[i](route) + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + + route, tip = self.pan_blocks[i](block) + pan_feats.append(tip) + + if for_mot: + return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats} + else: + return pan_feats[::-1] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class YOLOCSPPAN(nn.Layer): + """ + YOLO CSP-PAN, used in YOLOv5 and YOLOX. + """ + __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] + + def __init__(self, + depth_mult=1.0, + in_channels=[256, 512, 1024], + depthwise=False, + data_format='NCHW', + act='silu', + trt=False): + super(YOLOCSPPAN, self).__init__() + self.in_channels = in_channels + self._out_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.data_format = data_format + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + # top-down fpn + self.lateral_convs = nn.LayerList() + self.fpn_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + BaseConv( + int(in_channels[idx]), + int(in_channels[idx - 1]), + 1, + 1, + act=act)) + self.fpn_blocks.append( + CSPLayer( + int(in_channels[idx - 1] * 2), + int(in_channels[idx - 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + # bottom-up pan + self.downsample_convs = nn.LayerList() + self.pan_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsample_convs.append( + Conv( + int(in_channels[idx]), + int(in_channels[idx]), + 3, + stride=2, + act=act)) + self.pan_blocks.append( + CSPLayer( + int(in_channels[idx] * 2), + int(in_channels[idx + 1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act)) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + + # top-down fpn + inner_outs = [feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = F.interpolate( + feat_heigh, + scale_factor=2., + mode="nearest", + data_format=self.data_format) + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat( + [upsample_feat, feat_low], axis=1)) + inner_outs.insert(0, inner_out) + + # bottom-up pan + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = 
self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](paddle.concat( + [downsample_feat, feat_height], axis=1)) + outs.append(out) + + return outs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/necks/yolov6_pafpn.py b/ppdet/modeling/necks/yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8830c7f71932e308dc99ac34e90eedcfa442fce1 --- /dev/null +++ b/ppdet/modeling/necks/yolov6_pafpn.py @@ -0,0 +1,530 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/meituan/YOLOv6 +""" + +import paddle +import paddle.nn as nn +from ppdet.core.workspace import register, serializable +from ..backbones.yolov6_efficientrep import SimConv, Transpose, RepLayer, BepC3Layer, make_divisible, get_block, make_divisible_lite +from ..backbones.yolov6_efficientrep import ConvBNHS, DPBlock, CSPBlock +from ..shape_spec import ShapeSpec + +__all__ = ['RepPAN', 'RepBiFPAN', 'CSPRepPAN', 'CSPRepBiFPAN', 'Lite_EffiNeck'] + + +class BiFusion(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + self.cv1 = SimConv(in_channels[0], out_channels, 1, 1) + self.cv2 = SimConv(in_channels[1], out_channels, 1, 1) + self.cv3 = SimConv(out_channels * 3, out_channels, 1, 1) + + self.upsample = Transpose( + in_channels=out_channels, out_channels=out_channels) + self.downsample = SimConv( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=2) + + def forward(self, x): + x0 = self.upsample(x[0]) + x1 = self.cv1(x[1]) + x2 = self.downsample(self.cv2(x[2])) + return self.cv3(paddle.concat([x0, x1, x2], 1)) + + +@register +@serializable +class RepPAN(nn.Layer): + """RepPAN of YOLOv6 n/t/s + """ + __shared__ = ['depth_mult', 'width_mult', 'training_mode'] + + def __init__(self, + depth_mult=1.0, + width_mult=1.0, + in_channels=[256, 512, 1024], + num_repeats=[12, 12, 12, 12], + training_mode='repvgg'): + super(RepPAN, self).__init__() + backbone_ch_list = [64, 128, 256, 512, 1024] + ch_list = backbone_ch_list + [256, 128, 128, 256, 256, 512] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + ch_list = [make_divisible(i * width_mult, 8) for i in (ch_list)] + self.in_channels = in_channels + self._out_channels = ch_list[6], ch_list[8], ch_list[10] + + # block = get_block(training_mode) # RepConv(RepVGGBlock) as default + # Rep_p4 + in_ch, out_ch = self.in_channels[2], ch_list[5] + self.lateral_conv1 = SimConv(in_ch, out_ch, 1, 1) + self.up1 = Transpose(out_ch, out_ch) + self.rep_fpn1 = RepLayer(self.in_channels[1] + out_ch, out_ch, + num_repeats[0]) + + # Rep_p3 + in_ch, out_ch = ch_list[5], ch_list[6] + self.lateral_conv2 = SimConv(in_ch, out_ch, 1, 1) + self.up2 = Transpose(out_ch, out_ch) + 
self.rep_fpn2 = RepLayer(self.in_channels[0] + out_ch, out_ch, + num_repeats[1]) + + # Rep_n3 + in_ch, out_ch1, out_ch2 = ch_list[6], ch_list[7], ch_list[8] + self.down_conv1 = SimConv(in_ch, out_ch1, 3, 2) + self.rep_pan1 = RepLayer(in_ch + out_ch1, out_ch2, num_repeats[2]) + + # Rep_n4 + in_ch, out_ch1, out_ch2 = ch_list[8], ch_list[9], ch_list[10] + self.down_conv2 = SimConv(in_ch, out_ch1, 3, 2) + self.rep_pan2 = RepLayer(ch_list[5] + out_ch1, out_ch2, num_repeats[3]) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [c3, c4, c5] = feats + # [8, 128, 80, 80] [8, 256, 40, 40] [8, 512, 20, 20] + + # top-down FPN + fpn_out1 = self.lateral_conv1(c5) + up_feat1 = self.up1(fpn_out1) + f_concat1 = paddle.concat([up_feat1, c4], 1) + f_out1 = self.rep_fpn1(f_concat1) + + fpn_out2 = self.lateral_conv2(f_out1) + up_feat2 = self.up2(fpn_out2) + f_concat2 = paddle.concat([up_feat2, c3], 1) + pan_out2 = self.rep_fpn2(f_concat2) + + # bottom-up PAN + down_feat1 = self.down_conv1(pan_out2) + p_concat1 = paddle.concat([down_feat1, fpn_out2], 1) + pan_out1 = self.rep_pan1(p_concat1) + + down_feat2 = self.down_conv2(pan_out1) + p_concat2 = paddle.concat([down_feat2, fpn_out1], 1) + pan_out0 = self.rep_pan2(p_concat2) + + return [pan_out2, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class RepBiFPAN(nn.Layer): + """ + RepBiFPAN Neck for YOLOv6 n/s in v3.0 + change lateral_conv + up(Transpose) to BiFusion + """ + __shared__ = ['depth_mult', 'width_mult', 'training_mode'] + + def __init__(self, + depth_mult=0.33, + width_mult=0.50, + in_channels=[128, 256, 512, 1024], + training_mode='repvgg'): + super(RepBiFPAN, self).__init__() + backbone_ch_list = [64, 128, 256, 512, 1024] + backbone_num_repeats = [1, 6, 12, 18, 6] + + ch_list = backbone_ch_list + [256, 128, 128, 256, 256, 512] + ch_list = [make_divisible(i * width_mult, 8) for i in (ch_list)] + + num_repeats = backbone_num_repeats + [12, 12, 12, 12] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + + self.in_channels = in_channels + self._out_channels = ch_list[6], ch_list[8], ch_list[10] + + # block = get_block(training_mode) # RepConv(RepVGGBlock) as default + # Rep_p4 + self.reduce_layer0 = SimConv(ch_list[4], ch_list[5], 1, 1) + self.Bifusion0 = BiFusion([ch_list[3], ch_list[5]], ch_list[5]) + self.Rep_p4 = RepLayer(ch_list[5], ch_list[5], num_repeats[5]) + + # Rep_p3 + self.reduce_layer1 = SimConv(ch_list[5], ch_list[6], 1, 1) + self.Bifusion1 = BiFusion([ch_list[5], ch_list[6]], ch_list[6]) + self.Rep_p3 = RepLayer(ch_list[6], ch_list[6], num_repeats[6]) + + # Rep_n3 + self.downsample2 = SimConv(ch_list[6], ch_list[7], 3, 2) + self.Rep_n3 = RepLayer(ch_list[6] + ch_list[7], ch_list[8], + num_repeats[7]) + + # Rep_n4 + self.downsample1 = SimConv(ch_list[8], ch_list[9], 3, 2) + self.Rep_n4 = RepLayer(ch_list[5] + ch_list[9], ch_list[10], + num_repeats[8]) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [x3, x2, x1, x0] = feats # p2, p3, p4, p5 + + # top-down + fpn_out0 = self.reduce_layer0(x0) + f_concat_layer0 = self.Bifusion0([fpn_out0, x1, x2]) + f_out0 = self.Rep_p4(f_concat_layer0) + + fpn_out1 = self.reduce_layer1(f_out0) + f_concat_layer1 = self.Bifusion1([fpn_out1, x2, x3]) + pan_out2 = 
self.Rep_p3(f_concat_layer1) + + # bottom-up + down_feat1 = self.downsample2(pan_out2) + p_concat_layer1 = paddle.concat([down_feat1, fpn_out1], 1) + pan_out1 = self.Rep_n3(p_concat_layer1) + + down_feat0 = self.downsample1(pan_out1) + p_concat_layer2 = paddle.concat([down_feat0, fpn_out0], 1) + pan_out0 = self.Rep_n4(p_concat_layer2) + + return [pan_out2, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class CSPRepPAN(nn.Layer): + """ + CSPRepPAN of YOLOv6 m/l + """ + + __shared__ = ['depth_mult', 'width_mult', 'act', 'training_mode'] + + def __init__(self, + depth_mult=1.0, + width_mult=1.0, + in_channels=[256, 512, 1024], + num_repeats=[12, 12, 12, 12], + training_mode='repvgg', + csp_e=0.5, + act='relu'): + super(CSPRepPAN, self).__init__() + backbone_ch_list = [64, 128, 256, 512, 1024] + ch_list = backbone_ch_list + [256, 128, 128, 256, 256, 512] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + ch_list = [make_divisible(i * width_mult, 8) for i in (ch_list)] + self.in_channels = in_channels + self._out_channels = ch_list[6], ch_list[8], ch_list[10] + if csp_e == 0.67: + csp_e = float(2) / 3 + block = get_block(training_mode) + # RepConv(or RepVGGBlock) in M, but ConvBNSiLUBlock(or ConvWrapper) in L + + # Rep_p4 + in_ch, out_ch = self.in_channels[2], ch_list[5] + self.lateral_conv1 = SimConv(in_ch, out_ch, 1, 1) + self.up1 = Transpose(out_ch, out_ch) + self.Rep_p4 = BepC3Layer( + self.in_channels[1] + out_ch, + out_ch, + num_repeats[0], + csp_e, + block=block, + act=act) + + # Rep_p3 + in_ch, out_ch = ch_list[5], ch_list[6] + self.lateral_conv2 = SimConv(in_ch, out_ch, 1, 1) + self.up2 = Transpose(out_ch, out_ch) + self.Rep_p3 = BepC3Layer( + self.in_channels[0] + out_ch, + out_ch, + num_repeats[1], + csp_e, + block=block, + act=act) + + # Rep_n3 + in_ch, out_ch1, out_ch2 = ch_list[6], ch_list[7], ch_list[8] + self.down_conv1 = SimConv(in_ch, out_ch1, 3, 2) + self.Rep_n3 = BepC3Layer( + in_ch + out_ch1, + out_ch2, + num_repeats[2], + csp_e, + block=block, + act=act) + + # Rep_n4 + in_ch, out_ch1, out_ch2 = ch_list[8], ch_list[9], ch_list[10] + self.down_conv2 = SimConv(in_ch, out_ch1, 3, 2) + self.Rep_n4 = BepC3Layer( + ch_list[5] + out_ch1, + out_ch2, + num_repeats[3], + csp_e, + block=block, + act=act) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [c3, c4, c5] = feats + # [8, 128, 80, 80] [8, 256, 40, 40] [8, 512, 20, 20] + + # top-down FPN + fpn_out1 = self.lateral_conv1(c5) # reduce_layer0 + up_feat1 = self.up1(fpn_out1) + f_concat1 = paddle.concat([up_feat1, c4], 1) + f_out1 = self.Rep_p4(f_concat1) + + fpn_out2 = self.lateral_conv2(f_out1) # reduce_layer1 + up_feat2 = self.up2(fpn_out2) + f_concat2 = paddle.concat([up_feat2, c3], 1) + pan_out2 = self.Rep_p3(f_concat2) + + # bottom-up PAN + down_feat1 = self.down_conv1(pan_out2) # downsample2 + p_concat1 = paddle.concat([down_feat1, fpn_out2], 1) + pan_out1 = self.Rep_n3(p_concat1) + + down_feat2 = self.down_conv2(pan_out1) # downsample1 + p_concat2 = paddle.concat([down_feat2, fpn_out1], 1) + pan_out0 = self.Rep_n4(p_concat2) + + return [pan_out2, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + 
return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class CSPRepBiFPAN(nn.Layer): + """ + CSPRepBiFPAN of YOLOv6 m/l in v3.0 + change lateral_conv + up(Transpose) to BiFusion + """ + __shared__ = ['depth_mult', 'width_mult', 'act', 'training_mode'] + + def __init__(self, + depth_mult=1.0, + width_mult=1.0, + in_channels=[128, 256, 512, 1024], + training_mode='repvgg', + csp_e=0.5, + act='relu'): + super(CSPRepBiFPAN, self).__init__() + backbone_ch_list = [64, 128, 256, 512, 1024] + backbone_num_repeats = [1, 6, 12, 18, 6] + + ch_list = backbone_ch_list + [256, 128, 128, 256, 256, 512] + ch_list = [make_divisible(i * width_mult, 8) for i in (ch_list)] + + num_repeats = backbone_num_repeats + [12, 12, 12, 12] + num_repeats = [(max(round(i * depth_mult), 1) if i > 1 else i) + for i in (num_repeats)] + + self.in_channels = in_channels + self._out_channels = ch_list[6], ch_list[8], ch_list[10] + if csp_e == 0.67: + csp_e = float(2) / 3 + block = get_block(training_mode) + # RepConv(or RepVGGBlock) in M, but ConvBNSiLUBlock(or ConvWrapper) in L + + # Rep_p4 + self.reduce_layer0 = SimConv(ch_list[4], ch_list[5], 1, 1) + self.Bifusion0 = BiFusion([ch_list[3], ch_list[5]], ch_list[5]) + self.Rep_p4 = BepC3Layer( + ch_list[5], ch_list[5], num_repeats[5], csp_e, block=block, act=act) + + # Rep_p3 + self.reduce_layer1 = SimConv(ch_list[5], ch_list[6], 1, 1) + self.Bifusion1 = BiFusion([ch_list[5], ch_list[6]], ch_list[6]) + self.Rep_p3 = BepC3Layer( + ch_list[6], ch_list[6], num_repeats[6], csp_e, block=block, act=act) + + # Rep_n3 + self.downsample2 = SimConv(ch_list[6], ch_list[7], 3, 2) + self.Rep_n3 = BepC3Layer( + ch_list[6] + ch_list[7], + ch_list[8], + num_repeats[7], + csp_e, + block=block, + act=act) + + # Rep_n4 + self.downsample1 = SimConv(ch_list[8], ch_list[9], 3, 2) + self.Rep_n4 = BepC3Layer( + ch_list[5] + ch_list[9], + ch_list[10], + num_repeats[8], + csp_e, + block=block, + act=act) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [x3, x2, x1, x0] = feats # p2, p3, p4, p5 + + # top-down FPN + fpn_out0 = self.reduce_layer0(x0) + f_concat_layer0 = self.Bifusion0([fpn_out0, x1, x2]) + f_out0 = self.Rep_p4(f_concat_layer0) + + fpn_out1 = self.reduce_layer1(f_out0) + f_concat_layer1 = self.Bifusion1([fpn_out1, x2, x3]) + pan_out2 = self.Rep_p3(f_concat_layer1) + + # bottom-up PAN + down_feat1 = self.downsample2(pan_out2) + p_concat_layer1 = paddle.concat([down_feat1, fpn_out1], 1) + pan_out1 = self.Rep_n3(p_concat_layer1) + + down_feat0 = self.downsample1(pan_out1) + p_concat_layer2 = paddle.concat([down_feat0, fpn_out0], 1) + pan_out0 = self.Rep_n4(p_concat_layer2) + + return [pan_out2, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class Lite_EffiNeck(nn.Layer): + """Lite_EffiNeck of YOLOv6-lite """ + + def __init__(self, in_channels=[64, 128, 256], unified_channels=96): + super().__init__() + self.in_channels = in_channels + self._out_channels = [unified_channels] * 4 + + self.reduce_layer0 = ConvBNHS( + in_channels=in_channels[2], + out_channels=unified_channels, + kernel_size=1, + stride=1, + padding=0) + self.reduce_layer1 = ConvBNHS( + in_channels=in_channels[1], + out_channels=unified_channels, + kernel_size=1, + stride=1, + padding=0) + self.reduce_layer2 = ConvBNHS( + 
in_channels=in_channels[0], + out_channels=unified_channels, + kernel_size=1, + stride=1, + padding=0) + self.upsample0 = nn.Upsample(scale_factor=2, mode='nearest') + self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest') + + self.Csp_p4 = CSPBlock( + in_channels=unified_channels * 2, + out_channels=unified_channels, + kernel_size=5) + self.Csp_p3 = CSPBlock( + in_channels=unified_channels * 2, + out_channels=unified_channels, + kernel_size=5) + self.Csp_n3 = CSPBlock( + in_channels=unified_channels * 2, + out_channels=unified_channels, + kernel_size=5) + self.Csp_n4 = CSPBlock( + in_channels=unified_channels * 2, + out_channels=unified_channels, + kernel_size=5) + self.downsample2 = DPBlock( + in_channel=unified_channels, + out_channel=unified_channels, + kernel_size=5, + stride=2) + self.downsample1 = DPBlock( + in_channel=unified_channels, + out_channel=unified_channels, + kernel_size=5, + stride=2) + self.p6_conv_1 = DPBlock( + in_channel=unified_channels, + out_channel=unified_channels, + kernel_size=5, + stride=2) + self.p6_conv_2 = DPBlock( + in_channel=unified_channels, + out_channel=unified_channels, + kernel_size=5, + stride=2) + + def forward(self, feats, for_mot=False): + (c3, c4, c5) = feats + # [1, 48, 80, 80] [1, 96, 40, 40] [1, 176, 20, 20] + + fpn_out0 = self.reduce_layer0(c5) #c5 # [1, 96, 20, 20] + x1 = self.reduce_layer1(c4) #c4 # [1, 96, 40, 40] + x2 = self.reduce_layer2(c3) #c3 # [1, 96, 80, 80] + + upsample_feat0 = self.upsample0(fpn_out0) + f_concat_layer0 = paddle.concat([upsample_feat0, x1], 1) + f_out1 = self.Csp_p4(f_concat_layer0) + + upsample_feat1 = self.upsample1(f_out1) + f_concat_layer1 = paddle.concat([upsample_feat1, x2], 1) + pan_out3 = self.Csp_p3(f_concat_layer1) #p3 + + down_feat1 = self.downsample2(pan_out3) + p_concat_layer1 = paddle.concat([down_feat1, f_out1], 1) + pan_out2 = self.Csp_n3(p_concat_layer1) #p4 + + down_feat0 = self.downsample1(pan_out2) + p_concat_layer2 = paddle.concat([down_feat0, fpn_out0], 1) + pan_out1 = self.Csp_n4(p_concat_layer2) #p5 + + top_features = self.p6_conv_1(fpn_out0) + pan_out0 = top_features + self.p6_conv_2(pan_out1) #p6 + + outputs = [pan_out3, pan_out2, pan_out1, pan_out0] + return outputs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/necks/yolov7_pafpn.py b/ppdet/modeling/necks/yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0774d729db7b3fcfe2a69ba270dd0d7c9f8a78 --- /dev/null +++ b/ppdet/modeling/necks/yolov7_pafpn.py @@ -0,0 +1,415 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
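All of the necks added in this patch share the same interface: from_config derives in_channels from the backbone's ShapeSpec list, forward takes one tensor per pyramid level, and out_shape reports per-level channels to the detection head. Below is a minimal sketch of that contract using YOLOCSPPAN, assuming the ppdet package from this patch is importable; the backbone shapes, the 640x640 input size, and the batch size of 1 are illustrative assumptions, not values from any shipped config.

import paddle

from ppdet.modeling.necks.yolo_fpn import YOLOCSPPAN
from ppdet.modeling.shape_spec import ShapeSpec

# Assumed backbone outputs: a stride-8/16/32 pyramid with 256/512/1024 channels.
backbone_shapes = [
    ShapeSpec(channels=256, stride=8),
    ShapeSpec(channels=512, stride=16),
    ShapeSpec(channels=1024, stride=32),
]

# from_config maps the backbone ShapeSpec list to constructor kwargs (in_channels).
neck = YOLOCSPPAN(**YOLOCSPPAN.from_config(None, backbone_shapes))

# One feature map per level for the assumed 640x640 input, batch size 1.
feats = [
    paddle.rand([1, s.channels, 640 // s.stride, 640 // s.stride])
    for s in backbone_shapes
]
outs = neck(feats)

# out_shape tells the head how many channels each returned level carries.
print([tuple(o.shape) for o in outs])        # [(1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)]
print([s.channels for s in neck.out_shape])  # [256, 512, 1024]

The same pattern applies to the RTMDet, YOLOv6, YOLOv7 and YOLOv8 necks in this patch; only the constructor arguments and the number of returned levels differ.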
+ +import paddle +import paddle.nn as nn +from ppdet.core.workspace import register, serializable +from ..backbones.yolov7_elannet import BaseConv, ELANLayer, ELAN2Layer, MPConvLayer, RepConv, DownC +from ..shape_spec import ShapeSpec + +__all__ = ['ELANFPN', 'ELANFPNP6'] + + +@register +@serializable +class ELANFPN(nn.Layer): + """ + YOLOv7 E-ELAN FPN, used in P5 model like ['tiny', 'L', 'X'], return 3 feats + """ + __shared__ = ['arch', 'depth_mult', 'width_mult', 'act', 'trt'] + + # [in_ch, mid_ch1, mid_ch2, out_ch] of each ELANLayer (2 FPN + 2 PAN): + ch_settings = { + 'tiny': [[256, 64, 64, 128], [128, 32, 32, 64], [64, 64, 64, 128], + [128, 128, 128, 256]], + 'L': [[512, 256, 128, 256], [256, 128, 64, 128], [128, 256, 128, 256], + [256, 512, 256, 512]], + 'X': [[640, 256, 256, 320], [320, 128, 128, 160], [160, 256, 256, 320], + [320, 512, 512, 640]], + } + # concat_list of each ELANLayer: + concat_list_settings = { + 'tiny': [-1, -2, -3, -4], + 'L': [-1, -2, -3, -4, -5, -6], + 'X': [-1, -3, -5, -7, -8], + } + num_blocks = {'tiny': 2, 'L': 4, 'X': 6} + + def __init__( + self, + arch='L', + depth_mult=1.0, + width_mult=1.0, + in_channels=[512, 1024, 512], # layer num: 24 37 51 [c3,c4,c5] + out_channels=[256, 512, 1024], # layer num: 75 88 101 + depthwise=False, + for_u6=False, # u6 branch, YOLOv7u version + act='silu', + trt=False): + super(ELANFPN, self).__init__() + self.in_channels = in_channels + self.arch = arch + concat_list = self.concat_list_settings[arch] + num_blocks = self.num_blocks[arch] + ch_settings = self.ch_settings[arch] + self._out_channels = [chs[-1] * 2 for chs in ch_settings[1:]] + self.for_u6 = for_u6 + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[0][:] + self.lateral_conv1 = BaseConv( + self.in_channels[2], out_ch, 1, 1, act=act) # 512->256 + self.route_conv1 = BaseConv( + self.in_channels[1], out_ch, 1, 1, act=act) # 1024->256 + self.elan_fpn1 = ELANLayer( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[1][:] + self.lateral_conv2 = BaseConv(in_ch, out_ch, 1, 1, act=act) # 256->128 + self.route_conv2 = BaseConv( + self.in_channels[0], out_ch, 1, 1, act=act) # 512->128 + self.elan_fpn2 = ELANLayer( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[2][:] + if self.arch in ['L', 'X']: + self.mp_conv1 = MPConvLayer(in_ch, out_ch, 0.5, depthwise, act=act) + # TODO: named down_conv1 + elif self.arch in ['tiny']: + self.mp_conv1 = BaseConv(in_ch, out_ch, 3, 2, act=act) + else: + raise AttributeError("Unsupported arch type: {}".format(self.arch)) + self.elan_pan1 = ELANLayer( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[3][:] + if self.arch in ['L', 'X']: + self.mp_conv2 = MPConvLayer(in_ch, out_ch, 0.5, depthwise, act=act) + elif self.arch in ['tiny']: + self.mp_conv2 = BaseConv(in_ch, out_ch, 3, 2, act=act) + else: + raise AttributeError("Unsupported arch type: {}".format(self.arch)) + self.elan_pan2 = ELANLayer( + out_ch + self.in_channels[2], # concat([pan_out1_down, c5], 1) + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + self.repconvs = nn.LayerList() + Conv = RepConv if (self.arch == 'L' and not self.for_u6) else BaseConv + for out_ch in self._out_channels: + 
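+            # each ELAN output carries out_ch // 2 channels; a 3x3 RepConv or
+            # BaseConv doubles it to the reported out_channel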
self.repconvs.append(Conv(int(out_ch // 2), out_ch, 3, 1, act=act)) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [c3, c4, c5] = feats # 24 37 51 + # [8, 512, 80, 80] [8, 1024, 40, 40] [8, 512, 20, 20] + + # Top-Down FPN + p5_lateral = self.lateral_conv1(c5) # 512->256 + p5_up = self.upsample(p5_lateral) + route_c4 = self.route_conv1(c4) # 1024->256 # route + f_out1 = paddle.concat([route_c4, p5_up], 1) # 512 # [8, 512, 40, 40] + fpn_out1 = self.elan_fpn1(f_out1) # 512 -> 128*4 + 256*2 -> 1024 -> 256 + # 63 + + fpn_out1_lateral = self.lateral_conv2(fpn_out1) # 256->128 + fpn_out1_up = self.upsample(fpn_out1_lateral) + route_c3 = self.route_conv2(c3) # 512->128 # route + f_out2 = paddle.concat([route_c3, fpn_out1_up], 1) # 256 + fpn_out2 = self.elan_fpn2(f_out2) # 256 -> 64*4 + 128*2 -> 512 -> 128 + # layer 75: [8, 128, 80, 80] + + # Buttom-Up PAN + p_out1_down = self.mp_conv1(fpn_out2) # 128 + p_out1 = paddle.concat([p_out1_down, fpn_out1], 1) # 128*2 + 256 -> 512 + pan_out1 = self.elan_pan1(p_out1) # 512 -> 128*4 + 256*2 -> 1024 -> 256 + # layer 88: [8, 256, 40, 40] + + pan_out1_down = self.mp_conv2(pan_out1) # 256 + p_out2 = paddle.concat([pan_out1_down, c5], 1) # 256*2 + 512 -> 1024 + pan_out2 = self.elan_pan2( + p_out2) # 1024 -> 256*4 + 512*2 -> 2048 -> 512 + # layer 101: [8, 512, 20, 20] + + outputs = [] + pan_outs = [fpn_out2, pan_out1, pan_out2] # 75 88 101 + for i, out in enumerate(pan_outs): + outputs.append(self.repconvs[i](out)) + return outputs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class ELANFPNP6(nn.Layer): + """ + YOLOv7P6 E-ELAN FPN, used in P6 model like ['W6', 'E6', 'D6', 'E6E'] + return 4 feats + """ + __shared__ = ['arch', 'depth_mult', 'width_mult', 'act', 'use_aux', 'trt'] + + # in_ch, mid_ch1, mid_ch2, out_ch of each ELANLayer (3 FPN + 3 PAN): + ch_settings = { + 'W6': + [[512, 384, 192, 384], [384, 256, 128, 256], [256, 128, 64, 128], + [128, 256, 128, 256], [256, 384, 192, 384], [384, 512, 256, 512]], + 'E6': [[640, 384, 192, 480], [480, 256, 128, 320], [320, 128, 64, 160], + [160, 256, 128, 320], [320, 384, 192, 480], + [480, 512, 256, 640]], + 'D6': [[768, 384, 192, 576], [576, 256, 128, 384], [384, 128, 64, 192], + [192, 256, 128, 384], [384, 384, 192, 576], + [576, 512, 256, 768]], + 'E6E': [[640, 384, 192, 480], [480, 256, 128, 320], + [320, 128, 64, 160], [160, 256, 128, 320], + [320, 384, 192, 480], [480, 512, 256, 640]], + } + # concat_list of each ELANLayer: + concat_list_settings = { + 'W6': [-1, -2, -3, -4, -5, -6], + 'E6': [-1, -2, -3, -4, -5, -6, -7, -8], + 'D6': [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10], + 'E6E': [-1, -2, -3, -4, -5, -6, -7, -8], + } + num_blocks = {'W6': 4, 'E6': 6, 'D6': 8, 'E6E': 6} + + def __init__( + self, + arch='W6', + use_aux=False, + depth_mult=1.0, + width_mult=1.0, + in_channels=[256, 512, 768, 512], # 19 28 37 47 (c3,c4,c5,c6) + out_channels=[256, 512, 768, 1024], # layer: 83 93 103 113 + depthwise=False, + act='silu', + trt=False): + super(ELANFPNP6, self).__init__() + self.in_channels = in_channels + self.arch = arch + self.use_aux = use_aux + concat_list = self.concat_list_settings[arch] + num_blocks = self.num_blocks[arch] + ch_settings = self.ch_settings[arch] + self._out_channels = [chs[-1] * 2 for chs in ch_settings[2:]] + if self.training and self.use_aux: + 
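+            # aux branch (training only): the intermediate FPN features and c6 are
+            # also exported through repconvs_aux, so four aux channel entries are
+            # appended to _out_channels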
chs_aux = [chs[-1] for chs in ch_settings[:3][::-1] + ] + [self.in_channels[3]] + self.in_channels_aux = chs_aux + self._out_channels = self._out_channels + [320, 640, 960, 1280] + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + ELANBlock = ELAN2Layer if self.arch in ['E6E'] else ELANLayer + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[0][:] + self.lateral_conv1 = BaseConv( + self.in_channels[3], out_ch, 1, 1, act=act) # 512->384 + self.route_conv1 = BaseConv( + self.in_channels[2], out_ch, 1, 1, act=act) # 768->384 + self.elan_fpn1 = ELANBlock( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[1][:] + self.lateral_conv2 = BaseConv(in_ch, out_ch, 1, 1, act=act) # 384->256 + self.route_conv2 = BaseConv( + self.in_channels[1], out_ch, 1, 1, act=act) # 512->256 + self.elan_fpn2 = ELANBlock( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[2][:] + self.lateral_conv3 = BaseConv(in_ch, out_ch, 1, 1, act=act) # 256->128 + self.route_conv3 = BaseConv( + self.in_channels[0], out_ch, 1, 1, act=act) # 256->128 + self.elan_fpn3 = ELANBlock( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[3][:] + if self.arch in ['W6']: + self.down_conv1 = BaseConv(in_ch, out_ch, 3, 2, act=act) + elif self.arch in ['E6', 'D6', 'E6E']: + self.down_conv1 = DownC(in_ch, out_ch, 2, act=act) + else: + raise AttributeError("Unsupported arch type: {}".format(self.arch)) + self.elan_pan1 = ELANBlock( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[4][:] + if self.arch in ['W6']: + self.down_conv2 = BaseConv(in_ch, out_ch, 3, 2, act=act) + elif self.arch in ['E6', 'D6', 'E6E']: + self.down_conv2 = DownC(in_ch, out_ch, 2, act=act) + else: + raise AttributeError("Unsupported arch type: {}".format(self.arch)) + self.elan_pan2 = ELANBlock( + out_ch * 2, + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + in_ch, mid_ch1, mid_ch2, out_ch = ch_settings[5][:] + if self.arch in ['W6']: + self.down_conv3 = BaseConv(in_ch, out_ch, 3, 2, act=act) + elif self.arch in ['E6', 'D6', 'E6E']: + self.down_conv3 = DownC(in_ch, out_ch, 2, act=act) + else: + raise AttributeError("Unsupported arch type: {}".format(self.arch)) + self.elan_pan3 = ELANBlock( + out_ch + self.in_channels[3], # concat([pan_out2_down, c6], 1) + mid_ch1, + mid_ch2, + out_ch, + num_blocks, + concat_list, + depthwise, + act=act) + + self.repconvs = nn.LayerList() + Conv = BaseConv + for i, _out_ch in enumerate(self._out_channels[:4]): + self.repconvs.append(Conv(_out_ch // 2, _out_ch, 3, 1, act=act)) + + if self.training and self.use_aux: + self.repconvs_aux = nn.LayerList() + for i, _out_ch in enumerate(self._out_channels[4:]): + self.repconvs_aux.append( + Conv( + self.in_channels_aux[i], _out_ch, 3, 1, act=act)) + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + [c3, c4, c5, c6] = feats # 19 28 37 47 + # [8, 256, 160, 160] [8, 512, 80, 80] [8, 768, 40, 40] [8, 512, 20, 20] + + # Top-Down FPN + p6_lateral = self.lateral_conv1(c6) # 512->384 + p6_up = self.upsample(p6_lateral) + route_c5 = self.route_conv1(c5) # 768->384 # route + f_out1 = paddle.concat([route_c5, p6_up], 1) # 768 # [8, 
768, 40, 40] + fpn_out1 = self.elan_fpn1(f_out1) # 768 -> 192*4 + 384*2 -> 1536 -> 384 + # layer 59: [8, 384, 40, 40] + + fpn_out1_lateral = self.lateral_conv2(fpn_out1) # 384->256 + fpn_out1_up = self.upsample(fpn_out1_lateral) + route_c4 = self.route_conv2(c4) # 512->256 # route + f_out2 = paddle.concat([route_c4, fpn_out1_up], + 1) # 512 # [8, 512, 80, 80] + fpn_out2 = self.elan_fpn2(f_out2) # 512 -> 128*4 + 256*2 -> 1024 -> 256 + # layer 71: [8, 256, 80, 80] + + fpn_out2_lateral = self.lateral_conv3(fpn_out2) # 256->128 + fpn_out2_up = self.upsample(fpn_out2_lateral) + route_c3 = self.route_conv3(c3) # 512->128 # route + f_out3 = paddle.concat([route_c3, fpn_out2_up], 1) # 256 + fpn_out3 = self.elan_fpn3(f_out3) # 256 -> 64*4 + 128*2 -> 512 -> 128 + # layer 83: [8, 128, 160, 160] + + # Buttom-Up PAN + p_out1_down = self.down_conv1(fpn_out3) # 128->256 + p_out1 = paddle.concat([p_out1_down, fpn_out2], 1) # 256 + 256 -> 512 + pan_out1 = self.elan_pan1(p_out1) # 512 -> 128*4 + 256*2 -> 1024 -> 256 + # layer 93: [8, 256, 80, 80] + + pan_out1_down = self.down_conv2(pan_out1) # 256->384 + p_out2 = paddle.concat([pan_out1_down, fpn_out1], 1) # 384 + 384 -> 768 + pan_out2 = self.elan_pan2(p_out2) # 768 -> 192*4 + 384*2 -> 1536 -> 384 + # layer 103: [8, 384, 40, 40] + + pan_out2_down = self.down_conv3(pan_out2) # 384->512 + p_out3 = paddle.concat([pan_out2_down, c6], 1) # 512 + 512 -> 1024 + pan_out3 = self.elan_pan3( + p_out3) # 1024 -> 256*4 + 512*2 -> 2048 -> 512 + # layer 113: [8, 512, 20, 20] + + outputs = [] + pan_outs = [fpn_out3, pan_out1, pan_out2, pan_out3] # 83 93 103 113 + for i, out in enumerate(pan_outs): + outputs.append(self.repconvs[i](out)) + + if self.training and self.use_aux: + aux_outs = [fpn_out3, fpn_out2, fpn_out1, c6] # 83 71 59 47 + for i, out in enumerate(aux_outs): + outputs.append(self.repconvs_aux[i](out)) + return outputs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/necks/yolov8_pafpn.py b/ppdet/modeling/necks/yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..d5e8099204fcd950afc78ea4ea3e6832e5b25205 --- /dev/null +++ b/ppdet/modeling/necks/yolov8_pafpn.py @@ -0,0 +1,231 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec +from ..backbones.csp_darknet import BaseConv +from ..backbones.yolov8_csp_darknet import C2fLayer, C2Layer + +__all__ = ['YOLOv8CSPPAN', 'YOLOv8CSPPANP6'] + + +@register +@serializable +class YOLOv8CSPPAN(nn.Layer): + """ + YOLOv8 CSP-PAN FPN, used in YOLOv8 + diff with YOLOv5 CSP-PAN FPN: + 1. no lateral convs + 2. 
use C2fLayer in YOLOv8 while CSPLayer in YOLOv5 + """ + __shared__ = ['depth_mult', 'act', 'trt'] + + def __init__(self, + depth_mult=1.0, + in_channels=[256, 512, 1024], + depthwise=False, + act='silu', + trt=False): + super(YOLOv8CSPPAN, self).__init__() + self.in_channels = in_channels + self._out_channels = in_channels + + # top-down + self.fpn_p4 = C2fLayer( + int(in_channels[2] + in_channels[1]), + int(in_channels[1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.fpn_p3 = C2fLayer( + int(in_channels[1] + in_channels[0]), + int(in_channels[0]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + # bottom-up + self.down_conv2 = BaseConv( + int(in_channels[0]), int(in_channels[0]), 3, stride=2, act=act) + self.pan_n3 = C2fLayer( + int(in_channels[0] + in_channels[1]), + int(in_channels[1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.down_conv1 = BaseConv( + int(in_channels[1]), int(in_channels[1]), 3, stride=2, act=act) + self.pan_n4 = C2fLayer( + int(in_channels[1] + in_channels[2]), + int(in_channels[2]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + def forward(self, feats, for_mot=False): + [c3, c4, c5] = feats + + # top-down FPN + up_feat1 = F.interpolate(c5, scale_factor=2., mode="nearest") + f_concat1 = paddle.concat([up_feat1, c4], 1) + f_out1 = self.fpn_p4(f_concat1) + + up_feat2 = F.interpolate(f_out1, scale_factor=2., mode="nearest") + f_concat2 = paddle.concat([up_feat2, c3], 1) + f_out0 = self.fpn_p3(f_concat2) + + # bottom-up PAN + down_feat1 = self.down_conv2(f_out0) + p_concat1 = paddle.concat([down_feat1, f_out1], 1) + pan_out1 = self.pan_n3(p_concat1) + + down_feat2 = self.down_conv1(pan_out1) + p_concat2 = paddle.concat([down_feat2, c5], 1) + pan_out0 = self.pan_n4(p_concat2) + + return [f_out0, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class YOLOv8CSPPANP6(nn.Layer): + """ + YOLOv8 CSP-PAN FPN, used in YOLOv8-P6 + diff with YOLOv5 CSP-PAN FPN: + 1. no lateral convs + 2. 
use C2Layer in YOLOv8-P6 while CSPLayer in YOLOv5-P6 + """ + __shared__ = ['depth_mult', 'act', 'trt'] + + def __init__(self, + depth_mult=1.0, + in_channels=[256, 512, 768, 1024], + depthwise=False, + act='silu', + trt=False): + super(YOLOv8CSPPANP6, self).__init__() + self.in_channels = in_channels + self._out_channels = in_channels + + # top-down + self.fpn_p5 = C2Layer( + int(in_channels[3] + in_channels[2]), + int(in_channels[2]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.fpn_p4 = C2Layer( + int(in_channels[2] + in_channels[1]), + int(in_channels[1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.fpn_p3 = C2Layer( + int(in_channels[1] + in_channels[0]), + int(in_channels[0]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + # bottom-up + self.down_conv2 = BaseConv( + int(in_channels[0]), int(in_channels[0]), 3, stride=2, act=act) + self.pan_n3 = C2Layer( + int(in_channels[0] + in_channels[1]), + int(in_channels[1]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.down_conv1 = BaseConv( + int(in_channels[1]), int(in_channels[1]), 3, stride=2, act=act) + self.pan_n4 = C2Layer( + int(in_channels[1] + in_channels[2]), + int(in_channels[2]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + self.down_conv0 = BaseConv( + int(in_channels[2]), int(in_channels[2]), 3, stride=2, act=act) + self.pan_n5 = C2Layer( + int(in_channels[2] + in_channels[3]), + int(in_channels[3]), + round(3 * depth_mult), + shortcut=False, + depthwise=depthwise, + act=act) + + def forward(self, feats, for_mot=False): + [c3, c4, c5, c6] = feats + + # top-down FPN + up_feat0 = F.interpolate(c6, scale_factor=2., mode="nearest") + f_concat0 = paddle.concat([up_feat0, c5], 1) + f_out0 = self.fpn_p5(f_concat0) + + up_feat1 = F.interpolate(f_out0, scale_factor=2., mode="nearest") + f_concat1 = paddle.concat([up_feat1, c4], 1) + f_out1 = self.fpn_p4(f_concat1) + + up_feat2 = F.interpolate(f_out1, scale_factor=2., mode="nearest") + f_concat2 = paddle.concat([up_feat2, c3], 1) + f_out2 = self.fpn_p3(f_concat2) + + # bottom-up PAN + down_feat1 = self.down_conv2(f_out2) + p_concat1 = paddle.concat([down_feat1, f_out1], 1) + pan_out2 = self.pan_n3(p_concat1) + + down_feat2 = self.down_conv1(pan_out2) + p_concat2 = paddle.concat([down_feat2, c5], 1) + pan_out1 = self.pan_n4(p_concat2) + + down_feat3 = self.down_conv0(pan_out1) + p_concat3 = paddle.concat([down_feat3, c6], 1) + pan_out0 = self.pan_n5(p_concat3) + + return [f_out2, pan_out2, pan_out1, pan_out0] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..6e54367b76d980eb19d963fa307d355933464edf --- /dev/null +++ b/ppdet/modeling/ops.py @@ -0,0 +1,444 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle import ParamAttr +from paddle.regularizer import L2Decay +try: + import paddle._legacy_C_ops as C_ops +except: + import paddle._C_ops as C_ops + +from paddle import in_dynamic_mode +from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype + +__all__ = [ + 'multiclass_nms', 'matrix_nms', 'batch_norm', 'mish', 'silu', 'swish', + 'identity' +] + + +def identity(x): + return x + + +def mish(x): + return F.mish(x) if hasattr(F, mish) else x * F.tanh(F.softplus(x)) + + +def silu(x): + return F.silu(x) + + +def swish(x): + return x * F.sigmoid(x) + + +TRT_ACT_SPEC = {'swish': swish, 'silu': swish} + +ACT_SPEC = {'mish': mish, 'silu': silu} + + +def get_act_fn(act=None, trt=False): + assert act is None or isinstance(act, ( + str, dict)), 'name of activation should be str, dict or None' + if not act: + return identity + + if isinstance(act, dict): + name = act['name'] + act.pop('name') + kwargs = act + else: + name = act + kwargs = dict() + + if trt and name in TRT_ACT_SPEC: + fn = TRT_ACT_SPEC[name] + elif name in ACT_SPEC: + fn = ACT_SPEC[name] + else: + fn = getattr(F, name) + + return lambda x: fn(x, **kwargs) + + +def batch_norm(ch, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + initializer=None, + data_format='NCHW'): + + norm_lr = 0. if freeze_norm else 1. + weight_attr = ParamAttr( + initializer=initializer, + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + + if norm_type in ['sync_bn', 'bn']: + norm_layer = nn.BatchNorm2D( + ch, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + norm_params = norm_layer.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + return norm_layer + + +@paddle.jit.not_to_static +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=-1, + return_index=False, + return_rois_num=True, + rois_num=None, + name=None): + """ + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. + In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): Two types of bboxes are supported: + 1. 
(Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Tensor): Two types of scores are supported: + 1. (Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. + name(str): Name of the multiclass nms op. Default: None. + Returns: + A tuple with two Variables: (Out, Index) if return_index is True, + otherwise, a tuple with one Variable(Out) is returned. + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, + x4, y4]. No is the total number of detections. + If all images have not detected results, all elements in LoD will be + 0, and output tensor is empty (None). + Index: Only return when return_index is True. A 2-D LoDTensor with + shape [No, 1] represents the selected index which type is Integer. + The index is the absolute value cross batches. No is the same number + as Out. If the index is used to gather other attribute such as age, + one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where + N is the batch size and M is the number of boxes. + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out, index = ops.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True) + """ + helper = LayerHelper('multiclass_nms3', **locals()) + + if in_dynamic_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', + nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, + 'normalized', normalized) + output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, + rois_num, *attrs) + if not return_index: + index = None + return output, nms_rois_num, index + + else: + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'BBoxes': bboxes, 'Scores': scores} + outputs = {'Out': output, 'Index': index} + + if rois_num is not None: + inputs['RoisNum'] = rois_num + + if return_rois_num: + nms_rois_num = helper.create_variable_for_type_inference( + dtype='int32') + outputs['NmsRoisNum'] = nms_rois_num + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + index.stop_gradient = True + if not return_index: + index = None + if not return_rois_num: + nms_rois_num = None + + return output, nms_rois_num, index + + +@paddle.jit.not_to_static +def matrix_nms(bboxes, + scores, + score_threshold, + post_threshold, + nms_top_k, + keep_top_k, + use_gaussian=False, + gaussian_sigma=2., + background_label=0, + normalized=True, + return_index=False, + return_rois_num=True, + name=None): + """ + **Matrix NMS** + This operator does matrix non maximum suppression (NMS). + First selects a subset of candidate bounding boxes that have higher scores + than score_threshold (if provided), then the top k candidate is selected if + nms_top_k is larger than -1. Score of the remaining candidate are then + decayed according to the Matrix NMS scheme. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + The data type is float32 or float64. + scores (Tensor): A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. The data type is float32 or float64. + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. + post_threshold (float): Threshold to filter out bounding boxes with + low confidence score AFTER decaying. 
+ nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + use_gaussian (bool): Use Gaussian as the decay function. Default: False + gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + return_rois_num(bool): whether return rois_num. Default: True + name(str): Name of the matrix nms op. Default: None. + Returns: + A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + Out (Tensor): A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + Index (Tensor): A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. + rois_num (Tensor): A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: + .. code-block:: python + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[None,81], + dtype='float32', lod_level=1) + out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, + score_threshold=0.5, post_threshold=0.1, + nms_top_k=400, keep_top_k=200, normalized=False) + """ + check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], + 'matrix_nms') + check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], + 'matrix_nms') + check_type(score_threshold, 'score_threshold', float, 'matrix_nms') + check_type(post_threshold, 'post_threshold', float, 'matrix_nms') + check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms') + check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') + check_type(normalized, 'normalized', bool, 'matrix_nms') + check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') + check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') + check_type(background_label, 'background_label', int, 'matrix_nms') + + if in_dynamic_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'post_threshold', post_threshold, 'nms_top_k', + nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', + use_gaussian, 'keep_top_k', keep_top_k, 'normalized', + normalized) + out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return out, rois_num, index + else: + helper = LayerHelper('matrix_nms', **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + outputs = {'Out': output, 'Index': index} + if return_rois_num: + rois_num = helper.create_variable_for_type_inference(dtype='int32') + outputs['RoisNum'] = rois_num + + helper.append_op( + type="matrix_nms", + inputs={'BBoxes': bboxes, + 'Scores': scores}, + attrs={ + 
'background_label': background_label, + 'score_threshold': score_threshold, + 'post_threshold': post_threshold, + 'nms_top_k': nms_top_k, + 'gaussian_sigma': gaussian_sigma, + 'use_gaussian': use_gaussian, + 'keep_top_k': keep_top_k, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return output, rois_num, index + + +def sigmoid_cross_entropy_with_logits(input, + label, + ignore_index=-100, + normalize=False): + output = F.binary_cross_entropy_with_logits(input, label, reduction='none') + mask_tensor = paddle.cast(label != ignore_index, 'float32') + output = paddle.multiply(output, mask_tensor) + if normalize: + sum_valid_mask = paddle.sum(mask_tensor) + output = output / sum_valid_mask + return output + + +def smooth_l1(input, label, inside_weight=None, outside_weight=None, + sigma=None): + input_new = paddle.multiply(input, inside_weight) + label_new = paddle.multiply(label, inside_weight) + delta = 1 / (sigma * sigma) + out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) + out = paddle.multiply(out, outside_weight) + out = out / delta + out = paddle.reshape(out, shape=[out.shape[0], -1]) + out = paddle.sum(out, axis=1) + return out + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + assert num_channels % groups == 0, 'num_channels should be divisible by groups' + channels_per_group = num_channels // groups + x = paddle.reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +def get_static_shape(tensor): + shape = paddle.shape(tensor) + shape.stop_gradient = True + return shape diff --git a/ppdet/modeling/post_process.py b/ppdet/modeling/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..dda2e3166daa613ec446d1f41fb439fe63dfa481 --- /dev/null +++ b/ppdet/modeling/post_process.py @@ -0,0 +1,371 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ppdet.modeling.bbox_utils import nonempty_bbox +from .transformers import bbox_cxcywh_to_xyxy + +__all__ = ['BBoxPostProcess', 'DETRPostProcess'] + + +@register +class BBoxPostProcess(object): + __shared__ = ['num_classes', 'export_onnx', 'export_eb'] + __inject__ = ['decode', 'nms'] + + def __init__(self, + num_classes=80, + decode=None, + nms=None, + export_onnx=False, + export_eb=False): + super(BBoxPostProcess, self).__init__() + self.num_classes = num_classes + self.decode = decode + self.nms = nms + self.export_onnx = export_onnx + self.export_eb = export_eb + + def __call__(self, head_out, rois, im_shape, scale_factor): + """ + Decode the bbox and do NMS if needed. 
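+        If self.nms is None, the decode step is expected to return the final
+        boxes and box counts directly.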
+ + Args: + head_out (tuple): bbox_pred and cls_prob of bbox_head output. + rois (tuple): roi and rois_num of rpn_head output. + im_shape (Tensor): The shape of the input image. + scale_factor (Tensor): The scale factor of the input image. + export_onnx (bool): whether export model to onnx + Returns: + bbox_pred (Tensor): The output prediction with shape [N, 6], including + labels, scores and bboxes. The size of bboxes are corresponding + to the input image, the bboxes may be used in other branch. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [1], and is N. + """ + if self.nms is not None: + bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) + bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes) + + else: + bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, + scale_factor) + + if self.export_onnx: + # add fake box after postprocess when exporting onnx + fake_bboxes = paddle.to_tensor( + np.array( + [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) + + bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) + bbox_num = bbox_num + 1 + + return bbox_pred, bbox_num + + def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): + """ + Rescale, clip and filter the bbox from the output of NMS to + get final prediction. + + Notes: + Currently only support bs = 1. + + Args: + bboxes (Tensor): The output bboxes with shape [N, 6] after decode + and NMS, including labels, scores and bboxes. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [1], and is N. + im_shape (Tensor): The shape of the input image. + scale_factor (Tensor): The scale factor of the input image. + Returns: + pred_result (Tensor): The final prediction results with shape [N, 6] + including labels, scores and bboxes. + """ + if self.export_eb: + # enable rcnn models for edgeboard hw to skip the following postprocess. 
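+            # bboxes is returned twice so callers still receive the usual
+            # (bboxes, pred_result, bbox_num) triple.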
+ return bboxes, bboxes, bbox_num + + if not self.export_onnx: + bboxes_list = [] + bbox_num_list = [] + id_start = 0 + fake_bboxes = paddle.to_tensor( + np.array( + [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) + fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) + + # add fake bbox when output is empty for each batch + for i in range(bbox_num.shape[0]): + if bbox_num[i] == 0: + bboxes_i = fake_bboxes + bbox_num_i = fake_bbox_num + else: + bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] + bbox_num_i = bbox_num[i] + id_start += bbox_num[i] + bboxes_list.append(bboxes_i) + bbox_num_list.append(bbox_num_i) + bboxes = paddle.concat(bboxes_list) + bbox_num = paddle.concat(bbox_num_list) + + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + + if not self.export_onnx: + origin_shape_list = [] + scale_factor_list = [] + # scale_factor: scale_y, scale_x + for i in range(bbox_num.shape[0]): + expand_shape = paddle.expand(origin_shape[i:i + 1, :], + [bbox_num[i], 2]) + scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] + scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) + expand_scale = paddle.expand(scale, [bbox_num[i], 4]) + origin_shape_list.append(expand_shape) + scale_factor_list.append(expand_scale) + + self.origin_shape_list = paddle.concat(origin_shape_list) + scale_factor_list = paddle.concat(scale_factor_list) + + else: + # simplify the computation for bs=1 when exporting onnx + scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] + scale = paddle.concat( + [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) + self.origin_shape_list = paddle.expand(origin_shape, + [bbox_num[0], 2]) + scale_factor_list = paddle.expand(scale, [bbox_num[0], 4]) + + # bboxes: [N, 6], label, score, bbox + pred_label = bboxes[:, 0:1] + pred_score = bboxes[:, 1:2] + pred_bbox = bboxes[:, 2:] + # rescale bbox to original image + scaled_bbox = pred_bbox / scale_factor_list + origin_h = self.origin_shape_list[:, 0] + origin_w = self.origin_shape_list[:, 1] + zeros = paddle.zeros_like(origin_h) + # clip bbox to [0, original_size] + x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) + y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) + x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) + y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) + pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) + # filter empty bbox + keep_mask = nonempty_bbox(pred_bbox, return_mask=True) + keep_mask = paddle.unsqueeze(keep_mask, [1]) + pred_label = paddle.where(keep_mask, pred_label, + paddle.ones_like(pred_label) * -1) + pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) + return bboxes, pred_result, bbox_num + + def get_origin_shape(self, ): + return self.origin_shape_list + + +@register +class DETRPostProcess(object): + __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] + __inject__ = [] + + def __init__(self, + num_classes=80, + num_top_queries=100, + dual_queries=False, + dual_groups=0, + use_focal_loss=False, + with_mask=False, + mask_threshold=0.5, + use_avg_mask_score=False, + bbox_decode_type='origin'): + super(DETRPostProcess, self).__init__() + assert bbox_decode_type in ['origin', 'pad'] + + self.num_classes = num_classes + self.num_top_queries = num_top_queries + self.dual_queries = dual_queries + self.dual_groups = dual_groups + self.use_focal_loss = use_focal_loss + self.with_mask = with_mask + self.mask_threshold = mask_threshold + 
self.use_avg_mask_score = use_avg_mask_score + self.bbox_decode_type = bbox_decode_type + + def _mask_postprocess(self, mask_pred, score_pred, index): + mask_score = F.sigmoid(paddle.gather_nd(mask_pred, index)) + mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) + if self.use_avg_mask_score: + avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( + mask_pred.sum([-2, -1]) + 1e-6) + score_pred *= avg_mask_score + + return mask_pred[0].astype('int32'), score_pred + + def __call__(self, head_out, im_shape, scale_factor, pad_shape): + """ + Decode the bbox and mask. + + Args: + head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. + im_shape (Tensor): The shape of the input image without padding. + scale_factor (Tensor): The scale factor of the input image. + pad_shape (Tensor): The shape of the input image with padding. + Returns: + bbox_pred (Tensor): The output prediction with shape [N, 6], including + labels, scores and bboxes. The size of bboxes are corresponding + to the input image, the bboxes may be used in other branch. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [bs], and is N. + """ + bboxes, logits, masks = head_out + if self.dual_queries: + num_queries = logits.shape[1] + logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ + bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] + + bbox_pred = bbox_cxcywh_to_xyxy(bboxes) + # calculate the original shape of the image + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + img_h, img_w = paddle.split(origin_shape, 2, axis=-1) + if self.bbox_decode_type == 'pad': + # calculate the shape of the image with padding + out_shape = pad_shape / im_shape * origin_shape + out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) + elif self.bbox_decode_type == 'origin': + out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) + else: + raise Exception( + f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') + bbox_pred *= out_shape + + scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( + logits)[:, :, :-1] + + if not self.use_focal_loss: + scores, labels = scores.max(-1), scores.argmax(-1) + if scores.shape[1] > self.num_top_queries: + scores, index = paddle.topk( + scores, self.num_top_queries, axis=-1) + batch_ind = paddle.arange( + end=scores.shape[0]).unsqueeze(-1).tile( + [1, self.num_top_queries]) + index = paddle.stack([batch_ind, index], axis=-1) + labels = paddle.gather_nd(labels, index) + bbox_pred = paddle.gather_nd(bbox_pred, index) + else: + scores, index = paddle.topk( + scores.flatten(1), self.num_top_queries, axis=-1) + labels = index % self.num_classes + index = index // self.num_classes + batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( + [1, self.num_top_queries]) + index = paddle.stack([batch_ind, index], axis=-1) + bbox_pred = paddle.gather_nd(bbox_pred, index) + + mask_pred = None + if self.with_mask: + assert masks is not None + masks = F.interpolate( + masks, scale_factor=4, mode="bilinear", align_corners=False) + # TODO: Support prediction with bs>1. + # remove padding for input image + h, w = im_shape.astype('int32')[0] + masks = masks[..., :h, :w] + # get pred_mask in the original resolution. 
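+            # only the first image is handled here (bs=1, see the TODO above)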
+ img_h = img_h[0].astype('int32') + img_w = img_w[0].astype('int32') + masks = F.interpolate( + masks, + size=(img_h, img_w), + mode="bilinear", + align_corners=False) + mask_pred, scores = self._mask_postprocess(masks, scores, index) + + bbox_pred = paddle.concat( + [ + labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), + bbox_pred + ], + axis=-1) + bbox_num = paddle.to_tensor( + self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) + bbox_pred = bbox_pred.reshape([-1, 6]) + return bbox_pred, bbox_num, mask_pred + + +def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): + final_boxes = [] + for c in range(num_classes): + idxs = bboxs[:, 0] == c + if np.count_nonzero(idxs) == 0: continue + r = nms(bboxs[idxs, 1:], match_threshold, match_metric) + final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) + return final_boxes + + +def nms(dets, match_threshold=0.6, match_metric='iou'): + """ Apply NMS to avoid detecting too many overlapping bounding boxes. + Args: + dets: shape [N, 5], [score, x1, y1, x2, y2] + match_metric: 'iou' or 'ios' + match_threshold: overlap thresh for match metric. + """ + if dets.shape[0] == 0: + return dets[[], :] + scores = dets[:, 0] + x1 = dets[:, 1] + y1 = dets[:, 2] + x2 = dets[:, 3] + y2 = dets[:, 4] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int) + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + if match_metric == 'iou': + union = iarea + areas[j] - inter + match_value = inter / union + elif match_metric == 'ios': + smaller = min(iarea, areas[j]) + match_value = inter / smaller + else: + raise ValueError() + if match_value >= match_threshold: + suppressed[j] = 1 + keep = np.where(suppressed == 0)[0] + dets = dets[keep, :] + return dets diff --git a/ppdet/modeling/proposal_generator/__init__.py b/ppdet/modeling/proposal_generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7517b504d53d498b6b13606d7f742ef65837fb23 --- /dev/null +++ b/ppdet/modeling/proposal_generator/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import anchor_generator + +from .anchor_generator import * diff --git a/ppdet/modeling/proposal_generator/anchor_generator.py b/ppdet/modeling/proposal_generator/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..425b942e5794dc3dd81c639a17d8f22701c1f3ef --- /dev/null +++ b/ppdet/modeling/proposal_generator/anchor_generator.py @@ -0,0 +1,133 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on +# https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py + +import math + +import paddle +import paddle.nn as nn + +from ppdet.core.workspace import register + +__all__ = ['AnchorGenerator'] + + +@register +class AnchorGenerator(nn.Layer): + """ + Generate anchors according to the feature maps + + Args: + anchor_sizes (list[float] | list[list[float]]): The anchor sizes at + each feature point. list[float] means all feature levels share the + same sizes. list[list[float]] means the anchor sizes for + each level. The sizes stand for the scale of input size. + aspect_ratios (list[float] | list[list[float]]): The aspect ratios at + each feature point. list[float] means all feature levels share the + same ratios. list[list[float]] means the aspect ratios for + each level. + strides (list[float]): The strides of feature maps which generate + anchors + offset (float): The offset of the coordinate of anchors, default 0. 
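+        variance (list[float]): The variances attached to each anchor, used
+            when encoding or decoding box deltas, default [1.0, 1.0, 1.0, 1.0].
+
+    Examples:
+        .. code-block:: python
+
+            # illustrative usage only; feature shapes and sizes are arbitrary
+            import paddle
+            from ppdet.modeling.proposal_generator.anchor_generator import AnchorGenerator
+
+            anchor_gen = AnchorGenerator(anchor_sizes=[[32.], [64.]],
+                                         aspect_ratios=[0.5, 1.0, 2.0],
+                                         strides=[8.0, 16.0])
+            feats = [paddle.rand([1, 256, 64, 64]), paddle.rand([1, 256, 32, 32])]
+            # returns one [H * W * num_anchors, 4] anchor tensor per level
+            anchors = anchor_gen(feats)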
+ + """ + + def __init__(self, + anchor_sizes=[32, 64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + strides=[16.0], + variance=[1.0, 1.0, 1.0, 1.0], + offset=0.): + super(AnchorGenerator, self).__init__() + self.anchor_sizes = anchor_sizes + self.aspect_ratios = aspect_ratios + self.strides = strides + self.variance = variance + self.cell_anchors = self._calculate_anchors(len(strides)) + self.offset = offset + + def _broadcast_params(self, params, num_features): + if not isinstance(params[0], (list, tuple)): # list[float] + return [params] * num_features + if len(params) == 1: + return list(params) * num_features + return params + + def generate_cell_anchors(self, sizes, aspect_ratios): + anchors = [] + for size in sizes: + area = size**2.0 + for aspect_ratio in aspect_ratios: + w = math.sqrt(area / aspect_ratio) + h = aspect_ratio * w + x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 + anchors.append([x0, y0, x1, y1]) + return paddle.to_tensor(anchors, dtype='float32') + + def _calculate_anchors(self, num_features): + sizes = self._broadcast_params(self.anchor_sizes, num_features) + aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) + cell_anchors = [ + self.generate_cell_anchors(s, a) + for s, a in zip(sizes, aspect_ratios) + ] + [ + self.register_buffer( + t.name, t, persistable=False) for t in cell_anchors + ] + return cell_anchors + + def _create_grid_offsets(self, size, stride, offset): + grid_height, grid_width = size[0], size[1] + shifts_x = paddle.arange( + offset * stride, grid_width * stride, step=stride, dtype='float32') + shifts_y = paddle.arange( + offset * stride, grid_height * stride, step=stride, dtype='float32') + shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) + shift_x = paddle.reshape(shift_x, [-1]) + shift_y = paddle.reshape(shift_y, [-1]) + return shift_x, shift_y + + def _grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, + self.cell_anchors): + shift_x, shift_y = self._create_grid_offsets(size, stride, + self.offset) + shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) + shifts = paddle.reshape(shifts, [-1, 1, 4]) + base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) + + anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) + + return anchors + + def forward(self, input): + grid_sizes = [paddle.shape(feature_map)[-2:] for feature_map in input] + anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) + return anchors_over_all_feature_maps + + @property + def num_anchors(self): + """ + Returns: + int: number of anchors at every pixel + location, on that feature map. + For example, if at every pixel we use anchors of 3 aspect + ratios and 5 sizes, the number of anchors is 15. + For FPN models, `num_anchors` on every feature map is the same. + """ + return len(self.cell_anchors[0]) diff --git a/ppdet/modeling/shape_spec.py b/ppdet/modeling/shape_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..81601fd64cbaedcee57389cfa71ad8c04e97274c --- /dev/null +++ b/ppdet/modeling/shape_spec.py @@ -0,0 +1,25 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py + +from collections import namedtuple + + +class ShapeSpec( + namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + def __new__(cls, channels=None, height=None, width=None, stride=None): + return super(ShapeSpec, cls).__new__(cls, channels, height, width, + stride) diff --git a/ppdet/modeling/ssod/__init__.py b/ppdet/modeling/ssod/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e7588577e943fcac4bbe1f6ea8e1dd17c4ca8362 --- /dev/null +++ b/ppdet/modeling/ssod/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import utils +from . import losses + +from .utils import * +from .losses import * diff --git a/ppdet/modeling/ssod/losses.py b/ppdet/modeling/ssod/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..e4c5038d4b4c6657f8351ccaa3238d639b53d3f9 --- /dev/null +++ b/ppdet/modeling/ssod/losses.py @@ -0,0 +1,236 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
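+
+# Distillation losses for semi-supervised detection (SSOD), covering the
+# FCOS (SSODFCOSLoss) and PP-YOLOE (SSODPPYOLOELoss) student models.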
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ppdet.modeling.losses.iou_loss import GIoULoss +from .utils import QFLv2 + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'SSODFCOSLoss', + 'SSODPPYOLOELoss', +] + + +@register +class SSODFCOSLoss(nn.Layer): + def __init__(self, loss_weight=1.0): + super(SSODFCOSLoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, student_head_outs, teacher_head_outs, train_cfg): + # for semi-det distill + student_logits, student_deltas, student_quality = student_head_outs + teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs + nc = student_logits[0].shape[1] + + student_logits = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, nc]) + for _ in student_logits + ], + axis=0) + teacher_logits = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, nc]) + for _ in teacher_logits + ], + axis=0) + + student_deltas = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, 4]) + for _ in student_deltas + ], + axis=0) + teacher_deltas = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, 4]) + for _ in teacher_deltas + ], + axis=0) + + student_quality = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, 1]) + for _ in student_quality + ], + axis=0) + teacher_quality = paddle.concat( + [ + _.transpose([0, 2, 3, 1]).reshape([-1, 1]) + for _ in teacher_quality + ], + axis=0) + + ratio = train_cfg.get('ratio', 0.01) + with paddle.no_grad(): + # Region Selection + count_num = int(teacher_logits.shape[0] * ratio) + teacher_probs = F.sigmoid(teacher_logits) + max_vals = paddle.max(teacher_probs, 1) + sorted_vals, sorted_inds = paddle.topk(max_vals, + teacher_logits.shape[0]) + mask = paddle.zeros_like(max_vals) + mask[sorted_inds[:count_num]] = 1. 
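+            # sum of the selected teacher confidences; used below to
+            # normalize the QFLv2 classification distillation loss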
+ fg_num = sorted_vals[:count_num].sum() + b_mask = mask > 0 + + # distill_loss_cls + loss_logits = QFLv2( + F.sigmoid(student_logits), + teacher_probs, + weight=mask, + reduction="sum") / fg_num + + # distill_loss_box + inputs = paddle.concat( + (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), + axis=-1) + targets = paddle.concat( + (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), + axis=-1) + iou_loss = GIoULoss(reduction='mean') + loss_deltas = iou_loss(inputs, targets) + + # distill_loss_quality + loss_quality = F.binary_cross_entropy( + F.sigmoid(student_quality[b_mask]), + F.sigmoid(teacher_quality[b_mask]), + reduction='mean') + + return { + "distill_loss_cls": loss_logits, + "distill_loss_box": loss_deltas, + "distill_loss_quality": loss_quality, + "fg_sum": fg_num, + } + + +@register +class SSODPPYOLOELoss(nn.Layer): + def __init__(self, loss_weight=1.0): + super(SSODPPYOLOELoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, student_head_outs, teacher_head_outs, train_cfg): + # for semi-det distill + # student_probs: already sigmoid + student_probs, student_deltas, student_dfl = student_head_outs + teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs + bs, l, nc = student_probs.shape[:] # bs, l, num_classes + bs, l, _, reg_ch = student_dfl.shape[:] # bs, l, 4, reg_ch + student_probs = student_probs.reshape([-1, nc]) + teacher_probs = teacher_probs.reshape([-1, nc]) + student_deltas = student_deltas.reshape([-1, 4]) + teacher_deltas = teacher_deltas.reshape([-1, 4]) + student_dfl = student_dfl.reshape([-1, 4, reg_ch]) + teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch]) + + ratio = train_cfg.get('ratio', 0.01) + + # for contrast loss + curr_iter = train_cfg['curr_iter'] + st_iter = train_cfg['st_iter'] + if curr_iter == st_iter + 1: + # start semi-det training + self.queue_ptr = 0 + self.queue_size = int(bs * l * ratio) + self.queue_feats = paddle.zeros([self.queue_size, nc]) + self.queue_probs = paddle.zeros([self.queue_size, nc]) + contrast_loss_cfg = train_cfg['contrast_loss'] + temperature = contrast_loss_cfg.get('temperature', 0.2) + alpha = contrast_loss_cfg.get('alpha', 0.9) + smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter + + with paddle.no_grad(): + # Region Selection + count_num = int(teacher_probs.shape[0] * ratio) + max_vals = paddle.max(teacher_probs, 1) + sorted_vals, sorted_inds = paddle.topk(max_vals, + teacher_probs.shape[0]) + mask = paddle.zeros_like(max_vals) + mask[sorted_inds[:count_num]] = 1. + fg_num = sorted_vals[:count_num].sum() + b_mask = mask > 0. 
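+            # b_mask marks the top-ratio predictions used by every
+            # distillation term below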
+ + # for contrast loss + probs = teacher_probs[b_mask].detach() + if curr_iter > smooth_iter: # memory-smoothing + A = paddle.exp( + paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) / + temperature) + A = A / A.sum(1, keepdim=True) + probs = alpha * probs + (1 - alpha) * paddle.mm( + A, self.queue_probs) + n = student_probs[b_mask].shape[0] + # update memory bank + self.queue_feats[self.queue_ptr:self.queue_ptr + + n, :] = teacher_probs[b_mask].detach() + self.queue_probs[self.queue_ptr:self.queue_ptr + + n, :] = teacher_probs[b_mask].detach() + self.queue_ptr = (self.queue_ptr + n) % self.queue_size + + # embedding similarity + sim = paddle.exp( + paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) / 0.2) + sim_probs = sim / sim.sum(1, keepdim=True) + # pseudo-label graph with self-loop + Q = paddle.mm(probs, probs.t()) + Q.fill_diagonal_(1) + pos_mask = (Q >= 0.5).astype('float32') + Q = Q * pos_mask + Q = Q / Q.sum(1, keepdim=True) + # contrastive loss + loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1) + loss_contrast = loss_contrast.mean() + + # distill_loss_cls + loss_cls = QFLv2( + student_probs, teacher_probs, weight=mask, reduction="sum") / fg_num + + # distill_loss_iou + inputs = paddle.concat( + (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), + -1) + targets = paddle.concat( + (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), + -1) + iou_loss = GIoULoss(reduction='mean') + loss_iou = iou_loss(inputs, targets) + + # distill_loss_dfl + loss_dfl = F.cross_entropy( + student_dfl[b_mask].reshape([-1, reg_ch]), + teacher_dfl[b_mask].reshape([-1, reg_ch]), + soft_label=True, + reduction='mean') + + return { + "distill_loss_cls": loss_cls, + "distill_loss_iou": loss_iou, + "distill_loss_dfl": loss_dfl, + "distill_loss_contrast": loss_contrast, + "fg_sum": fg_num, + } diff --git a/ppdet/modeling/ssod/utils.py b/ppdet/modeling/ssod/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..09753abfeddd4a017cb64ec8560ad0da1e585708 --- /dev/null +++ b/ppdet/modeling/ssod/utils.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
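+
+# SSOD helpers: resize weakly/strongly augmented batches to a common shape
+# (align_weak_strong_shape) and compute the QFLv2 distillation loss.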
+ +import paddle +import paddle.nn.functional as F + + +def align_weak_strong_shape(data_weak, data_strong): + max_shape_x = max(data_strong['image'].shape[2], + data_weak['image'].shape[2]) + max_shape_y = max(data_strong['image'].shape[3], + data_weak['image'].shape[3]) + + scale_x_s = max_shape_x / data_strong['image'].shape[2] + scale_y_s = max_shape_y / data_strong['image'].shape[3] + scale_x_w = max_shape_x / data_weak['image'].shape[2] + scale_y_w = max_shape_y / data_weak['image'].shape[3] + target_size = [max_shape_x, max_shape_y] + + if scale_x_s != 1 or scale_y_s != 1: + data_strong['image'] = F.interpolate( + data_strong['image'], + size=target_size, + mode='bilinear', + align_corners=False) + if 'gt_bbox' in data_strong: + gt_bboxes = data_strong['gt_bbox'].numpy() + for i in range(len(gt_bboxes)): + if len(gt_bboxes[i]) > 0: + gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s + gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s + data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes) + + if scale_x_w != 1 or scale_y_w != 1: + data_weak['image'] = F.interpolate( + data_weak['image'], + size=target_size, + mode='bilinear', + align_corners=False) + if 'gt_bbox' in data_weak: + gt_bboxes = data_weak['gt_bbox'].numpy() + for i in range(len(gt_bboxes)): + if len(gt_bboxes[i]) > 0: + gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w + gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w + data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes) + return data_weak, data_strong + + +def QFLv2(pred_sigmoid, + teacher_sigmoid, + weight=None, + beta=2.0, + reduction='mean'): + pt = pred_sigmoid + zerolabel = paddle.zeros_like(pt) + loss = F.binary_cross_entropy( + pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta) + pos = weight > 0 + + pt = teacher_sigmoid[pos] - pred_sigmoid[pos] + loss[pos] = F.binary_cross_entropy( + pred_sigmoid[pos], teacher_sigmoid[pos], + reduction='none') * pt.pow(beta) + + valid = weight >= 0 + if reduction == "mean": + loss = loss[valid].mean() + elif reduction == "sum": + loss = loss[valid].sum() + return loss diff --git a/ppdet/modeling/tests/__init__.py b/ppdet/modeling/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..847ddc47ac89114f2012bc6b9990a69abfe39fb3 --- /dev/null +++ b/ppdet/modeling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
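For reference, a minimal sketch of how the QFLv2 loss added in
ppdet/modeling/ssod/utils.py could be exercised on its own; the tensor
shapes, the random inputs and the import path are illustrative assumptions,
not part of this patch.

    import paddle
    import paddle.nn.functional as F
    from ppdet.modeling.ssod.utils import QFLv2

    # 8 predictions over 80 classes; shapes are illustrative only
    student_logits = paddle.randn([8, 80])
    teacher_logits = paddle.randn([8, 80])
    # per-prediction weight: 1. marks selected foreground rows, 0. the rest
    weight = paddle.to_tensor([1., 1., 0., 0., 0., 0., 0., 0.])
    loss = QFLv2(
        F.sigmoid(student_logits),
        F.sigmoid(teacher_logits),
        weight=weight,
        reduction='sum')

In the SSOD losses above, weight is the 0/1 mask produced by the ratio-based
region selection, and the summed result is further divided by fg_num.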
diff --git a/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg b/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg new file mode 100644 index 0000000000000000000000000000000000000000..19023f718333c56c70776c79201dc03d742c1ed3 Binary files /dev/null and b/ppdet/modeling/tests/imgs/coco2017_val2017_000000000139.jpg differ diff --git a/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg b/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2a17e0c6ee400dcba762c4d56dea03d7e124b9c5 Binary files /dev/null and b/ppdet/modeling/tests/imgs/coco2017_val2017_000000000724.jpg differ diff --git a/ppdet/modeling/tests/test_architectures.py b/ppdet/modeling/tests/test_architectures.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd4c5364d02dc62fb26d64e8cb281b5c52415e4 --- /dev/null +++ b/ppdet/modeling/tests/test_architectures.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import ppdet + + +class TestYOLOv3(unittest.TestCase): + def setUp(self): + self.set_config() + + def set_config(self): + self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml' + + def test_trainer(self): + # Trainer __init__ will build model and DataLoader + # 'train' and 'eval' mode include dataset loading + # use 'test' mode to simplify tests + cfg = ppdet.core.workspace.load_config(self.cfg_file) + trainer = ppdet.engine.Trainer(cfg, mode='test') + + +class TestPPYOLOE(TestYOLOv3): + def set_config(self): + self.cfg_file = 'configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml' + + +class TestYOLOX(TestYOLOv3): + def set_config(self): + self.cfg_file = 'configs/yolox/yolox_s_300e_coco.yml' + + +class TestYOLOv5(TestYOLOv3): + def set_config(self): + self.cfg_file = 'configs/yolov5/yolov5_s_300e_coco.yml' + + +class TestYOLOv6(TestYOLOv3): + def set_config(self): + self.cfg_file = 'configs/yolov6/yolov6_s_400e_coco.yml' + + +class TestYOLOv7(TestYOLOv3): + def set_config(self): + self.cfg_file = 'configs/yolov7/yolov7_l_300e_coco.yml' + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/modeling/tests/test_base.py b/ppdet/modeling/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..451aa78e32ce0682f55a2ab0f9d1ea03e939e481 --- /dev/null +++ b/ppdet/modeling/tests/test_base.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import contextlib + +import paddle +from paddle.static import Program + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self, force_to_use_cpu=False): + # this option for ops that only have cpu kernel + if force_to_use_cpu: + return 'cpu' + else: + return paddle.device.get_device() + + @contextlib.contextmanager + def static_graph(self): + paddle.enable_static() + scope = paddle.static.Scope() + program = Program() + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(program): + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield + + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = paddle.static.Executor(self._get_place(force_to_use_cpu)) + exe.run(paddle.static.default_startup_program()) + return exe.run(paddle.static.default_main_program(), + feed=feed, + fetch_list=fetch_list, + return_numpy=(not with_lod)) + + @contextlib.contextmanager + def dynamic_graph(self, force_to_use_cpu=False): + paddle.disable_static() + place = self._get_place(force_to_use_cpu=force_to_use_cpu) + paddle.device.set_device(place) + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield diff --git a/ppdet/modeling/tests/test_ops.py b/ppdet/modeling/tests/test_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..3614d5b9c232ae9c1ce37b158a502b3349473cb4 --- /dev/null +++ b/ppdet/modeling/tests/test_ops.py @@ -0,0 +1,456 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
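+
+# These tests run each op in both static and dynamic graph modes and
+# compare the outputs.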
+ +from __future__ import print_function +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +import unittest +import numpy as np + +import paddle + +import ppdet.modeling.ops as ops +from ppdet.modeling.tests.test_base import LayerTest + + +def make_rois(h, w, rois_num, output_size): + rois = np.zeros((0, 4)).astype('float32') + for roi_num in rois_num: + roi = np.zeros((roi_num, 4)).astype('float32') + roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num) + roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num) + roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h) + roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w) + rois = np.vstack((rois, roi)) + return rois + + +def softmax(x): + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestROIAlign(LayerTest): + def test_roi_align(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = paddle.vision.ops.roi_align( + x=inputs, + boxes=rois, + boxes_num=rois_num, + output_size=output_size) + output_np, = self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = paddle.to_tensor(inputs_np) + rois_dy = paddle.to_tensor(rois_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + output_dy = paddle.vision.ops.roi_align( + x=inputs_dy, + boxes=rois_dy, + boxes_num=rois_num_dy, + output_size=output_size) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_align_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + paddle.vision.ops.roi_align, + input=inputs, + rois=rois, + output_size=(7, 7)) + + paddle.disable_static() + + +class TestROIPool(LayerTest): + def test_roi_pool(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = paddle.vision.ops.roi_pool( + x=inputs, + boxes=rois, + boxes_num=rois_num, + output_size=output_size) + output_np, = self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=[output], + with_lod=False) + + with 
self.dynamic_graph(): + inputs_dy = paddle.to_tensor(inputs_np) + rois_dy = paddle.to_tensor(rois_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + output_dy = paddle.vision.ops.roi_pool( + x=inputs_dy, + boxes=rois_dy, + boxes_num=rois_num_dy, + output_size=output_size) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_pool_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + paddle.vision.ops.roi_pool, + input=inputs, + rois=rois, + output_size=(7, 7)) + + paddle.disable_static() + + +class TestPriorBox(LayerTest): + def test_prior_box(self): + input_np = np.random.rand(2, 10, 32, 32).astype('float32') + image_np = np.random.rand(2, 10, 40, 40).astype('float32') + min_sizes = [2, 4] + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='float32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='float32') + + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=min_sizes, + clip=True, + flip=True) + box_np, var_np = self.get_static_graph_result( + feed={ + 'input': input_np, + 'image': image_np, + }, + fetch_list=[box, var], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = paddle.to_tensor(input_np) + image_dy = paddle.to_tensor(image_np) + + box_dy, var_dy = ops.prior_box( + input=inputs_dy, + image=image_dy, + min_sizes=min_sizes, + clip=True, + flip=True) + box_dy_np = box_dy.numpy() + var_dy_np = var_dy.numpy() + + self.assertTrue(np.array_equal(box_np, box_dy_np)) + self.assertTrue(np.array_equal(var_np, var_dy_np)) + + def test_prior_box_error(self): + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='int32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='int32') + self.assertRaises( + TypeError, + ops.prior_box, + input=input, + image=image, + min_sizes=[2, 4], + clip=True, + flip=True) + + paddle.disable_static() + + +class TestMulticlassNms(LayerTest): + def test_multiclass_nms(self): + boxes_np = np.random.rand(10, 81, 4).astype('float32') + scores_np = np.random.rand(10, 81).astype('float32') + rois_num_np = np.array([2, 8]).astype('int32') + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', + shape=[None, 81, 4], + dtype='float32', + lod_level=1) + scores = paddle.static.data( + name='scores', shape=[None, 81], dtype='float32', lod_level=1) + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = ops.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num) + out_np, index_np, nms_rois_num_np = self.get_static_graph_result( + feed={ + 'bboxes': boxes_np, + 'scores': scores_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=True) + out_np = np.array(out_np) + index_np = np.array(index_np) + nms_rois_num_np = np.array(nms_rois_num_np) + + with self.dynamic_graph(): + boxes_dy = paddle.to_tensor(boxes_np) + scores_dy = paddle.to_tensor(scores_np) + rois_num_dy = paddle.to_tensor(rois_num_np) + + out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms( + bboxes=boxes_dy, + scores=scores_dy, + 
background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num_dy) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + nms_rois_num_dy_np = nms_rois_num_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np)) + + def test_multiclass_nms_error(self): + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) + scores = paddle.static.data( + name='scores', shape=[81], dtype='float32', lod_level=1) + rois_num = paddle.static.data( + name='rois_num', shape=[40, 41], dtype='int32') + self.assertRaises( + TypeError, + ops.multiclass_nms, + boxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num) + + +class TestMatrixNMS(LayerTest): + def test_matrix_nms(self): + N, M, C = 7, 1200, 21 + BOX_SIZE = 4 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = 0.01 + post_threshold = 0. + + scores_np = np.random.random((N * M, C)).astype('float32') + scores_np = np.apply_along_axis(softmax, 1, scores_np) + scores_np = np.reshape(scores_np, (N, M, C)) + scores_np = np.transpose(scores_np, (0, 2, 1)) + + boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32') + boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5 + boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5 + + with self.static_graph(): + boxes = paddle.static.data( + name='boxes', shape=[N, M, BOX_SIZE], dtype='float32') + scores = paddle.static.data( + name='scores', shape=[N, C, M], dtype='float32') + out, index, _ = ops.matrix_nms( + bboxes=boxes, + scores=scores, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_np, index_np = self.get_static_graph_result( + feed={'boxes': boxes_np, + 'scores': scores_np}, + fetch_list=[out, index], + with_lod=True) + + with self.dynamic_graph(): + boxes_dy = paddle.to_tensor(boxes_np) + scores_dy = paddle.to_tensor(scores_np) + + out_dy, index_dy, _ = ops.matrix_nms( + bboxes=boxes_dy, + scores=scores_dy, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + + def test_matrix_nms_error(self): + with self.static_graph(): + bboxes = paddle.static.data( + name='bboxes', shape=[7, 1200, 4], dtype='float32') + scores = paddle.static.data( + name='data_error', shape=[7, 21, 1200], dtype='int32') + self.assertRaises( + TypeError, + ops.matrix_nms, + bboxes=bboxes, + scores=scores, + score_threshold=0.01, + post_threshold=0., + nms_top_k=400, + keep_top_k=200, + return_index=True) + + paddle.disable_static() + + +class TestBoxCoder(LayerTest): + def test_box_coder(self): + + prior_box_np = np.random.random((81, 4)).astype('float32') + prior_box_var_np = np.random.random((81, 4)).astype('float32') + target_box_np = np.random.random((20, 81, 4)).astype('float32') + + # static + with self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='float32') + prior_box_var = paddle.static.data( + 
name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + boxes = ops.box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False) + + boxes_np, = self.get_static_graph_result( + feed={ + 'prior_box': prior_box_np, + 'prior_box_var': prior_box_var_np, + 'target_box': target_box_np, + }, + fetch_list=[boxes], + with_lod=False) + + # dygraph + with self.dynamic_graph(): + prior_box_dy = paddle.to_tensor(prior_box_np) + prior_box_var_dy = paddle.to_tensor(prior_box_var_np) + target_box_dy = paddle.to_tensor(target_box_np) + + boxes_dy = ops.box_coder( + prior_box=prior_box_dy, + prior_box_var=prior_box_var_dy, + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False) + + boxes_dy_np = boxes_dy.numpy() + + self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) + + def test_box_coder_error(self): + with self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='int32') + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + self.assertRaises(TypeError, ops.box_coder, prior_box, + prior_box_var, target_box) + + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/modeling/tests/test_yolov3_loss.py b/ppdet/modeling/tests/test_yolov3_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c1c403b3690053440a8e88405f4313be08a765 --- /dev/null +++ b/ppdet/modeling/tests/test_yolov3_loss.py @@ -0,0 +1,403 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
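A note on the op tests just added: every case follows the same static/dynamic parity pattern, and the make_rois helper is what keeps the random boxes usable, since each box spans at least output_size in both directions and stays inside the h x w feature map, so roi_align and roi_pool never see degenerate regions. A small illustrative check of that invariant (not part of the patch; assumes ppdet is importable):

```python
import numpy as np

# make_rois is defined in test_ops.py above
from ppdet.modeling.tests.test_ops import make_rois

np.random.seed(0)
h, w, output_size = 20, 20, (7, 7)
rois = make_rois(h, w, rois_num=[4, 6], output_size=output_size)

# each box spans at least output_size along both coordinates ...
assert (rois[:, 2] - rois[:, 0] >= output_size[0]).all()
assert (rois[:, 3] - rois[:, 1] >= output_size[1]).all()
# ... and never leaves the feature map
assert (rois[:, 0] >= 0).all() and (rois[:, 2] < h).all()
assert (rois[:, 1] >= 0).all() and (rois[:, 3] < w).all()
print(rois.shape)  # (10, 4): 4 boxes for the first sample, 6 for the second
```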
+ +from __future__ import division + +import unittest + +import paddle +import paddle.nn.functional as F +# add python path of PadleDetection to sys.path +import os +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.modeling.losses import YOLOv3Loss +from ppdet.data.transform.op_helper import jaccard_overlap +from ppdet.modeling.bbox_utils import iou_similarity +import numpy as np +np.random.seed(0) + + +def _split_output(output, an_num, num_classes): + """ + Split output feature map to x, y, w, h, objectness, classification + along channel dimension + """ + x = paddle.strided_slice( + output, + axes=[1], + starts=[0], + ends=[output.shape[1]], + strides=[5 + num_classes]) + y = paddle.strided_slice( + output, + axes=[1], + starts=[1], + ends=[output.shape[1]], + strides=[5 + num_classes]) + w = paddle.strided_slice( + output, + axes=[1], + starts=[2], + ends=[output.shape[1]], + strides=[5 + num_classes]) + h = paddle.strided_slice( + output, + axes=[1], + starts=[3], + ends=[output.shape[1]], + strides=[5 + num_classes]) + obj = paddle.strided_slice( + output, + axes=[1], + starts=[4], + ends=[output.shape[1]], + strides=[5 + num_classes]) + clss = [] + stride = output.shape[1] // an_num + for m in range(an_num): + clss.append( + paddle.slice( + output, + axes=[1], + starts=[stride * m + 5], + ends=[stride * m + 5 + num_classes])) + cls = paddle.transpose(paddle.stack(clss, axis=1), perm=[0, 1, 3, 4, 2]) + return (x, y, w, h, obj, cls) + + +def _split_target(target): + """ + split target to x, y, w, h, objectness, classification + along dimension 2 + target is in shape [N, an_num, 6 + class_num, H, W] + """ + tx = target[:, :, 0, :, :] + ty = target[:, :, 1, :, :] + tw = target[:, :, 2, :, :] + th = target[:, :, 3, :, :] + tscale = target[:, :, 4, :, :] + tobj = target[:, :, 5, :, :] + tcls = paddle.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) + tcls.stop_gradient = True + return (tx, ty, tw, th, tscale, tobj, tcls) + + +def _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes, + downsample, ignore_thresh, scale_x_y): + # A prediction bbox overlap any gt_bbox over ignore_thresh, + # objectness loss will be ignored, process as follows: + # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here + # NOTE: img_size is set as 1.0 to get noramlized pred bbox + bbox, prob = paddle.vision.ops.yolo_box( + x=output, + img_size=paddle.ones( + shape=[batch_size, 2], dtype="int32"), + anchors=anchors, + class_num=num_classes, + conf_thresh=0., + downsample_ratio=downsample, + clip_bbox=False, + scale_x_y=scale_x_y) + # 2. split pred bbox and gt bbox by sample, calculate IoU between pred bbox + # and gt bbox in each sample + if batch_size > 1: + preds = paddle.split(bbox, batch_size, axis=0) + gts = paddle.split(gt_box, batch_size, axis=0) + else: + preds = [bbox] + gts = [gt_box] + probs = [prob] + ious = [] + for pred, gt in zip(preds, gts): + + def box_xywh2xyxy(box): + x = box[:, 0] + y = box[:, 1] + w = box[:, 2] + h = box[:, 3] + return paddle.stack( + [ + x - w / 2., + y - h / 2., + x + w / 2., + y + h / 2., + ], axis=1) + + pred = paddle.squeeze(pred, axis=[0]) + gt = box_xywh2xyxy(paddle.squeeze(gt, axis=[0])) + ious.append(iou_similarity(pred, gt)) + iou = paddle.stack(ious, axis=0) + # 3. 
Get iou_mask by IoU between gt bbox and prediction bbox, + # Get obj_mask by tobj(holds gt_score), calculate objectness loss + max_iou = paddle.max(iou, axis=-1) + iou_mask = paddle.cast(max_iou <= ignore_thresh, dtype="float32") + output_shape = paddle.shape(output) + an_num = len(anchors) // 2 + iou_mask = paddle.reshape(iou_mask, (-1, an_num, output_shape[2], + output_shape[3])) + iou_mask.stop_gradient = True + # NOTE: tobj holds gt_score, obj_mask holds object existence mask + obj_mask = paddle.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + # For positive objectness grids, objectness loss should be calculated + # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 + obj_sigmoid = F.sigmoid(obj) + loss_obj = F.binary_cross_entropy(obj_sigmoid, obj_mask, reduction='none') + loss_obj_pos = paddle.sum(loss_obj * tobj, axis=[1, 2, 3]) + loss_obj_neg = paddle.sum(loss_obj * (1.0 - obj_mask) * iou_mask, + axis=[1, 2, 3]) + return loss_obj_pos, loss_obj_neg + + +def fine_grained_loss(output, + target, + gt_box, + batch_size, + num_classes, + anchors, + ignore_thresh, + downsample, + scale_x_y=1., + eps=1e-10): + an_num = len(anchors) // 2 + x, y, w, h, obj, cls = _split_output(output, an_num, num_classes) + tx, ty, tw, th, tscale, tobj, tcls = _split_target(target) + + tscale_tobj = tscale * tobj + + scale_x_y = scale_x_y + + if (abs(scale_x_y - 1.0) < eps): + x = F.sigmoid(x) + y = F.sigmoid(y) + loss_x = F.binary_cross_entropy(x, tx, reduction='none') * tscale_tobj + loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) + loss_y = F.binary_cross_entropy(y, ty, reduction='none') * tscale_tobj + loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) + else: + dx = scale_x_y * F.sigmoid(x) - 0.5 * (scale_x_y - 1.0) + dy = scale_x_y * F.sigmoid(y) - 0.5 * (scale_x_y - 1.0) + loss_x = paddle.abs(dx - tx) * tscale_tobj + loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) + loss_y = paddle.abs(dy - ty) * tscale_tobj + loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) + + # NOTE: we refined loss function of (w, h) as L1Loss + loss_w = paddle.abs(w - tw) * tscale_tobj + loss_w = paddle.sum(loss_w, axis=[1, 2, 3]) + loss_h = paddle.abs(h - th) * tscale_tobj + loss_h = paddle.sum(loss_h, axis=[1, 2, 3]) + + loss_obj_pos, loss_obj_neg = _calc_obj_loss( + output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, + ignore_thresh, scale_x_y) + + cls = F.sigmoid(cls) + loss_cls = F.binary_cross_entropy(cls, tcls, reduction='none') + tobj = paddle.unsqueeze(tobj, axis=-1) + + loss_cls = paddle.multiply(loss_cls, tobj) + loss_cls = paddle.sum(loss_cls, axis=[1, 2, 3, 4]) + + loss_xys = paddle.mean(loss_x + loss_y) + loss_whs = paddle.mean(loss_w + loss_h) + loss_objs = paddle.mean(loss_obj_pos + loss_obj_neg) + loss_clss = paddle.mean(loss_cls) + + losses_all = { + "loss_xy": paddle.sum(loss_xys), + "loss_wh": paddle.sum(loss_whs), + "loss_loc": paddle.sum(loss_xys) + paddle.sum(loss_whs), + "loss_obj": paddle.sum(loss_objs), + "loss_cls": paddle.sum(loss_clss), + } + return losses_all, x, y, tx, ty + + +def gt2yolotarget(gt_bbox, gt_class, gt_score, anchors, mask, num_classes, size, + stride): + grid_h, grid_w = size + h, w = grid_h * stride, grid_w * stride + an_hw = np.array(anchors) / np.array([[w, h]]) + target = np.zeros( + (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx, gy, gw, gh = gt_bbox[b, :] + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. 
or score <= 0.: + continue + + # find best match anchor index + best_iou = 0. + best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap([0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + + gi = int(gx * grid_w) + gj = int(gy * grid_h) + + # gt box should be regressed in this layer if the best matched + # anchor index is in the anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + # if target[best_n, 5, gj, gi] > 0: + # print('find 1 duplicate') + target[best_n, 5, gj, gi] = score + + # classification + target[best_n, 6 + cls, gj, gi] = 1. + + return target + + +class TestYolov3LossOp(unittest.TestCase): + def setUp(self): + self.initTestCase() + x = np.random.uniform(0, 1, self.x_shape).astype('float64') + gtbox = np.random.random(size=self.gtbox_shape).astype('float64') + gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) + gtbox = gtbox * gtmask[:, :, np.newaxis] + gtlabel = gtlabel * gtmask + + gtscore = np.ones(self.gtbox_shape[:2]).astype('float64') + if self.gtscore: + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64') + + target = [] + for box, label, score in zip(gtbox, gtlabel, gtscore): + target.append( + gt2yolotarget(box, label, score, self.anchors, self.anchor_mask, + self.class_num, (self.h, self.w + ), self.downsample_ratio)) + + self.target = np.array(target).astype('float64') + + self.mask_anchors = [] + for i in self.anchor_mask: + self.mask_anchors.extend(self.anchors[i]) + self.x = x + self.gtbox = gtbox + self.gtlabel = gtlabel + self.gtscore = gtscore + + def initTestCase(self): + self.b = 8 + self.h = 19 + self.w = 19 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [6, 7, 8] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1.
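For readers of the reference implementation above, gt2yolotarget encodes one matched ground-truth box as: cell offsets for x/y, log ratios to the matched anchor for w/h, a 2 - w*h scale weight, the gt_score as objectness, and a one-hot class entry. A small worked example with arbitrary numbers (illustrative only, not part of the patch; it follows the 19x19, downsample=32 case configured above):

```python
import numpy as np

grid_h = grid_w = 19
stride = 32
w_img, h_img = grid_w * stride, grid_h * stride   # 608 x 608 input
anchor_w, anchor_h = 116, 90                      # anchors[6], first entry of mask [6, 7, 8]

# normalized gt box (cx, cy, w, h)
gx, gy, gw, gh = 0.52, 0.48, 0.30, 0.25
gi, gj = int(gx * grid_w), int(gy * grid_h)       # grid cell (9, 9)

tx = gx * grid_w - gi                             # 0.88, x offset inside the cell
ty = gy * grid_h - gj                             # 0.12, y offset inside the cell
tw = np.log(gw * w_img / anchor_w)                # ~0.45, log ratio to anchor width
th = np.log(gh * h_img / anchor_h)                # ~0.52, log ratio to anchor height
tscale = 2.0 - gw * gh                            # 1.925, smaller boxes get larger weight
print(gi, gj, tx, ty, tw, th, tscale)
```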
+ + def test_loss(self): + x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target + yolo_loss = YOLOv3Loss( + ignore_thresh=self.ignore_thresh, + label_smooth=self.use_label_smooth, + num_classes=self.class_num, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + x = paddle.to_tensor(x.astype(np.float32)) + gtbox = paddle.to_tensor(gtbox.astype(np.float32)) + gtlabel = paddle.to_tensor(gtlabel.astype(np.float32)) + gtscore = paddle.to_tensor(gtscore.astype(np.float32)) + t = paddle.to_tensor(target.astype(np.float32)) + anchor = [self.anchors[i] for i in self.anchor_mask] + (yolo_loss1, px, py, tx, ty) = fine_grained_loss( + output=x, + target=t, + gt_box=gtbox, + batch_size=self.b, + num_classes=self.class_num, + anchors=self.mask_anchors, + ignore_thresh=self.ignore_thresh, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + yolo_loss2 = yolo_loss.yolov3_loss( + x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y) + for k in yolo_loss2: + self.assertAlmostEqual( + float(yolo_loss1[k]), float(yolo_loss2[k]), delta=1e-2, msg=k) + + +class TestYolov3LossNoGTScore(TestYolov3LossOp): + def initTestCase(self): + self.b = 1 + self.h = 76 + self.w = 76 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [0, 1, 2] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 8 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = False + self.use_label_smooth = False + self.scale_x_y = 1. + + +class TestYolov3LossWithScaleXY(TestYolov3LossOp): + def initTestCase(self): + self.b = 5 + self.h = 38 + self.w = 38 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [3, 4, 5] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 16 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1.2 + + +if __name__ == "__main__": + unittest.main() diff --git a/ppdet/modeling/transformers/__init__.py b/ppdet/modeling/transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..33a124026563c64dd75e345a809206812ddce749 --- /dev/null +++ b/ppdet/modeling/transformers/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import detr_transformer +from . import utils +from . import matchers +from . import position_encoding +from . import deformable_transformer +from . import dino_transformer +from . import group_detr_transformer +from . import mask_dino_transformer +from . import rtdetr_transformer +from . 
import hybrid_encoder + +from .detr_transformer import * +from .utils import * +from .matchers import * +from .position_encoding import * +from .deformable_transformer import * +from .dino_transformer import * +from .petr_transformer import * +from .group_detr_transformer import * +from .mask_dino_transformer import * +from .rtdetr_transformer import * +from .hybrid_encoder import * diff --git a/ppdet/modeling/transformers/deformable_transformer.py b/ppdet/modeling/transformers/deformable_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ab05704f4e8aa730bac1ad638e15dbc3e33abec5 --- /dev/null +++ b/ppdet/modeling/transformers/deformable_transformer.py @@ -0,0 +1,537 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention +from .position_encoding import PositionEmbedding +from .utils import _get_clones, get_valid_ratio +from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ + +__all__ = ['DeformableTransformer'] + + +class MSDeformableAttention(nn.Layer): + def __init__(self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + try: + # use cuda op + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + from .utils import deformable_attention_core_func as ms_deformable_attn + self.ms_deformable_attn_core = ms_deformable_attn + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange( + self.num_heads, + dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 
1, 2]).tile( + [1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange( + 1, self.num_points + 1, + dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + output = self.output_proj(output) + + return output + + +class DeformableTransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + bias_attr=None): + super(DeformableTransformerEncoderLayer, self).__init__() + # self attention + self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, lr_mult) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, + src, + reference_points, + spatial_shapes, + level_start_index, + src_mask=None, + query_pos_embed=None): + # self attention + src2 = self.self_attn( + self.with_pos_embed(src, query_pos_embed), reference_points, src, + spatial_shapes, level_start_index, src_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers): + super(DeformableTransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): + valid_ratios = valid_ratios.unsqueeze(1) + reference_points = [] + for i, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) + ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * + H) + ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * + W) + reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) + reference_points = paddle.concat(reference_points, 1).unsqueeze(2) + reference_points = reference_points * valid_ratios + return reference_points + + def forward(self, + feat, + spatial_shapes, + level_start_index, + feat_mask=None, + query_pos_embed=None, + valid_ratios=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [feat.shape[0], spatial_shapes.shape[0], 2]) + reference_points = self.get_reference_points(spatial_shapes, + valid_ratios) + for layer in self.layers: + feat = layer(feat, reference_points, spatial_shapes, + level_start_index, feat_mask, query_pos_embed) + + return feat + + +class DeformableTransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + 
bias_attr=None): + super(DeformableTransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, lr_mult) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt2 = self.self_attn(q, k, value=tgt) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, + memory_spatial_shapes, memory_level_start_index, memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +class DeformableTransformerDecoder(nn.Layer): + def __init__(self, decoder_layer, num_layers, return_intermediate=False): + super(DeformableTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None): + output = tgt + intermediate = [] + for lid, layer in enumerate(self.layers): + output = layer(output, reference_points, memory, + memory_spatial_shapes, memory_level_start_index, + memory_mask, query_pos_embed) + + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +@register +class DeformableTransformer(nn.Layer): + __shared__ = ['hidden_dim'] + + def __init__(self, + num_queries=300, + position_embed_type='sine', + return_intermediate_dec=True, + in_feats_channel=[512, 1024, 2048], + num_feature_levels=4, + num_encoder_points=4, + num_decoder_points=4, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + lr_mult=0.1, + pe_temperature=10000, + pe_offset=-0.5): + super(DeformableTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
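As a side note on MSDeformableAttention above: with 2-d reference points, the learned offsets are interpreted in pixels of each feature level and divided by that level's (W, H) before being added to the normalized reference point, which is what the offset_normalizer branch implements. A tiny numpy sketch of that arithmetic (illustrative only, not part of the patch):

```python
import numpy as np

spatial_shapes = np.array([[25, 34]])        # one level, rows are (H, W)
offset_normalizer = spatial_shapes[:, ::-1]  # flip to (W, H) = (34, 25)

reference_point = np.array([0.5, 0.5])       # normalized (x, y) in [0, 1]
sampling_offset = np.array([1.0, -2.0])      # learned offset, in pixels of this level

sampling_location = reference_point + sampling_offset / offset_normalizer[0]
print(sampling_location)                     # ~[0.5294, 0.42], still normalized
```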
+ assert len(in_feats_channel) <= num_feature_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_feature_levels = num_feature_levels + + encoder_layer = DeformableTransformerEncoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + num_feature_levels, num_encoder_points, lr_mult) + self.encoder = DeformableTransformerEncoder(encoder_layer, + num_encoder_layers) + + decoder_layer = DeformableTransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + num_feature_levels, num_decoder_points) + self.decoder = DeformableTransformerDecoder( + decoder_layer, num_decoder_layers, return_intermediate_dec) + + self.level_embed = nn.Embedding(num_feature_levels, hidden_dim) + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) + + self.reference_points = nn.Linear( + hidden_dim, + 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.input_proj = nn.LayerList() + for in_channels in in_feats_channel: + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim))) + in_channels = in_feats_channel[-1] + for _ in range(num_feature_levels - len(in_feats_channel)): + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channels, + hidden_dim, + kernel_size=3, + stride=2, + padding=1), + nn.GroupNorm(32, hidden_dim))) + in_channels = hidden_dim + + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset, + eps=1e-4) + + self._reset_parameters() + + def _reset_parameters(self): + normal_(self.level_embed.weight) + normal_(self.tgt_embed.weight) + normal_(self.query_pos_embed.weight) + xavier_uniform_(self.reference_points.weight) + constant_(self.reference_points.bias) + for l in self.input_proj: + xavier_uniform_(l[0].weight) + constant_(l[0].bias) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_feats_channel': [i.channels for i in input_shape], } + + def forward(self, src_feats, src_mask=None, *args, **kwargs): + srcs = [] + for i in range(len(src_feats)): + srcs.append(self.input_proj[i](src_feats[i])) + if self.num_feature_levels > len(srcs): + len_srcs = len(srcs) + for i in range(len_srcs, self.num_feature_levels): + if i == len_srcs: + srcs.append(self.input_proj[i](src_feats[-1])) + else: + srcs.append(self.input_proj[i](srcs[-1])) + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for level, src in enumerate(srcs): + src_shape = paddle.shape(src) + bs = src_shape[0:1] + h = src_shape[2:3] + w = src_shape[3:4] + spatial_shapes.append(paddle.concat([h, w])) + src = src.flatten(2).transpose([0, 2, 1]) + src_flatten.append(src) + if src_mask is not None: + mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[level] + lvl_pos_embed_flatten.append(lvl_pos_embed) + mask = mask.flatten(1) + mask_flatten.append(mask) + src_flatten = paddle.concat(src_flatten, 1) + mask_flatten = None if src_mask is None else paddle.concat(mask_flatten, + 1) + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [l, 2] + 
spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes).astype('int64')) + # [l], start index of each level + level_start_index = paddle.concat([ + paddle.zeros( + [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] + ]) + # [b, l, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + + # encoder + memory = self.encoder(src_flatten, spatial_shapes, level_start_index, + mask_flatten, lvl_pos_embed_flatten, valid_ratios) + + # prepare input for decoder + bs, _, c = memory.shape + query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + reference_points = F.sigmoid(self.reference_points(query_embed)) + reference_points_input = reference_points.unsqueeze( + 2) * valid_ratios.unsqueeze(1) + + # decoder + hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes, + level_start_index, mask_flatten, query_embed) + + return (hs, memory, reference_points) diff --git a/ppdet/modeling/transformers/detr_transformer.py b/ppdet/modeling/transformers/detr_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..efeb320862752bf2ba8cf51381fd736ea2ee5506 --- /dev/null +++ b/ppdet/modeling/transformers/detr_transformer.py @@ -0,0 +1,359 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention, _convert_attention_mask +from .position_encoding import PositionEmbedding +from .utils import _get_clones +from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ + +__all__ = ['DETRTransformer'] + + +class TransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None): + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = 
nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + q = self.with_pos_embed(tgt, query_pos_embed) + k = self.with_pos_embed(memory, pos_embed) + tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt + + +class TransformerDecoder(nn.Layer): + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + output = tgt + intermediate = [] + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + pos_embed=pos_embed, + query_pos_embed=query_pos_embed) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +@register +class DETRTransformer(nn.Layer): + __shared__ = ['hidden_dim'] + + def __init__(self, + num_queries=100, + position_embed_type='sine', + return_intermediate_dec=True, + backbone_num_channels=2048, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + pe_temperature=10000, + pe_offset=0., + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(DETRTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'],\ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
+ self.hidden_dim = hidden_dim + self.nhead = nhead + + encoder_layer = TransformerEncoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, + encoder_norm) + + decoder_layer = TransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + decoder_norm = nn.LayerNorm(hidden_dim) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec) + + self.input_proj = nn.Conv2D( + backbone_num_channels, hidden_dim, kernel_size=1) + self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + conv_init_(self.input_proj) + normal_(self.query_pos_embed.weight) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'backbone_num_channels': [i.channels for i in input_shape][-1], + } + + def _convert_attention_mask(self, mask): + return (mask - 1.0) * 1e9 + + def forward(self, src, src_mask=None, *args, **kwargs): + r""" + Applies a Transformer model on the inputs. + + Parameters: + src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]]. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + [bs, H, W]`. When the data type is bool, the unwanted positions + have `False` values and the others have `True` values. When the + data type is int, the unwanted positions have 0 values and the + others have 1 values. When the data type is float, the unwanted + positions have `-INF` values and the others have 0 values. It + can be None when nothing wanted or needed to be prevented + attention to. Default None. 
+ + Returns: + output (Tensor): [num_levels, batch_size, num_queries, hidden_dim] + memory (Tensor): [batch_size, hidden_dim, h, w] + """ + # use last level feature map + src_proj = self.input_proj(src[-1]) + bs, c, h, w = paddle.shape(src_proj) + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = src_proj.flatten(2).transpose([0, 2, 1]) + if src_mask is not None: + src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] + else: + src_mask = paddle.ones([bs, h, w]) + pos_embed = self.position_embedding(src_mask).flatten(1, 2) + + if self.training: + src_mask = self._convert_attention_mask(src_mask) + src_mask = src_mask.reshape([bs, 1, 1, h * w]) + else: + src_mask = None + + memory = self.encoder( + src_flatten, src_mask=src_mask, pos_embed=pos_embed) + + query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile( + [bs, 1, 1]) + tgt = paddle.zeros_like(query_pos_embed) + output = self.decoder( + tgt, + memory, + memory_mask=src_mask, + pos_embed=pos_embed, + query_pos_embed=query_pos_embed) + + if self.training: + src_mask = src_mask.reshape([bs, 1, 1, h, w]) + else: + src_mask = None + + return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]), + src_proj, src_mask) diff --git a/ppdet/modeling/transformers/dino_transformer.py b/ppdet/modeling/transformers/dino_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..89073e821d3e012dc8b61ef8cd6f367b870a44bd --- /dev/null +++ b/ppdet/modeling/transformers/dino_transformer.py @@ -0,0 +1,528 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
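One detail of DETRTransformer.forward above worth spelling out: during training the 0/1 validity mask is turned into an additive bias via (mask - 1.0) * 1e9, so padded positions receive a large negative logit and effectively zero attention weight after softmax. A small numpy sketch of that convention (illustrative only, not part of the patch):

```python
import numpy as np

valid = np.array([1., 1., 0., 0.])       # 1 = real position, 0 = padding
bias = (valid - 1.0) * 1e9               # [0, 0, -1e9, -1e9], same rule as _convert_attention_mask

scores = np.array([0.3, 0.1, 0.7, 0.5])  # raw attention logits for one query
logits = scores + bias
weights = np.exp(logits - logits.max())
weights /= weights.sum()
print(weights)                           # padded positions get ~0 attention weight
```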
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention +from .position_encoding import PositionEmbedding +from ..heads.detr_head import MLP +from .deformable_transformer import (MSDeformableAttention, + DeformableTransformerEncoderLayer, + DeformableTransformerEncoder) +from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, + bias_init_with_prob) +from .utils import (_get_clones, get_valid_ratio, + get_contrastive_denoising_training_group, + get_sine_pos_embed, inverse_sigmoid) + +__all__ = ['DINOTransformer'] + + +class DINOTransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation="relu", + n_levels=4, + n_points=4, + lr_mult=1.0, + weight_attr=None, + bias_attr=None): + super(DINOTransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, lr_mult) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + if attn_mask is not None: + attn_mask = paddle.where( + attn_mask.astype('bool'), + paddle.zeros(attn_mask.shape, tgt.dtype), + paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, + memory_spatial_shapes, memory_level_start_index, memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt2 = self.forward_ffn(tgt) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + + return tgt + + +class DINOTransformerDecoder(nn.Layer): + def __init__(self, + hidden_dim, + decoder_layer, + num_layers, + weight_attr=None, + bias_attr=None): + super(DINOTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.norm = nn.LayerNorm( + hidden_dim, 
weight_attr=weight_attr, bias_attr=bias_attr) + + def forward(self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + query_pos_head, + valid_ratios=None, + attn_mask=None, + memory_mask=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [memory.shape[0], memory_spatial_shapes.shape[0], 2]) + + output = tgt + intermediate = [] + inter_bboxes = [] + ref_points = F.sigmoid(ref_points_unact) + for i, layer in enumerate(self.layers): + reference_points_input = ref_points.detach().unsqueeze( + 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) + query_pos_embed = get_sine_pos_embed( + reference_points_input[..., 0, :], self.hidden_dim // 2) + query_pos_embed = query_pos_head(query_pos_embed) + + output = layer(output, reference_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + ref_points.detach())) + + intermediate.append(self.norm(output)) + inter_bboxes.append(ref_points) + + return paddle.stack(intermediate), paddle.stack(inter_bboxes) + + +@register +class DINOTransformer(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=900, + position_embed_type='sine', + in_feats_channel=[512, 1024, 2048], + num_levels=4, + num_encoder_points=4, + num_decoder_points=4, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + lr_mult=1.0, + pe_temperature=10000, + pe_offset=-0.5, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=True, + eps=1e-2): + super(DINOTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
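+        # `num_levels` may exceed the number of backbone feature maps; the extra
+        # pyramid levels are generated from the last feature map with stride-2
+        # convolutions in `_build_input_proj_layer`, which is why the check below
+        # only requires len(in_feats_channel) <= num_levels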
+ assert len(in_feats_channel) <= num_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_decoder_layers = num_decoder_layers + + weight_attr = ParamAttr(regularizer=L2Decay(0.0)) + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + # backbone feature projection + self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) + + # Transformer module + encoder_layer = DeformableTransformerEncoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, + num_encoder_points, lr_mult, weight_attr, bias_attr) + self.encoder = DeformableTransformerEncoder(encoder_layer, + num_encoder_layers) + decoder_layer = DINOTransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, + num_decoder_points, lr_mult, weight_attr, bias_attr) + self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, + num_decoder_layers, weight_attr, + bias_attr) + + # denoising part + self.denoising_class_embed = nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # position embedding + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset) + self.level_embed = nn.Embedding(num_levels, hidden_dim) + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(2 * hidden_dim, + hidden_dim, + hidden_dim, + num_layers=2) + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm( + hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + # decoder head + self.dec_score_head = nn.LayerList([ + nn.Linear(hidden_dim, num_classes) + for _ in range(num_decoder_layers) + ]) + self.dec_bbox_head = nn.LayerList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_decoder_layers) + ]) + + self._reset_parameters() + + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) + linear_init_(self.enc_score_head) + constant_(self.enc_score_head.bias, bias_cls) + constant_(self.enc_bbox_head.layers[-1].weight) + constant_(self.enc_bbox_head.layers[-1].bias) + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + linear_init_(cls_) + constant_(cls_.bias, bias_cls) + constant_(reg_.layers[-1].weight) + constant_(reg_.layers[-1].bias) + + linear_init_(self.enc_output[0]) + xavier_uniform_(self.enc_output[0].weight) + normal_(self.level_embed.weight) + if self.learnt_init_query: + xavier_uniform_(self.tgt_embed.weight) + xavier_uniform_(self.query_pos_head.layers[0].weight) + xavier_uniform_(self.query_pos_head.layers[1].weight) + for l in self.input_proj: + xavier_uniform_(l[0].weight) + constant_(l[0].bias) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_feats_channel': [i.channels for i in input_shape], } + + def _build_input_proj_layer(self, + in_feats_channel, + weight_attr=None, + bias_attr=None): + self.input_proj = nn.LayerList() + 
for in_channels in in_feats_channel: + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, self.hidden_dim, kernel_size=1)), ( + 'norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=weight_attr, + bias_attr=bias_attr)))) + in_channels = in_feats_channel[-1] + for _ in range(self.num_levels - len(in_feats_channel)): + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + self.hidden_dim, + kernel_size=3, + stride=2, + padding=1)), ('norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=weight_attr, + bias_attr=bias_attr)))) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats, pad_mask=None): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for i, feat in enumerate(proj_feats): + bs, _, h, w = paddle.shape(feat) + spatial_shapes.append(paddle.concat([h, w])) + # [b,c,h,w] -> [b,h*w,c] + feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) + if pad_mask is not None: + mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + # [b, h*w, c] + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[i] + lvl_pos_embed_flatten.append(lvl_pos_embed) + if pad_mask is not None: + # [b, h*w] + mask_flatten.append(mask.flatten(1)) + + # [b, l, c] + feat_flatten = paddle.concat(feat_flatten, 1) + # [b, l] + mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, + 1) + # [b, l, c] + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [num_levels, 2] + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes).astype('int64')) + # [l] start index of each level + level_start_index = paddle.concat([ + paddle.zeros( + [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] + ]) + # [b, num_levels, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, valid_ratios) + + def forward(self, feats, pad_mask=None, gt_meta=None): + # input projection and embedding + (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, + valid_ratios) = self._get_encoder_input(feats, pad_mask) + + # encoder + memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, + mask_flatten, lvl_pos_embed_flatten, valid_ratios) + + # prepare denoising training + if self.training: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(gt_meta, + self.num_classes, + self.num_queries, + self.denoising_class_embed.weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale) + else: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input( + memory, spatial_shapes, mask_flatten, denoising_class, + denoising_bbox_unact) + + # decoder + inter_feats, inter_bboxes = self.decoder( + target, init_ref_points_unact, 
memory, spatial_shapes, + level_start_index, self.dec_bbox_head, self.query_pos_head, + valid_ratios, attn_mask, mask_flatten) + out_bboxes = [] + out_logits = [] + for i in range(self.num_decoder_layers): + out_logits.append(self.dec_score_head[i](inter_feats[i])) + if i == 0: + out_bboxes.append( + F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + + init_ref_points_unact)) + else: + out_bboxes.append( + F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + + inverse_sigmoid(inter_bboxes[i - 1]))) + out_bboxes = paddle.stack(out_bboxes) + out_logits = paddle.stack(out_logits) + + return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, + dn_meta) + + def _get_encoder_output_anchors(self, + memory, + spatial_shapes, + memory_mask=None, + grid_size=0.05): + output_anchors = [] + idx = 0 + for lvl, (h, w) in enumerate(spatial_shapes): + if memory_mask is not None: + mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) + valid_H = paddle.sum(mask_[:, :, 0], 1) + valid_W = paddle.sum(mask_[:, 0, :], 1) + else: + valid_H, valid_W = h, w + + grid_y, grid_x = paddle.meshgrid( + paddle.arange(end=h), paddle.arange(end=w)) + grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) + + valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( + [-1, 1, 1, 2]).astype(grid_xy.dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) + output_anchors.append( + paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) + idx += h * w + + output_anchors = paddle.concat(output_anchors, 1) + valid_mask = ((output_anchors > self.eps) * + (output_anchors < 1 - self.eps)).all(-1, keepdim=True) + output_anchors = paddle.log(output_anchors / (1 - output_anchors)) + if memory_mask is not None: + valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 + output_anchors = paddle.where(valid_mask, output_anchors, + paddle.to_tensor(float("inf"))) + + memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) + output_memory = self.enc_output(memory) + return output_memory, output_anchors + + def _get_decoder_input(self, + memory, + spatial_shapes, + memory_mask=None, + denoising_class=None, + denoising_bbox_unact=None): + bs, _, _ = memory.shape + # prepare input for decoder + output_memory, output_anchors = self._get_encoder_output_anchors( + memory, spatial_shapes, memory_mask) + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_unact = self.enc_bbox_head( + output_memory) + output_anchors + + _, topk_ind = paddle.topk( + enc_outputs_class.max(-1), self.num_queries, axis=1) + # extract region proposal boxes + batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, + topk_ind) # unsigmoided. 
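+        # the gathered coordinates stay in logit ("unactivated") space: they become
+        # the decoder's initial reference points, while their sigmoid below is
+        # returned as the encoder's top-k proposal boxes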
+
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = paddle.concat(
+                [denoising_bbox_unact, reference_points_unact], 1)
+        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
+
+        # extract region features
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = paddle.gather_nd(output_memory, topk_ind).detach()
+        if denoising_class is not None:
+            target = paddle.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact.detach(
+        ), enc_topk_bboxes, enc_topk_logits
diff --git a/ppdet/modeling/transformers/ext_op/README.md b/ppdet/modeling/transformers/ext_op/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..290926d56a3ae23ccd1b36861d047c6e6bf13187
--- /dev/null
+++ b/ppdet/modeling/transformers/ext_op/README.md
@@ -0,0 +1,85 @@
+# Compiling the multi-scale deformable attention custom OP
+This custom OP is implemented following the Paddle guide on [custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).
+
+## 1. Requirements
+- Paddle >= 2.3.2
+- gcc 8.2
+
+## 2. Installation
+Build and install the op from this directory:
+```
+cd PaddleDetection/ppdet/modeling/transformers/ext_op/
+python setup_ms_deformable_attn_op.py install
+```
+
+Once the build finishes, the op can be used directly. A usage example of `ms_deformable_attn`:
+```
+import paddle
+
+# import the custom op
+from deformable_detr_ops import ms_deformable_attn
+
+# build fake input tensors
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+    return [value, sampling_locations, attention_weights]
+
+value, sampling_locations, attention_weights = get_test_tensors(c)
+
+output = ms_deformable_attn(value,
+                            spatial_shapes,
+                            level_start_index,
+                            sampling_locations,
+                            attention_weights)
+```
+
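+Before building, it may help to confirm that the local Paddle install matches the requirements above; the op only ships a GPU kernel, so Paddle must be compiled with CUDA support. A minimal sanity check could look like:
+```
+import paddle
+
+# the custom op is CUDA-only
+assert paddle.device.is_compiled_with_cuda()
+# requirement from section 1: Paddle >= 2.3.2
+print(paddle.__version__)
+```
+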
+## 3. Unit tests
+The correctness of the custom operator can be verified by running the unit test:
+```
+python test_ms_deformable_attn_op.py
+```
+On success, the output looks like:
+```
+*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
+*tensor1 True check_gradient_numerical(D=30)
+*tensor2 True check_gradient_numerical(D=30)
+*tensor3 True check_gradient_numerical(D=30)
+*tensor1 True check_gradient_numerical(D=32)
+*tensor2 True check_gradient_numerical(D=32)
+*tensor3 True check_gradient_numerical(D=32)
+*tensor1 True check_gradient_numerical(D=64)
+*tensor2 True check_gradient_numerical(D=64)
+*tensor3 True check_gradient_numerical(D=64)
+*tensor1 True check_gradient_numerical(D=71)
+*tensor2 True check_gradient_numerical(D=71)
+*tensor3 True check_gradient_numerical(D=71)
+*tensor1 True check_gradient_numerical(D=128)
+*tensor2 True check_gradient_numerical(D=128)
+*tensor3 True check_gradient_numerical(D=128)
+*tensor1 True check_gradient_numerical(D=1024)
+*tensor2 True check_gradient_numerical(D=1024)
+*tensor3 True check_gradient_numerical(D=1024)
+*tensor1 True check_gradient_numerical(D=1025)
+*tensor2 True check_gradient_numerical(D=1025)
+*tensor3 True check_gradient_numerical(D=1025)
+*tensor1 True check_gradient_numerical(D=2048)
+*tensor2 True check_gradient_numerical(D=2048)
+*tensor3 True check_gradient_numerical(D=2048)
+*tensor1 True check_gradient_numerical(D=3096)
+*tensor2 True check_gradient_numerical(D=3096)
+*tensor3 True check_gradient_numerical(D=3096)
+```
diff --git a/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc b/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1758adbcd995189085ed1661be889cb7cf7a25c
--- /dev/null
+++ b/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/extension.h"
+
+#include <vector>
+
+// declare GPU implementation
+std::vector<paddle::Tensor>
+MSDeformableAttnCUDAForward(const paddle::Tensor &value,
+                            const paddle::Tensor &value_spatial_shapes,
+                            const paddle::Tensor &value_level_start_index,
+                            const paddle::Tensor &sampling_locations,
+                            const paddle::Tensor &attention_weights);
+
+std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
+    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
+    const paddle::Tensor &value_level_start_index,
+    const paddle::Tensor &sampling_locations,
+    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
+
+//// CPU not implemented
+
+std::vector<std::vector<int64_t>>
+MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
+                           std::vector<int64_t> value_spatial_shapes_shape,
+                           std::vector<int64_t> value_level_start_index_shape,
+                           std::vector<int64_t> sampling_locations_shape,
+                           std::vector<int64_t> attention_weights_shape) {
+  return {{value_shape[0], sampling_locations_shape[1],
+           value_shape[2] * value_shape[3]}};
+}
+
+std::vector<paddle::DataType>
+MSDeformableAttnInferDtype(paddle::DataType value_dtype,
+                           paddle::DataType value_spatial_shapes_dtype,
+                           paddle::DataType value_level_start_index_dtype,
+                           paddle::DataType sampling_locations_dtype,
+                           paddle::DataType attention_weights_dtype) {
+  return {value_dtype};
+}
+
+PD_BUILD_OP(ms_deformable_attn)
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
+             "AttentionWeights"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
+
+PD_BUILD_GRAD_OP(ms_deformable_attn)
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
+             "AttentionWeights", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
+              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
+              paddle::Grad("AttentionWeights")})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
diff --git a/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu b/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d5a8d16181bb53b9e5e5b3167adb283fba4db763
--- /dev/null
+++ b/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
@@ -0,0 +1,1073 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/extension.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + +// forward bilinear +template +__device__ data_t deformable_attn_bilinear_forward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +// forward kernel +template +__global__ void deformable_attn_cuda_kernel_forward( + const int n, const data_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const data_t *data_sampling_loc, + const data_t *data_attn_weight, const int batch_size, + const int value_length, const int num_heads, const int channels, + const int num_levels, const int query_length, const int num_points, + data_t *output_data_ptr) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + data_t *data_ptr = output_data_ptr + index; + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + data_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset + + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = 
data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += deformable_attn_bilinear_forward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_ptr = col; + } +} + +#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +// forward +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto output = paddle::full({batch_size, query_length, num_heads * channels}, + 0, value.dtype(), paddle::GPUPlace()); + + const int num_kernels = batch_size * query_length * num_heads * channels; + deformable_attn_cuda_kernel_forward + <<>>(num_kernels, value.data(), + value_spatial_shapes.data(), + value_level_start_index.data(), + sampling_locations.data(), + attention_weights.data(), batch_size, + value_length, num_heads, channels, num_levels, + query_length, num_points, output.data()); + return {output}; +} + +// backward bilinear +template +__device__ void deformable_attn_bilinear_backward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * 
top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void deformable_attn_bilinear_backward_gm( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +// backward kernels +// channels > 1024 +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t 
*data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; 
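+        // sampling locations are stored as interleaved (x, y) pairs, so the
+        // location pointer advances by 2 per point while the weight pointer
+        // advances by 1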
+ data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_gm( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// channels <= 1024 +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = 
_temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= 
query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % 
query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const 
data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// backward branch +template +void deformable_attn_cuda_backward( + cudaStream_t stream, const data_t *grad_out, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * query_length * num_heads * channels; + const int num_actual_kernels = + batch_size * query_length * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_gm + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 2: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 4: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 8: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 16: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 32: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 64: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 128: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, 
data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 256: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 512: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 1024: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + deformable_attn_cuda_kernel_backward_shm_reduce_v1 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_shm_reduce_v2 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } +} + +// backward +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + CHECK_INPUT_GPU(grad_out); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto grad_value = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_spatial_shapes = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_level_start_index = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_sampling_locations = + paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(), + paddle::GPUPlace()); + auto grad_attention_weights = + paddle::full(attention_weights.shape(), 0, attention_weights.dtype(), + paddle::GPUPlace()); + + deformable_attn_cuda_backward( + value.stream(), grad_out.data(), value.data(), + value_spatial_shapes.data(), + 
value_level_start_index.data(), sampling_locations.data(), + attention_weights.data(), batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, grad_value.data(), + grad_sampling_locations.data(), + grad_attention_weights.data()); + + return {grad_value, grad_spatial_shapes, grad_level_start_index, + grad_sampling_locations, grad_attention_weights}; +} diff --git a/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py b/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3c386677e5d5eb5ccc91315e958cb9efc21c3e --- /dev/null +++ b/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py @@ -0,0 +1,7 @@ +from paddle.utils.cpp_extension import CUDAExtension, setup + +if __name__ == "__main__": + setup( + name='deformable_detr_ops', + ext_modules=CUDAExtension( + sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])) diff --git a/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py b/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..94a05737cbcd6deef55b10f73d39dbd46beeebf7 --- /dev/null +++ b/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
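The test below assumes the custom op has already been compiled and installed; the `setup_ms_deformable_attn_op.py` script above builds it as the `deformable_detr_ops` package. A minimal, hedged usage sketch (assumes a CUDA build of PaddlePaddle and a GPU device; shapes mirror the constants used in this test):

# Build first, from this directory:
#   python setup_ms_deformable_attn_op.py install
import paddle
from deformable_detr_ops import ms_deformable_attn

paddle.set_device('gpu')
bs, n_heads, c, query_length, n_levels, n_points = 2, 8, 8, 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat([paddle.to_tensor([0], dtype=paddle.int64),
                                   spatial_shapes.prod(1).cumsum(0)[:-1]])
value_length = int(spatial_shapes.prod(1).sum())  # 6*4 + 3*2 = 30
value = paddle.rand([bs, value_length, n_heads, c])
sampling_locations = paddle.rand([bs, query_length, n_heads, n_levels, n_points, 2])
attention_weights = paddle.rand([bs, query_length, n_heads, n_levels, n_points])
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deformable_attn(value, spatial_shapes, level_start_index,
                         sampling_locations, attention_weights)
print(out.shape)  # expected: [bs, query_length, n_heads * c]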
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import os +import sys +import random +import numpy as np +import paddle +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.modeling.transformers.utils import deformable_attention_core_func +ms_deform_attn_core_paddle = deformable_attention_core_func + +try: + gpu_index = int(sys.argv[1]) +except: + gpu_index = 0 +print(f'Use gpu {gpu_index} to test...') +paddle.set_device(f'gpu:{gpu_index}') + +try: + from deformable_detr_ops import ms_deformable_attn +except Exception as e: + print('import deformable_detr_ops error', e) + sys.exit(-1) + +paddle.seed(1) +random.seed(1) +np.random.seed(1) + +bs, n_heads, c = 2, 8, 8 +query_length, n_levels, n_points = 2, 2, 2 +spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) +level_start_index = paddle.concat((paddle.to_tensor( + [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +value_length = sum([(H * W).item() for H, W in spatial_shapes]) + + +def get_test_tensors(channels): + value = paddle.rand( + [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points, 2], + dtype=paddle.float32) + attention_weights = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points], + dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum( + -2, keepdim=True) + + return [value, sampling_locations, attention_weights] + + +@paddle.no_grad() +def check_forward_equal_with_paddle_float(): + value, sampling_locations, attention_weights = get_test_tensors(c) + + output_paddle = ms_deform_attn_core_paddle( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights).detach().cpu() + output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, + sampling_locations, + attention_weights).detach().cpu() + fwdok = paddle.allclose( + output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() + max_abs_err = (output_cuda - output_paddle).abs().max().item() + max_rel_err = ( + (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() + + print( + f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' + ) + + +def check_gradient_numerical(channels=4): + value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( + channels) + value_paddle.stop_gradient = False + sampling_locations_paddle.stop_gradient = False + attention_weights_paddle.stop_gradient = False + + value_cuda = value_paddle.detach().clone() + sampling_locations_cuda = sampling_locations_paddle.detach().clone() + attention_weights_cuda = attention_weights_paddle.detach().clone() + value_cuda.stop_gradient = False + sampling_locations_cuda.stop_gradient = False + attention_weights_cuda.stop_gradient = False + + output_paddle = ms_deform_attn_core_paddle( + value_paddle, spatial_shapes, level_start_index, + sampling_locations_paddle, attention_weights_paddle) + output_paddle.sum().backward() + + output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, + level_start_index, sampling_locations_cuda, + attention_weights_cuda) + output_cuda.sum().backward() + + res = paddle.allclose( + value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() + 
print(f'*tensor1 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + sampling_locations_paddle.grad, + sampling_locations_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor2 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + attention_weights_paddle.grad, + attention_weights_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor3 {res} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_paddle_float() + + for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: + check_gradient_numerical(channels) diff --git a/ppdet/modeling/transformers/group_detr_transformer.py b/ppdet/modeling/transformers/group_detr_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..31ec6172eb4b3fd607098e0828b5ab44fee3dae6 --- /dev/null +++ b/ppdet/modeling/transformers/group_detr_transformer.py @@ -0,0 +1,857 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
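A note on the grouped ("dual") queries used in this file: during training, GroupDINOTransformer keeps `dual_groups + 1` independent sets of object queries, and the decoder layer restricts self-attention to queries within the same group by folding the groups onto the batch axis before attention and unfolding afterwards. A minimal standalone sketch of that reshaping (toy shapes; the real layer additionally tiles a per-group attention mask across heads), shown here so the split/concat pair in `DINOTransformerDecoderLayer.forward` below is easier to follow:

import paddle

bs, num_queries, d_model, dual_groups = 2, 12, 8, 2
q = paddle.rand([bs, num_queries, d_model])

# [bs, num_queries, d] -> [(dual_groups + 1) * bs, num_queries // (dual_groups + 1), d]
q_grouped = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0)
# ... self-attention would run on q_grouped here, one group per batch slice ...
# fold the groups back onto the query axis
q_merged = paddle.concat(q_grouped.split(dual_groups + 1, axis=0), axis=1)
print(paddle.allclose(q, q_merged).item())  # True: the round trip is lossless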
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention +from .position_encoding import PositionEmbedding +from ..heads.detr_head import MLP +from .deformable_transformer import MSDeformableAttention +from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, + bias_init_with_prob) +from .utils import (_get_clones, get_valid_ratio, + get_contrastive_denoising_training_group, + get_sine_pos_embed, inverse_sigmoid) + +__all__ = ['GroupDINOTransformer'] + + +class DINOTransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation="relu", + n_levels=4, + n_points=4, + weight_attr=None, + bias_attr=None): + super(DINOTransformerEncoderLayer, self).__init__() + # self attention + self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, 1.0) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, + bias_attr) + self.activation = getattr(F, activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, + bias_attr) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, + src, + reference_points, + spatial_shapes, + level_start_index, + src_mask=None, + query_pos_embed=None): + # self attention + src2 = self.self_attn( + self.with_pos_embed(src, query_pos_embed), reference_points, src, + spatial_shapes, level_start_index, src_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + # ffn + src = self.forward_ffn(src) + + return src + + +class DINOTransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers): + super(DINOTransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): + valid_ratios = valid_ratios.unsqueeze(1) + reference_points = [] + for i, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) + ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * + H) + ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * + W) + reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) + reference_points = paddle.concat(reference_points, 1).unsqueeze(2) + reference_points = reference_points * valid_ratios + return reference_points + + def forward(self, + feat, + spatial_shapes, + 
level_start_index, + feat_mask=None, + query_pos_embed=None, + valid_ratios=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [feat.shape[0], spatial_shapes.shape[0], 2]) + reference_points = self.get_reference_points(spatial_shapes, + valid_ratios) + for layer in self.layers: + feat = layer(feat, reference_points, spatial_shapes, + level_start_index, feat_mask, query_pos_embed) + + return feat + + +class DINOTransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation="relu", + n_levels=4, + n_points=4, + dual_queries=False, + dual_groups=0, + weight_attr=None, + bias_attr=None): + super(DINOTransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, 1.0) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, + bias_attr) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, + bias_attr) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # for dual groups + self.dual_queries = dual_queries + self.dual_groups = dual_groups + self.n_head = n_head + + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + if self.dual_queries: + dual_groups = self.dual_groups + bs, num_queries, n_model = paddle.shape(q) + q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0) + k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0) + tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0) + + g_num_queries = num_queries // (dual_groups + 1) + if attn_mask is None or attn_mask[0] is None: + attn_mask = None + else: + # [(dual_groups + 1), g_num_queries, g_num_queries] + attn_mask = paddle.concat( + [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0) + # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries] + # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries] + # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries] + attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile( + [bs, 1, self.n_head, 1, 1]) + attn_mask = attn_mask.reshape([ + bs * (dual_groups + 1), self.n_head, g_num_queries, + g_num_queries + ]) + + if attn_mask is not None: + attn_mask = attn_mask.astype('bool') + + tgt2 = self.self_attn(q, k, value=tgt, 
attn_mask=attn_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm2(tgt) + + # trace back + if self.dual_queries: + tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, + memory_spatial_shapes, memory_level_start_index, memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt2 = self.forward_ffn(tgt) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + + return tgt + + +class DINOTransformerDecoder(nn.Layer): + def __init__(self, + hidden_dim, + decoder_layer, + num_layers, + return_intermediate=True): + super(DINOTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + self.norm = nn.LayerNorm( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + query_pos_head, + valid_ratios=None, + attn_mask=None, + memory_mask=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [memory.shape[0], memory_spatial_shapes.shape[0], 2]) + + output = tgt + intermediate = [] + inter_ref_bboxes = [] + for i, layer in enumerate(self.layers): + reference_points_input = reference_points.unsqueeze( + 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) + query_pos_embed = get_sine_pos_embed( + reference_points_input[..., 0, :], self.hidden_dim // 2) + query_pos_embed = query_pos_head(query_pos_embed) + + output = layer(output, reference_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + reference_points)) + + if self.return_intermediate: + intermediate.append(self.norm(output)) + inter_ref_bboxes.append(inter_ref_bbox) + + reference_points = inter_ref_bbox.detach() + + if self.return_intermediate: + return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes) + + return output, reference_points + + +@register +class GroupDINOTransformer(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=900, + position_embed_type='sine', + return_intermediate_dec=True, + backbone_feat_channels=[512, 1024, 2048], + num_levels=4, + num_encoder_points=4, + num_decoder_points=4, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + pe_temperature=10000, + pe_offset=-0.5, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=True, + use_input_proj=True, + dual_queries=False, + dual_groups=0, + eps=1e-2): + super(GroupDINOTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
+ assert len(backbone_feat_channels) <= num_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_decoder_layers = num_decoder_layers + self.use_input_proj = use_input_proj + + if use_input_proj: + # backbone feature projection + self._build_input_proj_layer(backbone_feat_channels) + + # Transformer module + encoder_layer = DINOTransformerEncoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, + num_encoder_points) + self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers) + decoder_layer = DINOTransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_decoder_points, + dual_queries=dual_queries, + dual_groups=dual_groups) + self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, + num_decoder_layers, + return_intermediate_dec) + + # denoising part + self.denoising_class_embed = nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # for dual group + self.dual_queries = dual_queries + self.dual_groups = dual_groups + if self.dual_queries: + self.denoising_class_embed_groups = nn.LayerList([ + nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + for _ in range(self.dual_groups) + ]) + + # position embedding + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset) + self.level_embed = nn.Embedding(num_levels, hidden_dim) + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + normal_(self.tgt_embed.weight) + if self.dual_queries: + self.tgt_embed_dual = nn.LayerList([ + nn.Embedding(num_queries, hidden_dim) + for _ in range(self.dual_groups) + ]) + for dual_tgt_module in self.tgt_embed_dual: + normal_(dual_tgt_module.weight) + self.query_pos_head = MLP(2 * hidden_dim, + hidden_dim, + hidden_dim, + num_layers=2) + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) + if self.dual_queries: + self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1) + else: + self.enc_output = _get_clones(self.enc_output, 1) + + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + if self.dual_queries: + self.enc_bbox_head_dq = nn.LayerList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for i in range(self.dual_groups) + ]) + self.enc_score_head_dq = nn.LayerList([ + nn.Linear(hidden_dim, num_classes) + for i in range(self.dual_groups) + ]) + + # decoder head + self.dec_score_head = nn.LayerList([ + nn.Linear(hidden_dim, num_classes) + for _ in range(num_decoder_layers) + ]) + self.dec_bbox_head = nn.LayerList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_decoder_layers) + ]) + + self._reset_parameters() + + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) + 
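# Note on the helper above: bias_init_with_prob(p) implements the focal-loss prior
# initialisation, bias = -log((1 - p) / p); with p = 0.01 the classification bias
# starts near -4.6, so initial sigmoid scores for every class are close to 0.01.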
linear_init_(self.enc_score_head) + constant_(self.enc_score_head.bias, bias_cls) + constant_(self.enc_bbox_head.layers[-1].weight) + constant_(self.enc_bbox_head.layers[-1].bias) + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + linear_init_(cls_) + constant_(cls_.bias, bias_cls) + constant_(reg_.layers[-1].weight) + constant_(reg_.layers[-1].bias) + + for enc_output in self.enc_output: + linear_init_(enc_output[0]) + xavier_uniform_(enc_output[0].weight) + normal_(self.level_embed.weight) + if self.learnt_init_query: + xavier_uniform_(self.tgt_embed.weight) + xavier_uniform_(self.query_pos_head.layers[0].weight) + xavier_uniform_(self.query_pos_head.layers[1].weight) + normal_(self.denoising_class_embed.weight) + if self.use_input_proj: + for l in self.input_proj: + xavier_uniform_(l[0].weight) + constant_(l[0].bias) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'backbone_feat_channels': [i.channels for i in input_shape], } + + def _build_input_proj_layer(self, backbone_feat_channels): + self.input_proj = nn.LayerList() + for in_channels in backbone_feat_channels: + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, self.hidden_dim, kernel_size=1)), + ('norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) + in_channels = backbone_feat_channels[-1] + for _ in range(self.num_levels - len(backbone_feat_channels)): + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + self.hidden_dim, + kernel_size=3, + stride=2, + padding=1)), ('norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats, pad_mask=None): + if self.use_input_proj: + # get projection features + proj_feats = [ + self.input_proj[i](feat) for i, feat in enumerate(feats) + ] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + else: + proj_feats = feats + # get encoder inputs + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for i, feat in enumerate(proj_feats): + bs, _, h, w = paddle.shape(feat) + spatial_shapes.append(paddle.concat([h, w])) + # [b,c,h,w] -> [b,h*w,c] + feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) + if pad_mask is not None: + mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + # [b, h*w, c] + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape( + [1, 1, -1]) + lvl_pos_embed_flatten.append(lvl_pos_embed) + if pad_mask is not None: + # [b, h*w] + mask_flatten.append(mask.flatten(1)) + + # [b, l, c] + feat_flatten = paddle.concat(feat_flatten, 1) + # [b, l] + mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, + 1) + # [b, l, c] + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [num_levels, 2] + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes).astype('int64')) + # [l] start index of each level + level_start_index = paddle.concat([ + paddle.zeros( + [1], dtype='int64'), 
spatial_shapes.prod(1).cumsum(0)[:-1] + ]) + # [b, num_levels, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, valid_ratios) + + def forward(self, feats, pad_mask=None, gt_meta=None): + # input projection and embedding + (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, + valid_ratios) = self._get_encoder_input(feats, pad_mask) + + # encoder + memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, + mask_flatten, lvl_pos_embed_flatten, valid_ratios) + + # prepare denoising training + if self.training: + denoising_class, denoising_bbox, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(gt_meta, + self.num_classes, + self.num_queries, + self.denoising_class_embed.weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale) + if self.dual_queries: + denoising_class_groups = [] + denoising_bbox_groups = [] + attn_mask_groups = [] + dn_meta_groups = [] + for g_id in range(self.dual_groups): + denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \ + get_contrastive_denoising_training_group(gt_meta, + self.num_classes, + self.num_queries, + self.denoising_class_embed_groups[g_id].weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale) + denoising_class_groups.append(denoising_class_gid) + denoising_bbox_groups.append(denoising_bbox_gid) + attn_mask_groups.append(attn_mask_gid) + dn_meta_groups.append(dn_meta_gid) + + # combine + denoising_class = [denoising_class] + denoising_class_groups + denoising_bbox = [denoising_bbox] + denoising_bbox_groups + attn_mask = [attn_mask] + attn_mask_groups + dn_meta = [dn_meta] + dn_meta_groups + else: + denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input( + memory, spatial_shapes, mask_flatten, denoising_class, + denoising_bbox) + + # decoder + inter_feats, inter_ref_bboxes = self.decoder( + target, init_ref_points, memory, spatial_shapes, level_start_index, + self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, + mask_flatten) + # solve hang during distributed training + inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0. 
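# The zero-scaled addition above does not change inter_feats numerically; it only keeps
# denoising_class_embed inside the autograd graph so the parameter receives a (zero)
# gradient on every rank, which prevents distributed data-parallel training from hanging
# on an unused parameter. The same trick is applied to the per-group embeddings below.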
+ if self.dual_queries: + for g_id in range(self.dual_groups): + inter_feats[0] += self.denoising_class_embed_groups[ + g_id].weight[0, 0] * 0.0 + + out_bboxes = [] + out_logits = [] + for i in range(self.num_decoder_layers): + out_logits.append(self.dec_score_head[i](inter_feats[i])) + if i == 0: + out_bboxes.append( + F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + + inverse_sigmoid(init_ref_points))) + else: + out_bboxes.append( + F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + + inverse_sigmoid(inter_ref_bboxes[i - 1]))) + + out_bboxes = paddle.stack(out_bboxes) + out_logits = paddle.stack(out_logits) + return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, + dn_meta) + + def _get_encoder_output_anchors(self, + memory, + spatial_shapes, + memory_mask=None, + grid_size=0.05): + output_anchors = [] + idx = 0 + for lvl, (h, w) in enumerate(spatial_shapes): + if memory_mask is not None: + mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) + valid_H = paddle.sum(mask_[:, :, 0], 1) + valid_W = paddle.sum(mask_[:, 0, :], 1) + else: + valid_H, valid_W = h, w + + grid_y, grid_x = paddle.meshgrid( + paddle.arange( + end=h, dtype=memory.dtype), + paddle.arange( + end=w, dtype=memory.dtype)) + grid_xy = paddle.stack([grid_x, grid_y], -1) + + valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( + [-1, 1, 1, 2]).astype(grid_xy.dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) + output_anchors.append( + paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) + idx += h * w + + output_anchors = paddle.concat(output_anchors, 1) + valid_mask = ((output_anchors > self.eps) * + (output_anchors < 1 - self.eps)).all(-1, keepdim=True) + output_anchors = paddle.log(output_anchors / (1 - output_anchors)) + if memory_mask is not None: + valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 + output_anchors = paddle.where(valid_mask, output_anchors, + paddle.to_tensor(float("inf"))) + + memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) + if self.dual_queries: + output_memory = [ + self.enc_output[g_id](memory) + for g_id in range(self.dual_groups + 1) + ] + else: + output_memory = self.enc_output[0](memory) + return output_memory, output_anchors + + def _get_decoder_input(self, + memory, + spatial_shapes, + memory_mask=None, + denoising_class=None, + denoising_bbox=None): + bs, _, _ = memory.shape + # prepare input for decoder + output_memory, output_anchors = self._get_encoder_output_anchors( + memory, spatial_shapes, memory_mask) + if self.dual_queries: + enc_outputs_class = self.enc_score_head(output_memory[0]) + enc_outputs_coord_unact = self.enc_bbox_head(output_memory[ + 0]) + output_anchors + else: + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_unact = self.enc_bbox_head( + output_memory) + output_anchors + + _, topk_ind = paddle.topk( + enc_outputs_class.max(-1), self.num_queries, axis=1) + # extract region proposal boxes + batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact, + topk_ind) # unsigmoided. 
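# Shape walkthrough for the top-k proposal gather above:
#   enc_outputs_class:  [bs, L, num_classes] --max(-1)--> [bs, L]
#   topk_ind:           [bs, num_queries], flattened spatial indices in [0, L)
#   stacked with batch_ind -> [bs, num_queries, 2] = (batch index, location index)
#   paddle.gather_nd(enc_outputs_coord_unact, topk_ind) -> [bs, num_queries, 4],
#   i.e. one unsigmoided box proposal per selected encoder location.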
+ enc_topk_bboxes = F.sigmoid(topk_coords_unact) + reference_points = enc_topk_bboxes.detach() + enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) + + if self.dual_queries: + enc_topk_logits_groups = [] + enc_topk_bboxes_groups = [] + reference_points_groups = [] + topk_ind_groups = [] + for g_id in range(self.dual_groups): + enc_outputs_class_gid = self.enc_score_head_dq[g_id]( + output_memory[g_id + 1]) + enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id]( + output_memory[g_id + 1]) + output_anchors + _, topk_ind_gid = paddle.topk( + enc_outputs_class_gid.max(-1), self.num_queries, axis=1) + # extract region proposal boxes + batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) + topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1) + topk_coords_unact_gid = paddle.gather_nd( + enc_outputs_coord_unact_gid, topk_ind_gid) # unsigmoided. + enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid) + reference_points_gid = enc_topk_bboxes_gid.detach() + enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid, + topk_ind_gid) + + # append and combine + topk_ind_groups.append(topk_ind_gid) + enc_topk_logits_groups.append(enc_topk_logits_gid) + enc_topk_bboxes_groups.append(enc_topk_bboxes_gid) + reference_points_groups.append(reference_points_gid) + + enc_topk_bboxes = paddle.concat( + [enc_topk_bboxes] + enc_topk_bboxes_groups, 1) + enc_topk_logits = paddle.concat( + [enc_topk_logits] + enc_topk_logits_groups, 1) + reference_points = paddle.concat( + [reference_points] + reference_points_groups, 1) + topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1) + + # extract region features + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + if self.dual_queries: + target = paddle.concat([target] + [ + self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile( + [bs, 1, 1]) for g_id in range(self.dual_groups) + ], 1) + else: + if self.dual_queries: + target = paddle.gather_nd(output_memory[0], topk_ind) + target_groups = [] + for g_id in range(self.dual_groups): + target_gid = paddle.gather_nd(output_memory[g_id + 1], + topk_ind_groups[g_id]) + target_groups.append(target_gid) + target = paddle.concat([target] + target_groups, 1).detach() + else: + target = paddle.gather_nd(output_memory, topk_ind).detach() + + if denoising_bbox is not None: + if isinstance(denoising_bbox, list) and isinstance( + denoising_class, list) and self.dual_queries: + if denoising_bbox[0] is not None: + reference_points_list = paddle.split( + reference_points, self.dual_groups + 1, axis=1) + reference_points = paddle.concat( + [ + paddle.concat( + [ref, ref_], axis=1) + for ref, ref_ in zip(denoising_bbox, + reference_points_list) + ], + axis=1) + + target_list = paddle.split( + target, self.dual_groups + 1, axis=1) + target = paddle.concat( + [ + paddle.concat( + [tgt, tgt_], axis=1) + for tgt, tgt_ in zip(denoising_class, target_list) + ], + axis=1) + else: + reference_points, target = reference_points, target + else: + reference_points = paddle.concat( + [denoising_bbox, reference_points], 1) + target = paddle.concat([denoising_class, target], 1) + + return target, reference_points, enc_topk_bboxes, enc_topk_logits diff --git a/ppdet/modeling/transformers/hybrid_encoder.py b/ppdet/modeling/transformers/hybrid_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b64c4ee3ba64996c5bf83c40762d6742500a0a50 --- /dev/null +++ 
b/ppdet/modeling/transformers/hybrid_encoder.py @@ -0,0 +1,301 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling.ops import get_act_fn +from ..shape_spec import ShapeSpec +from ..backbones.csp_darknet import BaseConv +from ..backbones.cspresnet import RepVggBlock +from ppdet.modeling.transformers.detr_transformer import TransformerEncoder +from ..initializer import xavier_uniform_, linear_init_ +from ..layers import MultiHeadAttention +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +__all__ = ['HybridEncoder'] + + +class CSPRepLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + num_blocks=3, + expansion=1.0, + bias=False, + act="silu"): + super(CSPRepLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(*[ + RepVggBlock( + hidden_channels, hidden_channels, act=act) + for _ in range(num_blocks) + ]) + if hidden_channels != out_channels: + self.conv3 = BaseConv( + hidden_channels, + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + else: + self.conv3 = nn.Identity() + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + return self.conv3(x_1 + x_2) + + +@register +class TransformerLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=1024, + dropout=0., + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, 
attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +@register +@serializable +class HybridEncoder(nn.Layer): + __shared__ = ['depth_mult', 'act', 'trt', 'eval_size'] + __inject__ = ['encoder_layer'] + + def __init__(self, + in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + hidden_dim=256, + use_encoder_idx=[2], + num_encoder_layers=1, + encoder_layer='TransformerLayer', + pe_temperature=10000, + expansion=1.0, + depth_mult=1.0, + act='silu', + trt=False, + eval_size=None): + super(HybridEncoder, self).__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_size = eval_size + + # channel projection + self.input_proj = nn.LayerList() + for in_channel in in_channels: + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channel, hidden_dim, kernel_size=1, bias_attr=False), + nn.BatchNorm2D( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))))) + # encoder transformer + self.encoder = nn.LayerList([ + TransformerEncoder(encoder_layer, num_encoder_layers) + for _ in range(len(use_encoder_idx)) + ]) + + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, dict)) else act + # top-down fpn + self.lateral_convs = nn.LayerList() + self.fpn_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + BaseConv( + hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_blocks.append( + CSPRepLayer( + hidden_dim * 2, + hidden_dim, + round(3 * depth_mult), + act=act, + expansion=expansion)) + + # bottom-up pan + self.downsample_convs = nn.LayerList() + self.pan_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsample_convs.append( + BaseConv( + hidden_dim, hidden_dim, 3, stride=2, act=act)) + self.pan_blocks.append( + CSPRepLayer( + hidden_dim * 2, + hidden_dim, + round(3 * depth_mult), + act=act, + expansion=expansion)) + + self._reset_parameters() + + def _reset_parameters(self): + if self.eval_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_size[1] // stride, self.eval_size[0] // stride, + self.hidden_dim, self.pe_temperature) + setattr(self, f'pos_embed{idx}', pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, + h, + embed_dim=256, + temperature=10000.): + grid_w = paddle.arange(int(w), dtype=paddle.float32) + grid_h = paddle.arange(int(h), dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, \ + 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + + out_w = grid_w.flatten()[..., None] @omega[None] + out_h = grid_h.flatten()[..., None] @omega[None] + + return paddle.concat( + [ + paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), + paddle.cos(out_h) + ], + axis=1)[None, :, :] + + def forward(self, feats, for_mot=False): + assert len(feats) == len(self.in_channels) + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).transpose( + [0, 2, 1]) + if self.training or self.eval_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature) + else: + pos_embed = getattr(self, f'pos_embed{enc_ind}', None) + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( + [-1, self.hidden_dim, h, w]) + + # top-down fpn + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = F.interpolate( + feat_heigh, scale_factor=2., mode="nearest") + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat( + [upsample_feat, feat_low], axis=1)) + inner_outs.insert(0, inner_out) + + # bottom-up pan + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](paddle.concat( + [downsample_feat, feat_height], axis=1)) + outs.append(out) + + return outs + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'feat_strides': [i.stride for i in input_shape] + } + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.hidden_dim, stride=self.feat_strides[idx]) + for idx in range(len(self.in_channels)) + ] diff --git a/ppdet/modeling/transformers/mask_dino_transformer.py b/ppdet/modeling/transformers/mask_dino_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6b292238596823135e7c9027b605babf2281d2ec --- /dev/null +++ b/ppdet/modeling/transformers/mask_dino_transformer.py @@ -0,0 +1,536 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
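For reference, the `build_2d_sincos_position_embedding` staticmethod defined in the hybrid encoder above encodes each axis of a flattened H×W grid with frequencies omega_i = 1 / temperature**(i / pos_dim), where pos_dim = embed_dim // 4. A quick shape check, assuming PaddleDetection (with the file added by this patch) is importable:

import paddle
from ppdet.modeling.transformers.hybrid_encoder import HybridEncoder

w, h, embed_dim = 20, 16, 256
pos = HybridEncoder.build_2d_sincos_position_embedding(w, h, embed_dim, 10000.)
print(pos.shape)  # [1, 320, 256]: one embedding per position of the flattened 16x20 feature map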
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register +from .position_encoding import PositionEmbedding +from ..heads.detr_head import MLP +from .deformable_transformer import (DeformableTransformerEncoderLayer, + DeformableTransformerEncoder) +from .dino_transformer import (DINOTransformerDecoderLayer) +from ..initializer import (linear_init_, constant_, xavier_uniform_, + bias_init_with_prob) +from .utils import (_get_clones, get_valid_ratio, get_denoising_training_group, + get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate) + +__all__ = ['MaskDINO'] + + +class ConvGNBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_groups=32, + bias=False, + act=None): + super(ConvGNBlock, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=bias) + self.norm = nn.GroupNorm( + num_groups, + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = getattr(F, act) if act is not None else None + + self._init_weights() + + def _init_weights(self): + xavier_uniform_(self.conv.weight) + + def forward(self, x): + x = self.norm(self.conv(x)) + if self.act is not None: + x = self.act(x) + return x + + +class MaskDINOTransformerDecoder(nn.Layer): + def __init__(self, hidden_dim, decoder_layer, num_layers): + super(MaskDINOTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + + def forward(self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + query_pos_head, + dec_norm, + valid_ratios=None, + attn_mask=None, + memory_mask=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [memory.shape[0], memory_spatial_shapes.shape[0], 2]) + + output = tgt + intermediate = [] + inter_bboxes = [] + ref_points = F.sigmoid(ref_points_unact) + for i, layer in enumerate(self.layers): + reference_points_input = ref_points.detach().unsqueeze( + 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) + query_pos_embed = get_sine_pos_embed( + reference_points_input[..., 0, :], self.hidden_dim // 2) + query_pos_embed = query_pos_head(query_pos_embed) + + output = layer(output, reference_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + ref_points = F.sigmoid( + bbox_head(output) + inverse_sigmoid(ref_points.detach())) + + intermediate.append(dec_norm(output)) + inter_bboxes.append(ref_points) + + return paddle.stack(intermediate), paddle.stack(inter_bboxes) + + +@register +class MaskDINO(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + position_embed_type='sine', + in_feats_channel=[256, 512, 1024, 2048], + num_levels=3, + num_encoder_points=4, + num_decoder_points=4, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=9, + enc_dim_feedforward=1024, + dec_dim_feedforward=2048, + dropout=0., + activation="relu", + lr_mult=1.0, + pe_temperature=10000, + pe_offset=-0.5, + num_denoising=100, + 
label_noise_ratio=0.4, + box_noise_scale=0.4, + learnt_init_query=False, + mask_enhanced=True, + eps=1e-2): + super(MaskDINO, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' + feat0_dim = in_feats_channel.pop(0) + assert len(in_feats_channel) <= num_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_decoder_layers = num_decoder_layers + self.mask_enhanced = mask_enhanced + + weight_attr = ParamAttr(regularizer=L2Decay(0.0)) + bias_attr = ParamAttr(regularizer=L2Decay(0.0)) + # backbone feature projection + self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) + + # Transformer module + encoder_layer = DeformableTransformerEncoderLayer( + hidden_dim, nhead, enc_dim_feedforward, dropout, activation, + num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) + self.encoder = DeformableTransformerEncoder(encoder_layer, + num_encoder_layers) + decoder_layer = DINOTransformerDecoderLayer( + hidden_dim, nhead, dec_dim_feedforward, dropout, activation, + num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) + self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer, + num_decoder_layers) + + # denoising part + self.denoising_class_embed = nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # position embedding + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset) + self.level_embed = nn.Embedding( + num_levels, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(2 * hidden_dim, + hidden_dim, + hidden_dim, + num_layers=2) + # mask embedding + self.mask_query_head = MLP(hidden_dim, + hidden_dim, + hidden_dim, + num_layers=3) + + # encoder mask head + self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1) + self.enc_mask_output = nn.Sequential( + ConvGNBlock( + hidden_dim, hidden_dim, 3, act=activation), + nn.Conv2D(hidden_dim, hidden_dim, 1)) + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm( + hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) + # decoder norm layer + self.dec_norm = nn.LayerNorm( + hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) + # shared prediction head + self.class_head = nn.Linear(hidden_dim, num_classes) + self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + self._reset_parameters() + + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) + linear_init_(self.class_head) + constant_(self.class_head.bias, bias_cls) + constant_(self.bbox_head.layers[-1].weight) + constant_(self.bbox_head.layers[-1].bias) + + xavier_uniform_(self.enc_mask_output[1].weight) + linear_init_(self.enc_output[0]) + xavier_uniform_(self.enc_output[0].weight) + if self.learnt_init_query: + xavier_uniform_(self.tgt_embed.weight) + 
xavier_uniform_(self.query_pos_head.layers[0].weight) + xavier_uniform_(self.query_pos_head.layers[1].weight) + for l in self.input_proj: + xavier_uniform_(l[0].weight) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_feats_channel': [i.channels for i in input_shape], } + + def _build_input_proj_layer(self, + in_feats_channel, + weight_attr=None, + bias_attr=None): + self.input_proj = nn.LayerList() + for in_channels in in_feats_channel: + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, self.hidden_dim, kernel_size=1)), ( + 'norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=weight_attr, + bias_attr=bias_attr)))) + in_channels = in_feats_channel[-1] + for _ in range(self.num_levels - len(in_feats_channel)): + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + self.hidden_dim, + kernel_size=3, + stride=2, + padding=1)), ('norm', nn.GroupNorm( + 32, + self.hidden_dim, + weight_attr=weight_attr, + bias_attr=bias_attr)))) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats, pad_mask=None): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for i, feat in enumerate(proj_feats): + bs, _, h, w = paddle.shape(feat) + spatial_shapes.append(paddle.concat([h, w])) + # [b,c,h,w] -> [b,h*w,c] + feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) + if pad_mask is not None: + mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + # [b, h*w, c] + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[i] + lvl_pos_embed_flatten.append(lvl_pos_embed) + if pad_mask is not None: + # [b, h*w] + mask_flatten.append(mask.flatten(1)) + + # [b, l, c] + feat_flatten = paddle.concat(feat_flatten, 1) + # [b, l] + mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, + 1) + # [b, l, c] + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [num_levels, 2] + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes).astype('int64')) + # [l], 每一个level的起始index + level_start_index = paddle.concat([ + paddle.zeros( + [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] + ]) + # [b, num_levels, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, valid_ratios) + + def forward(self, feats, pad_mask=None, gt_meta=None): + feat0 = feats.pop(0) + # input projection and embedding + (feat_flatten, spatial_shapes, level_start_index, mask_flatten, + lvl_pos_embed_flatten, + valid_ratios) = self._get_encoder_input(feats, pad_mask) + + # encoder + memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, + mask_flatten, lvl_pos_embed_flatten, valid_ratios) + + mask_feat = self._get_encoder_mask_feature(feat0, memory, + spatial_shapes) + + # prepare denoising training + if self.training: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_denoising_training_group(gt_meta, + 
self.num_classes, + self.num_queries, + self.denoising_class_embed.weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale) + else: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points_unact, enc_out, init_out = \ + self._get_decoder_input( + memory, mask_feat, spatial_shapes, mask_flatten, denoising_class, + denoising_bbox_unact) + + # decoder + inter_feats, inter_bboxes = self.decoder( + target, init_ref_points_unact, memory, spatial_shapes, + level_start_index, self.bbox_head, self.query_pos_head, + self.dec_norm, valid_ratios, attn_mask, mask_flatten) + + out_logits = [] + out_bboxes = [] + out_masks = [] + for i in range(self.num_decoder_layers): + if self.training or i == self.num_decoder_layers - 1: + logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i], + mask_feat) + else: + continue + out_logits.append(logits_) + out_masks.append(masks_) + if i == 0: + out_bboxes.append( + F.sigmoid( + self.bbox_head(inter_feats[i]) + init_ref_points_unact)) + else: + out_bboxes.append( + F.sigmoid( + self.bbox_head(inter_feats[i]) + inverse_sigmoid( + inter_bboxes[i - 1]))) + out_bboxes = paddle.stack(out_bboxes) + out_logits = paddle.stack(out_logits) + out_masks = paddle.stack(out_masks) + + return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta) + + def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes): + memory_feat0 = memory.split( + spatial_shapes.prod(1).split(self.num_levels), axis=1)[0] + h, w = spatial_shapes[0] + memory_feat0 = memory_feat0.reshape( + [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2]) + out = self.enc_mask_lateral(in_feat) + F.interpolate( + memory_feat0, + scale_factor=2.0, + mode='bilinear', + align_corners=False) + return self.enc_mask_output(out) + + def _get_encoder_output_anchors(self, + memory, + spatial_shapes, + memory_mask=None, + grid_size=0.05): + output_anchors = [] + idx = 0 + for lvl, (h, w) in enumerate(spatial_shapes): + if memory_mask is not None: + mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) + valid_H = paddle.sum(mask_[:, :, 0], 1) + valid_W = paddle.sum(mask_[:, 0, :], 1) + else: + valid_H, valid_W = h, w + + grid_y, grid_x = paddle.meshgrid( + paddle.arange(end=h), paddle.arange(end=w)) + grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) + + valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( + [-1, 1, 1, 2]).astype(grid_xy.dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) + output_anchors.append( + paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) + idx += h * w + + output_anchors = paddle.concat(output_anchors, 1) + valid_mask = ((output_anchors > self.eps) * + (output_anchors < 1 - self.eps)).all(-1, keepdim=True) + output_anchors = paddle.log(output_anchors / (1 - output_anchors)) + if memory_mask is not None: + valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 + output_anchors = paddle.where(valid_mask, output_anchors, + paddle.to_tensor(float("inf"))) + + memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) + output_memory = self.enc_output(memory) + return output_memory, output_anchors + + def _get_decoder_input(self, + memory, + mask_feat, + spatial_shapes, + memory_mask=None, + denoising_class=None, + denoising_bbox_unact=None): + # prepare input for decoder + bs, _, _ = memory.shape + output_memory, output_anchors = self._get_encoder_output_anchors( + memory, spatial_shapes, 
memory_mask) + enc_logits_unact = self.class_head(output_memory) + enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors + + # get topk index + _, topk_ind = paddle.topk( + enc_logits_unact.max(-1), self.num_queries, axis=1) + batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + + # extract content and position query embedding + target = paddle.gather_nd(output_memory, topk_ind) + reference_points_unact = paddle.gather_nd(enc_bboxes_unact, + topk_ind) # unsigmoided. + # get encoder output: {logits, bboxes, masks} + enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(target, + mask_feat) + enc_out_bboxes = F.sigmoid(reference_points_unact) + enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks) + + # concat denoising query + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + else: + target = target.detach() + if denoising_class is not None: + target = paddle.concat([denoising_class, target], 1) + if self.mask_enhanced: + # use mask-enhanced anchor box initialization + reference_points = mask_to_box_coordinate( + enc_out_masks > 0, normalize=True, format="xywh") + reference_points_unact = inverse_sigmoid(reference_points) + if denoising_bbox_unact is not None: + reference_points_unact = paddle.concat( + [denoising_bbox_unact, reference_points_unact], 1) + + # direct prediction from the matching and denoising part in the begining + if self.training and denoising_class is not None: + init_out_logits, init_out_masks = self._get_pred_class_and_mask( + target, mask_feat) + init_out_bboxes = F.sigmoid(reference_points_unact) + init_out = (init_out_logits, init_out_bboxes, init_out_masks) + else: + init_out = None + + return target, reference_points_unact.detach(), enc_out, init_out + + def _get_pred_class_and_mask(self, query_embed, mask_feat): + out_query = self.dec_norm(query_embed) + out_logits = self.class_head(out_query) + mask_query_embed = self.mask_query_head(out_query) + _, _, h, w = paddle.shape(mask_feat) + # [b, q, c] x [b, c, h, w] -> [b, q, h, w] + out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape( + [0, 0, h, w]) + return out_logits, out_mask diff --git a/ppdet/modeling/transformers/matchers.py b/ppdet/modeling/transformers/matchers.py new file mode 100644 index 0000000000000000000000000000000000000000..72459a3f909806f212a4b204a50a494875589e51 --- /dev/null +++ b/ppdet/modeling/transformers/matchers.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from scipy.optimize import linear_sum_assignment + +from ppdet.core.workspace import register, serializable +from ..losses.iou_loss import GIoULoss +from .utils import bbox_cxcywh_to_xyxy + +__all__ = ['HungarianMatcher'] + + +@register +@serializable +class HungarianMatcher(nn.Layer): + __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points'] + + def __init__(self, + matcher_coeff={ + 'class': 1, + 'bbox': 5, + 'giou': 2, + 'mask': 1, + 'dice': 1 + }, + use_focal_loss=False, + with_mask=False, + num_sample_points=12544, + alpha=0.25, + gamma=2.0): + r""" + Args: + matcher_coeff (dict): The coefficient of hungarian matcher cost. + """ + super(HungarianMatcher, self).__init__() + self.matcher_coeff = matcher_coeff + self.use_focal_loss = use_focal_loss + self.with_mask = with_mask + self.num_sample_points = num_sample_points + self.alpha = alpha + self.gamma = gamma + + self.giou_loss = GIoULoss() + + def forward(self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None): + r""" + Args: + boxes (Tensor): [b, query, 4] + logits (Tensor): [b, query, num_classes] + gt_bbox (List(Tensor)): list[[n, 4]] + gt_class (List(Tensor)): list[[n, 1]] + masks (Tensor|None): [b, query, h, w] + gt_mask (List(Tensor)): list[[n, H, W]] + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = boxes.shape[:2] + + num_gts = [len(a) for a in gt_class] + if sum(num_gts) == 0: + return [(paddle.to_tensor( + [], dtype=paddle.int64), paddle.to_tensor( + [], dtype=paddle.int64)) for _ in range(bs)] + + # We flatten to compute the cost matrices in a batch + # [batch_size * num_queries, num_classes] + logits = logits.detach() + out_prob = F.sigmoid(logits.flatten( + 0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1)) + # [batch_size * num_queries, 4] + out_bbox = boxes.detach().flatten(0, 1) + + # Also concat the target labels and boxes + tgt_ids = paddle.concat(gt_class).flatten() + tgt_bbox = paddle.concat(gt_bbox) + + # Compute the classification cost + out_prob = paddle.gather(out_prob, tgt_ids, axis=1) + if self.use_focal_loss: + neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-( + 1 - out_prob + 1e-8).log()) + pos_cost_class = self.alpha * ( + (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class - neg_cost_class + else: + cost_class = -out_prob + + # Compute the L1 cost between boxes + cost_bbox = ( + out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1) + + # Compute the giou cost betwen boxes + cost_giou = self.giou_loss( + bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)), + bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1) + + # Final cost matrix + C = self.matcher_coeff['class'] * cost_class + \ + self.matcher_coeff['bbox'] * cost_bbox + \ + self.matcher_coeff['giou'] * cost_giou + # Compute the mask cost and dice cost + if self.with_mask: + assert (masks is not None and gt_mask is not None, + 'Make sure the input has `mask` and `gt_mask`') + # all masks share the same set of points for efficient matching + 
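+            # Rather than comparing masks at full resolution, predictions and
+            # ground-truth masks are evaluated at `num_sample_points` random
+            # locations shared across the batch. `F.grid_sample` expects its
+            # grid in [-1, 1], hence the `2 * rand - 1` mapping below; the BCE
+            # and dice costs are then computed between every (prediction,
+            # target) pair via matmul over the sampled points.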
sample_points = paddle.rand([bs, 1, self.num_sample_points, 2]) + sample_points = 2.0 * sample_points - 1.0 + + out_mask = F.grid_sample( + masks.detach(), sample_points, align_corners=False).squeeze(-2) + out_mask = out_mask.flatten(0, 1) + + tgt_mask = paddle.concat(gt_mask).unsqueeze(1) + sample_points = paddle.concat([ + a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts) + if b > 0 + ]) + tgt_mask = F.grid_sample( + tgt_mask, sample_points, align_corners=False).squeeze([1, 2]) + + with paddle.amp.auto_cast(enable=False): + # binary cross entropy cost + pos_cost_mask = F.binary_cross_entropy_with_logits( + out_mask, paddle.ones_like(out_mask), reduction='none') + neg_cost_mask = F.binary_cross_entropy_with_logits( + out_mask, paddle.zeros_like(out_mask), reduction='none') + cost_mask = paddle.matmul( + pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul( + neg_cost_mask, 1 - tgt_mask, transpose_y=True) + cost_mask /= self.num_sample_points + + # dice cost + out_mask = F.sigmoid(out_mask) + numerator = 2 * paddle.matmul( + out_mask, tgt_mask, transpose_y=True) + denominator = out_mask.sum( + -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0) + cost_dice = 1 - (numerator + 1) / (denominator + 1) + + C = C + self.matcher_coeff['mask'] * cost_mask + \ + self.matcher_coeff['dice'] * cost_dice + + C = C.reshape([bs, num_queries, -1]) + C = [a.squeeze(0) for a in C.chunk(bs)] + sizes = [a.shape[0] for a in gt_bbox] + indices = [ + linear_sum_assignment(c.split(sizes, -1)[i].numpy()) + for i, c in enumerate(C) + ] + return [(paddle.to_tensor( + i, dtype=paddle.int64), paddle.to_tensor( + j, dtype=paddle.int64)) for i, j in indices] diff --git a/ppdet/modeling/transformers/petr_transformer.py b/ppdet/modeling/transformers/petr_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7859b0df028bf5da7a615c427de4fb0850bfca2e --- /dev/null +++ b/ppdet/modeling/transformers/petr_transformer.py @@ -0,0 +1,1198 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
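+
+# This module provides the transformer components used by PETR (end-to-end
+# multi-person pose estimation): a deformable encoder, a pose decoder whose
+# deformable cross-attention samples one location per keypoint, and a
+# refinement decoder applied to the selected keypoint queries.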
+""" +this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention, _convert_attention_mask +from .utils import _get_clones +from ..initializer import linear_init_, normal_, constant_, xavier_uniform_ + +__all__ = [ + 'PETRTransformer', 'MultiScaleDeformablePoseAttention', + 'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder', + 'PETR_DeformableDetrTransformerDecoder', + 'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer', + 'TransformerEncoder', 'MSDeformableAttention' +] + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clip(min=0, max=1) + x1 = x.clip(min=eps) + x2 = (1 - x).clip(min=eps) + return paddle.log(x1 / x2) + + +@register +class TransformerEncoderLayer(nn.Layer): + __inject__ = ['attn'] + + def __init__(self, + d_model, + attn=None, + nhead=8, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + self.embed_dims = d_model + + if attn is None: + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + else: + self.self_attn = attn + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None, **kwargs): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +@register +class TransformerEncoder(nn.Layer): + __inject__ = ['encoder_layer'] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = 
_get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.embed_dims = encoder_layer.embed_dims + + def forward(self, src, src_mask=None, pos_embed=None, **kwargs): + output = src + for layer in self.layers: + output = layer( + output, src_mask=src_mask, pos_embed=pos_embed, **kwargs) + + if self.norm is not None: + output = self.norm(output) + + return output + + +@register +class MSDeformableAttention(nn.Layer): + def __init__(self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + try: + # use cuda op + print("use deformable_detr_ops in ms_deformable_attn") + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + from .utils import deformable_attention_core_func as ms_deformable_attn + self.ms_deformable_attn_core = ms_deformable_attn + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange( + self.num_heads, + dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( + [1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange( + 1, self.num_points + 1, + dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + key, + value, + reference_points, + value_spatial_shapes, + value_level_start_index, + attn_mask=None, + **kwargs): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
+ attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + value = self.value_proj(value) + if attn_mask is not None: + attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1) + value *= attn_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". + format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + output = self.output_proj(output) + + return output + + +@register +class MultiScaleDeformablePoseAttention(nn.Layer): + """An attention module used in PETR. `End-to-End Multi-Person + Pose Estimation with Transformers`. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 8. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 17. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.1. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
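+
+    Note:
+        Unlike standard multi-scale deformable attention, `num_points` equals
+        the number of keypoints, so each query samples one location per
+        keypoint (per head and per level); the sampling offsets are scaled by
+        half the width/height of the box enclosing the query's current
+        keypoint estimates.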
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=17, + im2col_step=64, + dropout=0.1, + norm_cfg=None, + init_cfg=None, + batch_first=False, + lr_mult=0.1): + super().__init__() + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn("You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, + num_heads * num_levels * num_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + + try: + # use cuda op + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + from .utils import deformable_attention_core_func as ms_deformable_attn + self.ms_deformable_attn_core = ms_deformable_attn + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_(self.sampling_offsets.weight) + constant_(self.sampling_offsets.bias) + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + key, + value, + residual=None, + attn_mask=None, + reference_points=None, + value_spatial_shapes=None, + value_level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape (num_key, bs, embed_dims). + value (Tensor): The value tensor with shape + (num_key, bs, embed_dims). + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + reference_points (Tensor): The normalized reference points with + shape (bs, num_query, num_levels, K*2), all elements is range + in [0, 1], top-left (0,0), bottom-right (1, 1), including + padding area. + attn_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + value_spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + value_level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
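+
+        Note:
+            This Paddle implementation is batch-first: `query`, `key` and
+            `value` are expected as [bs, num_query/num_key, embed_dims] and the
+            returned tensor is [bs, num_query, embed_dims].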
+ """ + + if key is None: + key = query + if value is None: + value = key + + bs, num_query, _ = query.shape + bs, num_key, _ = value.shape + assert (value_spatial_shapes[:, 0].numpy() * + value_spatial_shapes[:, 1].numpy()).sum() == num_key + + value = self.value_proj(value) + if attn_mask is not None: + # value = value.masked_fill(attn_mask[..., None], 0.0) + value *= attn_mask.unsqueeze(-1) + value = value.reshape([bs, num_key, self.num_heads, -1]) + sampling_offsets = self.sampling_offsets(query).reshape([ + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 + ]) + attention_weights = self.attention_weights(query).reshape( + [bs, num_query, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights, axis=-1) + + attention_weights = attention_weights.reshape( + [bs, num_query, self.num_heads, self.num_levels, self.num_points]) + if reference_points.shape[-1] == self.num_points * 2: + reference_points_reshape = reference_points.reshape( + (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2) + x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True) + y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True) + x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True) + y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True) + w = paddle.clip(x2 - x1, min=1e-4) + h = paddle.clip(y2 - y1, min=1e-4) + wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :] + + sampling_locations = reference_points_reshape \ + + sampling_offsets * wh * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2K, but get {reference_points.shape[-1]} instead.') + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + + output = self.output_proj(output) + return output + + +@register +class PETR_TransformerDecoderLayer(nn.Layer): + __inject__ = ['self_attn', 'cross_attn'] + + def __init__(self, + d_model, + nhead=8, + self_attn=None, + cross_attn=None, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(PETR_TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + if self_attn is None: + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + else: + self.self_attn = self_attn + if cross_attn is None: + self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + else: + self.cross_attn = cross_attn + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + 
pos_embed=None, + query_pos_embed=None, + **kwargs): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + q = self.with_pos_embed(tgt, query_pos_embed) + key_tmp = tgt + # k = self.with_pos_embed(memory, pos_embed) + tgt = self.cross_attn( + q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt + + +@register +class PETR_TransformerDecoder(nn.Layer): + """Implements the decoder in PETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + __inject__ = ['decoder_layer'] + + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + num_keypoints=17, + **kwargs): + super(PETR_TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + self.num_keypoints = num_keypoints + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + kpt_branches=None, + **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape (num_query, bs, embed_dims). + reference_points (Tensor): The reference points of offset, + has shape (bs, num_query, K*2). + valid_ratios (Tensor): The radios of valid points on the feature + map, has shape (bs, num_levels, 2). + kpt_branches: (obj:`nn.LayerList`): Used for refining the + regression results. Only would be passed when `with_box_refine` + is True, otherwise would be passed a `None`. + + Returns: + tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims] and + [num_layers, bs, num_query, K*2]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == self.num_keypoints * 2: + reference_points_input = \ + reference_points[:, :, None] * \ + valid_ratios.tile((1, 1, self.num_keypoints))[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + + if kpt_branches is not None: + tmp = kpt_branches[lid](output) + if reference_points.shape[-1] == self.num_keypoints * 2: + new_reference_points = tmp + inverse_sigmoid( + reference_points) + new_reference_points = F.sigmoid(new_reference_points) + else: + raise NotImplementedError + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return paddle.stack(intermediate), paddle.stack( + intermediate_reference_points) + + return output, reference_points + + +@register +class PETR_DeformableTransformerDecoder(nn.Layer): + __inject__ = ['decoder_layer'] + + def __init__(self, decoder_layer, num_layers, return_intermediate=False): + super(PETR_DeformableTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_mask=None, + query_pos_embed=None): + output = tgt + intermediate = [] + for lid, layer in enumerate(self.layers): + output = layer(output, reference_points, memory, + memory_spatial_shapes, memory_mask, query_pos_embed) + + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +@register +class PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + + super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args, + **kwargs) + self.return_intermediate = return_intermediate + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + reg_branch: (obj:`nn.LayerList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = reference_points[:, :, None] * \ + paddle.concat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid( + reference_points) + new_reference_points = F.sigmoid(new_reference_points) + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = F.sigmoid(new_reference_points) + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return paddle.stack(intermediate), paddle.stack( + intermediate_reference_points) + + return output, reference_points + + +@register +class PETRTransformer(nn.Layer): + """Implements the PETR transformer. + + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. + """ + __inject__ = ["encoder", "decoder", "hm_encoder", "refine_decoder"] + + def __init__(self, + encoder="", + decoder="", + hm_encoder="", + refine_decoder="", + as_two_stage=True, + num_feature_levels=4, + two_stage_num_proposals=300, + num_keypoints=17, + **kwargs): + super(PETRTransformer, self).__init__(**kwargs) + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + self.num_keypoints = num_keypoints + self.encoder = encoder + self.decoder = decoder + self.embed_dims = self.encoder.embed_dims + self.hm_encoder = hm_encoder + self.refine_decoder = refine_decoder + self.init_layers() + self.init_weights() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + #paddle.create_parameter + self.level_embeds = paddle.create_parameter( + (self.num_feature_levels, self.embed_dims), dtype="float32") + + if self.as_two_stage: + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.refine_query_embedding = nn.Embedding(self.num_keypoints, + self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, + 2 * self.num_keypoints) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.rank() > 1: + xavier_uniform_(p) + if hasattr(p, 'bias') and p.bias is not None: + constant_(p.bais) + for m in self.sublayers(): + if isinstance(m, MSDeformableAttention): + m._reset_parameters() + for m in self.sublayers(): + if isinstance(m, MultiScaleDeformablePoseAttention): + m.init_weights() + if not self.as_two_stage: + xavier_uniform_(self.reference_points.weight) + constant_(self.reference_points.bias) + normal_(self.level_embeds) + normal_(self.refine_query_embedding.weight) + + def gen_encoder_output_proposals(self, memory, 
memory_padding_mask, + spatial_shapes): + """Generate proposals from encoded memory. + + Args: + memory (Tensor): The output of encoder, has shape + (bs, num_key, embed_dim). num_key is equal the number of points + on feature map from all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). + + Returns: + tuple: A tuple of feature map and bbox prediction. + + - output_memory (Tensor): The input of decoder, has shape + (bs, num_key, embed_dim). num_key is equal the number of + points on feature map from all levels. + - output_proposals (Tensor): The normalized proposal + after a inverse sigmoid, has shape (bs, num_keys, 4). + """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape( + [N, H, W, 1]) + valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1) + valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = paddle.meshgrid( + paddle.linspace( + 0, H - 1, H, dtype="float32"), + paddle.linspace( + 0, W - 1, W, dtype="float32")) + grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], + -1) + + scale = paddle.concat( + [valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2]) + grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale + proposal = grid.reshape([N, -1, 2]) + proposals.append(proposal) + _cur += (H * W) + output_proposals = paddle.concat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & + (output_proposals < 0.99)).all( + -1, keepdim=True).astype("bool") + output_proposals = paddle.log(output_proposals / (1 - output_proposals)) + output_proposals = masked_fill( + output_proposals, ~memory_padding_mask.astype("bool").unsqueeze(-1), + float('inf')) + output_proposals = masked_fill(output_proposals, + ~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = masked_fill( + output_memory, ~memory_padding_mask.astype("bool").unsqueeze(-1), + float(0)) + output_memory = masked_fill(output_memory, ~output_proposals_valid, + float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios): + """Get the reference points used in decoder. + + Args: + spatial_shapes (Tensor): The shape of all feature maps, + has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid points on the + feature map, has shape (bs, num_levels, 2). + + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
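+
+        Note:
+            The points are the centres of all feature-map cells, normalized to
+            the unpadded (valid) region of each image via `valid_ratios`.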
+ """ + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.linspace( + 0.5, H - 0.5, H, dtype="float32"), + paddle.linspace( + 0.5, W - 0.5, W, dtype="float32")) + ref_y = ref_y.reshape( + (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape( + (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W) + ref = paddle.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = paddle.concat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1) + valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1) + valid_ratio_h = valid_H.astype('float') / H + valid_ratio_w = valid_W.astype('float') / W + valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + scale = 2 * math.pi + dim_t = paddle.arange(num_pos_feats, dtype="float32") + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = F.sigmoid(proposals) * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = paddle.stack( + (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + axis=4).flatten(2) + return pos + + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + kpt_branches=None, + cls_branches=None): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from different level. + Each element has shape [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from different + level used for encoder and decoder, each element has shape + [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + kpt_branches (obj:`nn.LayerList`): Keypoint Regression heads for + feature maps from each decoder layer. Only would be passed when + `with_box_refine` is Ture. Default to None. + cls_branches (obj:`nn.LayerList`): Classification heads for + feature maps from each decoder layer. Only would be passed when + `as_two_stage` is Ture. Default to None. + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + `return_intermediate_dec` is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of proposals \ + generated from encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_kpt_unact: The regression results generated from \ + encoder's feature maps., has shape (batch, h*w, K*2). + Only would be returned when `as_two_stage` is True, \ + otherwise None. 
+ """ + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed + ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose((0, 2, 1)) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose((0, 2, 1)) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape( + [1, 1, -1]) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = paddle.concat(feat_flatten, 1) + mask_flatten = paddle.concat(mask_flatten, 1) + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + spatial_shapes_cumsum = paddle.to_tensor( + np.array(spatial_shapes).prod(1).cumsum(0)) + spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) + valid_ratios = paddle.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios) + + memory = self.encoder( + src=feat_flatten, + pos_embed=lvl_pos_embed_flatten, + src_mask=mask_flatten, + value_spatial_shapes=spatial_shapes, + reference_points=reference_points, + value_level_start_index=level_start_index, + valid_ratios=valid_ratios) + + bs, _, c = memory.shape + + hm_proto = None + if self.training: + hm_memory = paddle.slice( + memory, + starts=level_start_index[0], + ends=level_start_index[1], + axes=[1]) + hm_pos_embed = paddle.slice( + lvl_pos_embed_flatten, + starts=level_start_index[0], + ends=level_start_index[1], + axes=[1]) + hm_mask = paddle.slice( + mask_flatten, + starts=level_start_index[0], + ends=level_start_index[1], + axes=[1]) + hm_reference_points = paddle.slice( + reference_points, + starts=level_start_index[0], + ends=level_start_index[1], + axes=[1])[:, :, :1, :] + + # official code make a mistake of pos_embed to pose_embed, which disable pos_embed + hm_memory = self.hm_encoder( + src=hm_memory, + pose_embed=hm_pos_embed, + src_mask=hm_mask, + value_spatial_shapes=spatial_shapes[[0]], + reference_points=hm_reference_points, + value_level_start_index=level_start_index[0], + valid_ratios=valid_ratios[:, :1, :]) + hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0], + spatial_shapes[0, 1], -1)) + hm_proto = (hm_memory, mlvl_masks[0]) + + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_kpt_unact = \ + kpt_branches[self.decoder.num_layers](output_memory) + enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1] + enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2] + + topk = self.two_stage_num_proposals + topk_proposals = paddle.topk( + enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1) + + #paddle.take_along_axis 对应torch.gather + topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact, + topk_proposals, 1) + topk_kpts_unact = topk_kpts_unact.detach() + + reference_points = F.sigmoid(topk_kpts_unact) + init_reference_out = reference_points + # learnable query and query_pos + query_pos, query = paddle.split( + query_embed, query_embed.shape[1] // c, axis=1) + query_pos = query_pos.unsqueeze(0).expand((bs, -1, 
-1)) + query = query.unsqueeze(0).expand((bs, -1, -1)) + else: + query_pos, query = paddle.split( + query_embed, query_embed.shape[1] // c, axis=1) + query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) + query = query.unsqueeze(0).expand((bs, -1, -1)) + reference_points = F.sigmoid(self.reference_points(query_pos)) + init_reference_out = reference_points + + # decoder + inter_states, inter_references = self.decoder( + query=query, + memory=memory, + query_pos_embed=query_pos, + memory_mask=mask_flatten, + reference_points=reference_points, + value_spatial_shapes=spatial_shapes, + value_level_start_index=level_start_index, + valid_ratios=valid_ratios, + kpt_branches=kpt_branches) + + inter_references_out = inter_references + if self.as_two_stage: + return inter_states, init_reference_out, \ + inter_references_out, enc_outputs_class, \ + enc_outputs_kpt_unact, hm_proto, memory + return inter_states, init_reference_out, \ + inter_references_out, None, None, None, None, None, hm_proto + + def forward_refine(self, + mlvl_masks, + memory, + reference_points_pose, + img_inds, + kpt_branches=None, + **kwargs): + mask_flatten = [] + spatial_shapes = [] + for lvl, mask in enumerate(mlvl_masks): + bs, h, w = mask.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + mask = mask.flatten(1) + mask_flatten.append(mask) + mask_flatten = paddle.concat(mask_flatten, 1) + spatial_shapes_cumsum = paddle.to_tensor( + np.array( + spatial_shapes, dtype='int64').prod(1).cumsum(0)) + spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) + valid_ratios = paddle.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + # pose refinement (17 queries corresponding to 17 keypoints) + # learnable query and query_pos + refine_query_embedding = self.refine_query_embedding.weight + query_pos, query = paddle.split(refine_query_embedding, 2, axis=1) + pos_num = reference_points_pose.shape[0] + query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1)) + query = query.unsqueeze(0).expand((pos_num, -1, -1)) + reference_points = reference_points_pose.reshape( + (pos_num, reference_points_pose.shape[1] // 2, 2)) + pos_memory = memory[img_inds] + mask_flatten = mask_flatten[img_inds] + valid_ratios = valid_ratios[img_inds] + if img_inds.size == 1: + pos_memory = pos_memory.unsqueeze(0) + mask_flatten = mask_flatten.unsqueeze(0) + valid_ratios = valid_ratios.unsqueeze(0) + inter_states, inter_references = self.refine_decoder( + query=query, + memory=pos_memory, + query_pos_embed=query_pos, + memory_mask=mask_flatten, + reference_points=reference_points, + value_spatial_shapes=spatial_shapes, + value_level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=kpt_branches, + **kwargs) + # [num_decoder, num_query, bs, embed_dim] + + init_reference_out = reference_points + return inter_states, init_reference_out, inter_references diff --git a/ppdet/modeling/transformers/position_encoding.py b/ppdet/modeling/transformers/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c3260974251295f93b22cd145d5a170d63b2ad --- /dev/null +++ b/ppdet/modeling/transformers/position_encoding.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn + +from ppdet.core.workspace import register, serializable + + +@register +@serializable +class PositionEmbedding(nn.Layer): + def __init__(self, + num_pos_feats=128, + temperature=10000, + normalize=True, + scale=2 * math.pi, + embed_type='sine', + num_embeddings=50, + offset=0., + eps=1e-6): + super(PositionEmbedding, self).__init__() + assert embed_type in ['sine', 'learned'] + + self.embed_type = embed_type + self.offset = offset + self.eps = eps + if self.embed_type == 'sine': + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + elif self.embed_type == 'learned': + self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) + self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) + else: + raise ValueError(f"{self.embed_type} is not supported.") + + def forward(self, mask): + """ + Args: + mask (Tensor): [B, H, W] + Returns: + pos (Tensor): [B, H, W, C] + """ + if self.embed_type == 'sine': + y_embed = mask.cumsum(1) + x_embed = mask.cumsum(2) + if self.normalize: + y_embed = (y_embed + self.offset) / ( + y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / ( + x_embed[:, :, -1:] + self.eps) * self.scale + + dim_t = 2 * (paddle.arange(self.num_pos_feats) // + 2).astype('float32') + dim_t = self.temperature**(dim_t / self.num_pos_feats) + + pos_x = x_embed.unsqueeze(-1) / dim_t + pos_y = y_embed.unsqueeze(-1) / dim_t + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + axis=4).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + axis=4).flatten(3) + return paddle.concat((pos_y, pos_x), axis=3) + elif self.embed_type == 'learned': + h, w = mask.shape[-2:] + i = paddle.arange(w) + j = paddle.arange(h) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + return paddle.concat( + [ + x_emb.unsqueeze(0).tile([h, 1, 1]), + y_emb.unsqueeze(1).tile([1, w, 1]), + ], + axis=-1).unsqueeze(0) + else: + raise ValueError(f"not supported {self.embed_type}") diff --git a/ppdet/modeling/transformers/rtdetr_transformer.py b/ppdet/modeling/transformers/rtdetr_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f3d021f66aa102cacebca8d568893c0ff3d84429 --- /dev/null +++ b/ppdet/modeling/transformers/rtdetr_transformer.py @@ -0,0 +1,557 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention +from ..heads.detr_head import MLP +from .deformable_transformer import MSDeformableAttention +from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, + bias_init_with_prob) +from .utils import (_get_clones, get_sine_pos_embed, + get_contrastive_denoising_training_group, inverse_sigmoid) + +__all__ = ['RTDETRTransformer'] + + +class PPMSDeformableAttention(MSDeformableAttention): + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.to_tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + if not isinstance(query, paddle.Tensor): + from ppdet.modeling.transformers.utils import deformable_attention_core_func + output = deformable_attention_core_func( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + else: + value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) + value_level_start_index = paddle.to_tensor(value_level_start_index) + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + output = self.output_proj(output) + + return output + + +class TransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation="relu", + n_levels=4, + n_points=4, + weight_attr=None, + bias_attr=None): + super(TransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # cross attention + self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, + n_points, 1.0) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, + bias_attr) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, + bias_attr) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm( + d_model, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + if attn_mask is not None: + attn_mask = paddle.where( + attn_mask.astype('bool'), + paddle.zeros(attn_mask.shape, tgt.dtype), + paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, + memory_spatial_shapes, memory_level_start_index, memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt2 = self.forward_ffn(tgt) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + + return tgt + + +class TransformerDecoder(nn.Layer): + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def 
forward(self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None, + query_pos_head_inv_sig=False): + output = tgt + dec_out_bboxes = [] + dec_out_logits = [] + ref_points_detach = F.sigmoid(ref_points_unact) + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + if not query_pos_head_inv_sig: + query_pos_embed = query_pos_head(ref_points_detach) + else: + query_pos_embed = query_pos_head( + inverse_sigmoid(ref_points_detach)) + + output = layer(output, ref_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append( + F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + ref_points))) + elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach( + ) if self.training else inter_ref_bbox + + return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) + + +@register +class RTDETRTransformer(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + position_embed_type='sine', + backbone_feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_decoder_points=4, + nhead=8, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=True, + query_pos_head_inv_sig=False, + eval_size=None, + eval_idx=-1, + eps=1e-2): + super(RTDETRTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
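+        # NOTE: num_levels may exceed the number of backbone feature maps; in
+        # that case the missing strides are appended below (each 2x the last
+        # one) and matching extra input projections are created in
+        # _build_input_proj_layer().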
+ assert len(backbone_feat_channels) <= num_levels + assert len(feat_strides) == len(backbone_feat_channels) + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_decoder_layers = num_decoder_layers + self.eval_size = eval_size + + # backbone feature projection + self._build_input_proj_layer(backbone_feat_channels) + + # Transformer module + decoder_layer = TransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, + num_decoder_points) + self.decoder = TransformerDecoder(hidden_dim, decoder_layer, + num_decoder_layers, eval_idx) + + # denoising part + self.denoising_class_embed = nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) + self.query_pos_head_inv_sig = query_pos_head_inv_sig + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + # decoder head + self.dec_score_head = nn.LayerList([ + nn.Linear(hidden_dim, num_classes) + for _ in range(num_decoder_layers) + ]) + self.dec_bbox_head = nn.LayerList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_decoder_layers) + ]) + + self._reset_parameters() + + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) + linear_init_(self.enc_score_head) + constant_(self.enc_score_head.bias, bias_cls) + constant_(self.enc_bbox_head.layers[-1].weight) + constant_(self.enc_bbox_head.layers[-1].bias) + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + linear_init_(cls_) + constant_(cls_.bias, bias_cls) + constant_(reg_.layers[-1].weight) + constant_(reg_.layers[-1].bias) + + linear_init_(self.enc_output[0]) + xavier_uniform_(self.enc_output[0].weight) + if self.learnt_init_query: + xavier_uniform_(self.tgt_embed.weight) + xavier_uniform_(self.query_pos_head.layers[0].weight) + xavier_uniform_(self.query_pos_head.layers[1].weight) + for l in self.input_proj: + xavier_uniform_(l[0].weight) + + # init encoder output anchors and valid_mask + if self.eval_size: + self.anchors, self.valid_mask = self._generate_anchors() + + @classmethod + def from_config(cls, cfg, input_shape): + return {'backbone_feat_channels': [i.channels for i in input_shape]} + + def _build_input_proj_layer(self, backbone_feat_channels): + self.input_proj = nn.LayerList() + for in_channels in backbone_feat_channels: + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + self.hidden_dim, + kernel_size=1, + bias_attr=False)), ('norm', nn.BatchNorm2D( + self.hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) + in_channels = backbone_feat_channels[-1] + 
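+        # Build additional feature levels when num_levels > number of backbone
+        # outputs: each extra level downsamples the previous one with a
+        # 3x3 stride-2 conv followed by BatchNorm (no L2 decay on scale/bias).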
for _ in range(self.num_levels - len(backbone_feat_channels)): + self.input_proj.append( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + self.hidden_dim, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False)), ('norm', nn.BatchNorm2D( + self.hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + level_start_index = [0, ] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) + # [num_levels, 2] + spatial_shapes.append([h, w]) + # [l], start index of each level + level_start_index.append(h * w + level_start_index[-1]) + + # [b, l, c] + feat_flatten = paddle.concat(feat_flatten, 1) + level_start_index.pop() + return (feat_flatten, spatial_shapes, level_start_index) + + def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): + # input projection and embedding + (memory, spatial_shapes, + level_start_index) = self._get_encoder_input(feats) + + # prepare denoising training + if self.training: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(gt_meta, + self.num_classes, + self.num_queries, + self.denoising_class_embed.weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale) + else: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input( + memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) + + # decoder + out_bboxes, out_logits = self.decoder( + target, + init_ref_points_unact, + memory, + spatial_shapes, + level_start_index, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask, + memory_mask=None, + query_pos_head_inv_sig=self.query_pos_head_inv_sig) + return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, + dn_meta) + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype="float32"): + if spatial_shapes is None: + spatial_shapes = [ + [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] + for s in self.feat_strides + ] + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = paddle.meshgrid( + paddle.arange( + end=h, dtype=dtype), + paddle.arange( + end=w, dtype=dtype)) + grid_xy = paddle.stack([grid_x, grid_y], -1) + + valid_WH = paddle.to_tensor([h, w]).astype(dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) + anchors.append( + paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) + + anchors = paddle.concat(anchors, 1) + valid_mask = ((anchors > self.eps) * + (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = paddle.log(anchors / (1 - anchors)) + anchors = paddle.where(valid_mask, anchors, + paddle.to_tensor(float("inf"))) + return anchors, valid_mask + + def _get_decoder_input(self, + memory, + spatial_shapes, 
+ denoising_class=None, + denoising_bbox_unact=None, + is_teacher=False): + bs, _, _ = memory.shape + # prepare input for decoder + if self.training or self.eval_size is None or is_teacher: + anchors, valid_mask = self._generate_anchors(spatial_shapes) + else: + anchors, valid_mask = self.anchors, self.valid_mask + memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors + + _, topk_ind = paddle.topk( + enc_outputs_class.max(-1), self.num_queries, axis=1) + # extract region proposal boxes + batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + + reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, + topk_ind) # unsigmoided. + enc_topk_bboxes = F.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = paddle.concat( + [denoising_bbox_unact, reference_points_unact], 1) + if self.training: + reference_points_unact = reference_points_unact.detach() + enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) + + # extract region features + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + else: + target = paddle.gather_nd(output_memory, topk_ind) + if self.training: + target = target.detach() + if denoising_class is not None: + target = paddle.concat([denoising_class, target], 1) + + return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits diff --git a/ppdet/modeling/transformers/utils.py b/ppdet/modeling/transformers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f211a78f21de6bc15b9332dc0f823dedbe6efa --- /dev/null +++ b/ppdet/modeling/transformers/utils.py @@ -0,0 +1,410 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
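+#
+# This module collects small helpers used by the transformer modules in this
+# folder: box format conversion (cxcywh <-> xyxy), sigmoid focal / varifocal
+# losses, inverse_sigmoid (a clipped logit), a pure-Paddle deformable
+# attention kernel built on F.grid_sample, and the (contrastive) denoising
+# query builders used for RT-DETR style training.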
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..bbox_utils import bbox_overlaps + +__all__ = [ + '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy', + 'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid', + 'deformable_attention_core_func', 'varifocal_loss_with_logits' +] + + +def _get_clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +def bbox_cxcywh_to_xyxy(x): + cxcy, wh = paddle.split(x, 2, axis=-1) + return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) + + +def bbox_xyxy_to_cxcywh(x): + x1, y1, x2, y2 = x.split(4, axis=-1) + return paddle.concat( + [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) + + +def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): + prob = F.sigmoid(logit) + ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") + p_t = prob * label + (1 - prob) * (1 - label) + loss = ce_loss * ((1 - p_t)**gamma) + + if alpha >= 0: + alpha_t = alpha * label + (1 - alpha) * (1 - label) + loss = alpha_t * loss + return loss.mean(1).sum() / normalizer + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clip(min=0., max=1.) + return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) + + +def deformable_attention_core_func(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor|List): [n_levels, 2] + value_level_start_index (Tensor|List): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, _, n_head, c = value.shape + _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape + + split_shape = [h * w for h, w in value_spatial_shapes] + value_list = value.split(split_shape, axis=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[level].flatten(2).transpose( + [0, 2, 1]).reshape([bs * n_head, c, h, w]) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose( + [0, 2, 1, 3, 4]).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( + [bs * n_head, 1, Len_q, n_levels * n_points]) + output = (paddle.stack( + sampling_value_list, axis=-2).flatten(-2) * + attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) + + return output.transpose([0, 2, 1]) + + +def get_valid_ratio(mask): + _, H, W = paddle.shape(mask) + valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H + valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W + # [b, 2] + return paddle.stack([valid_ratio_w, valid_ratio_h], -1) + + +def get_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + 
label_noise_ratio=0.5, + box_noise_scale=1.0): + if num_denoising <= 0: + return None, None, None, None + num_gts = [len(t) for t in targets["gt_class"]] + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(targets["gt_class"]) + input_query_class = paddle.full( + [bs, max_gt_num], num_classes, dtype='int32') + input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) + pad_gt_mask = paddle.zeros([bs, max_gt_num]) + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) + input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] + pad_gt_mask[i, :num_gt] = 1 + + input_query_class = input_query_class.tile([1, num_group]) + input_query_bbox = input_query_bbox.tile([1, num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, num_group]) + + dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] + dn_positive_idx = paddle.split(dn_positive_idx, + [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * num_group) + + if label_noise_ratio > 0: + input_query_class = input_query_class.flatten() + pad_gt_mask = pad_gt_mask.flatten() + # half of bbox prob + mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) + chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) + # randomly put a new one here + new_label = paddle.randint_like( + chosen_idx, 0, num_classes, dtype=input_query_class.dtype) + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + diff = paddle.concat( + [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], + axis=-1) * box_noise_scale + diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) + input_query_bbox += diff + input_query_bbox = inverse_sigmoid(input_query_bbox) + + class_embed = paddle.concat( + [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) + input_query_class = paddle.gather( + class_embed, input_query_class.flatten(), + axis=0).reshape([bs, num_denoising, -1]) + + tgt_size = num_denoising + num_queries + attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): + num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * + i] = True + else: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): + num_denoising] = True + attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * + i] = True + attn_mask = ~attn_mask + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + return input_query_class, input_query_bbox, attn_mask, dn_meta + + +def get_contrastive_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0): + if num_denoising <= 0: + return None, None, None, None + num_gts = [len(t) for t in targets["gt_class"]] + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else 
num_group + # pad gt to max_num of a batch + bs = len(targets["gt_class"]) + input_query_class = paddle.full( + [bs, max_gt_num], num_classes, dtype='int32') + input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) + pad_gt_mask = paddle.zeros([bs, max_gt_num]) + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) + input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = paddle.split(dn_positive_idx, + [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + input_query_class = input_query_class.flatten() + pad_gt_mask = pad_gt_mask.flatten() + # half of bbox prob + mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) + chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) + # randomly put a new one here + new_label = paddle.randint_like( + chosen_idx, 0, num_classes, dtype=input_query_class.dtype) + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) + + diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, + [1, 1, 2]) * box_noise_scale + + rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = paddle.rand(input_query_bbox.shape) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( + 1 - negative_gt_mask) + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) + input_query_bbox = inverse_sigmoid(input_query_bbox) + + class_embed = paddle.concat( + [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) + input_query_class = paddle.gather( + class_embed, input_query_class.flatten(), + axis=0).reshape([bs, num_denoising, -1]) + + tgt_size = num_denoising + num_queries + attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * + 2 * (i + 1):num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * + i * 2] = True + else: + attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * + 2 * (i + 1):num_denoising] = True + attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * + 2 * i] = True + attn_mask = ~attn_mask + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + return input_query_class, input_query_bbox, attn_mask, dn_meta + + +def 
get_sine_pos_embed(pos_tensor, + num_pos_feats=128, + temperature=10000, + exchange_xy=True): + """generate sine position embedding from a position tensor + + Args: + pos_tensor (Tensor): Shape as `(None, n)`. + num_pos_feats (int): projected shape for each float in the tensor. Default: 128 + temperature (int): The temperature used for scaling + the position embedding. Default: 10000. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is `[x, y]`, the results will # noqa + be `[pos(y), pos(x)]`. Defaults: True. + + Returns: + Tensor: Returned position embedding # noqa + with shape `(None, n * num_pos_feats)`. + """ + scale = 2. * math.pi + dim_t = 2. * paddle.floor_divide( + paddle.arange(num_pos_feats), paddle.to_tensor(2)) + dim_t = scale / temperature**(dim_t / num_pos_feats) + + def sine_func(x): + x *= dim_t + return paddle.stack( + (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2) + + pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = paddle.concat(pos_res, axis=2) + return pos_res + + +def mask_to_box_coordinate(mask, + normalize=False, + format="xyxy", + dtype="float32"): + """ + Compute the bounding boxes around the provided mask. + Args: + mask (Tensor:bool): [b, c, h, w] + + Returns: + bbox (Tensor): [b, c, 4] + """ + assert mask.ndim == 4 + assert format in ["xyxy", "xywh"] + if mask.sum() == 0: + return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype) + + h, w = mask.shape[-2:] + y, x = paddle.meshgrid( + paddle.arange( + end=h, dtype=dtype), paddle.arange( + end=w, dtype=dtype)) + + x_mask = x * mask + x_max = x_mask.flatten(-2).max(-1) + 1 + x_min = paddle.where(mask, x_mask, + paddle.to_tensor(1e8)).flatten(-2).min(-1) + + y_mask = y * mask + y_max = y_mask.flatten(-2).max(-1) + 1 + y_min = paddle.where(mask, y_mask, + paddle.to_tensor(1e8)).flatten(-2).min(-1) + out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) + if normalize: + out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) + + return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) + + +def varifocal_loss_with_logits(pred_logits, + gt_score, + label, + normalizer=1.0, + alpha=0.75, + gamma=2.0): + pred_score = F.sigmoid(pred_logits) + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + loss = F.binary_cross_entropy_with_logits( + pred_logits, gt_score, weight=weight, reduction='none') + return loss.mean(1).sum() / normalizer diff --git a/ppdet/optimizer/__init__.py b/ppdet/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa690dc85029300c4b23fa2a0a27c1ef551c2ef6 --- /dev/null +++ b/ppdet/optimizer/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer +from . 
import ema + +from .optimizer import * +from .ema import * diff --git a/ppdet/optimizer/adamw.py b/ppdet/optimizer/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..12ab619a33623c4f2c30f539e2e1d7f30acf8c24 --- /dev/null +++ b/ppdet/optimizer/adamw.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle.optimizer import AdamW +from functools import partial +import re + +IS_PADDLE_LATER_2_4 = ( + int(paddle.version.major) >= 2 and + int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0 + + +def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): + """ + Args: + decay_rate (float): + The layer-wise decay ratio. + name_dict (dict): + The keys of name_dict is dynamic name of model while the value + of name_dict is static name. + Use model.named_parameters() to get name_dict. + n_layers (int): + Total number of layers in the transformer encoder. + """ + ratio = 1.0 + static_name = name_dict[param.name] + if 'blocks.' in static_name or 'layers.' in static_name: + idx_1 = static_name.find('blocks.') + idx_2 = static_name.find('layers.') + assert any([x >= 0 for x in [idx_1, idx_2]]), '' + idx = idx_1 if idx_1 >= 0 else idx_2 + # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] + + layer = int(static_name[idx:].split('.')[1]) + ratio = decay_rate**(n_layers - layer) + + elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name: + ratio = decay_rate**(n_layers + 1) + + if IS_PADDLE_LATER_2_4: + return ratio + else: + param.optimize_attr['learning_rate'] *= ratio + + +class AdamWDL(AdamW): + r""" + The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. + Generally it's used for transformer model. + + We use "layerwise_lr_decay" as default dynamic lr setting method of AdamWDL. + “Layer-wise decay” means exponentially decaying the learning rates of individual + layers in a top-down manner. For example, suppose the 24-th layer uses a learning + rate l, and the Layer-wise decay rate is α, then the learning rate of layer m + is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. + + .. math:: + & t = t + 1 + + & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad + + & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad + + & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} + + & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + Args: + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. 
+ It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 1e-08. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. + apply_decay_param_fun (function|None, optional): If it is not None, + only tensors that makes apply_decay_param_fun(Tensor.name)==True + will be updated. It only works when we want to specify tensors. + Default: None. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. + layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0. + n_layers (int, optional): The total number of encoder layers. Defaults to 12. + set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the the parameter + learning rate before it executes Adam Operator. Defaults to :ref:`layerwise_lr_decay`. + name_dict (dict, optional): The keys of name_dict is dynamic name of model while the value + of name_dict is static name. Use model.named_parameters() to get name_dict. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + Examples: + .. 
code-block:: python + + import paddle + from paddlenlp.ops.optimizer import AdamWDL + def simple_lr_setting(decay_rate, name_dict, n_layers, param): + ratio = 1.0 + static_name = name_dict[param.name] + if "weight" in static_name: + ratio = decay_rate**0.5 + param.optimize_attr["learning_rate"] *= ratio + + linear = paddle.nn.Linear(10, 10) + + name_dict = dict() + for n, p in linear.named_parameters(): + name_dict[p.name] = n + + inp = paddle.rand([10,10], dtype="float32") + out = linear(inp) + loss = paddle.mean(out) + + adamwdl = AdamWDL( + learning_rate=1e-4, + parameters=linear.parameters(), + set_param_lr_fun=simple_lr_setting, + layerwise_decay=0.8, + name_dict=name_dict) + + loss.backward() + adamwdl.step() + adamwdl.clear_grad() + """ + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=0.01, + apply_decay_param_fun=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + layerwise_decay=1.0, + n_layers=12, + set_param_lr_func=None, + name_dict=None, + name=None): + if not isinstance(layerwise_decay, float): + raise TypeError("coeff should be float or Tensor.") + self.layerwise_decay = layerwise_decay + self.n_layers = n_layers + self.set_param_lr_func = partial( + set_param_lr_func, layerwise_decay, name_dict, + n_layers) if set_param_lr_func is not None else set_param_lr_func + + if IS_PADDLE_LATER_2_4: + super(AdamWDL, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=weight_decay, + lazy_mode=lazy_mode, + multi_precision=multi_precision, + lr_ratio=self.set_param_lr_func) + else: + super(AdamWDL, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=weight_decay, + lazy_mode=lazy_mode, + multi_precision=multi_precision) + + +def _append_optimize_op(self, block, param_and_grad): + if self.set_param_lr_func is None: + return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) + + self._append_decoupled_weight_decay(block, param_and_grad) + prev_lr = param_and_grad[0].optimize_attr["learning_rate"] + self.set_param_lr_func(param_and_grad[0]) + # excute Adam op + res = super(AdamW, self)._append_optimize_op(block, param_and_grad) + param_and_grad[0].optimize_attr["learning_rate"] = prev_lr + return res + + +if not IS_PADDLE_LATER_2_4: + AdamWDL._append_optimize_op = _append_optimize_op + + +def build_adamwdl(model, + lr=1e-4, + weight_decay=0.05, + betas=(0.9, 0.999), + layer_decay=0.65, + num_layers=None, + filter_bias_and_bn=True, + skip_decay_names=None, + set_param_lr_func='layerwise_lr_decay'): + + if skip_decay_names and filter_bias_and_bn: + decay_dict = { + param.name: not (len(param.shape) == 1 or name.endswith('.bias') or + any([_n in name for _n in skip_decay_names])) + for name, param in model.named_parameters() + } + parameters = [p for p in model.parameters()] + + else: + parameters = model.parameters() + + opt_args = dict( + parameters=parameters, learning_rate=lr, weight_decay=weight_decay) + + if decay_dict is not None: + opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n] + + if isinstance(set_param_lr_func, str): + set_param_lr_func = eval(set_param_lr_func) + opt_args['set_param_lr_func'] = set_param_lr_func + + opt_args['beta1'] = 
betas[0] + opt_args['beta2'] = betas[1] + + opt_args['layerwise_decay'] = layer_decay + name_dict = {p.name: n for n, p in model.named_parameters()} + + opt_args['name_dict'] = name_dict + opt_args['n_layers'] = num_layers + + optimizer = AdamWDL(**opt_args) + + return optimizer diff --git a/ppdet/optimizer/ema.py b/ppdet/optimizer/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..70d006b8fe30b6c4895a4a1c5aeee29c04550636 --- /dev/null +++ b/ppdet/optimizer/ema.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import weakref +from copy import deepcopy + +from .utils import get_bn_running_state_names + +__all__ = ['ModelEMA', 'SimpleModelEMA'] + + +class ModelEMA(object): + """ + Exponential Weighted Average for Deep Neutal Networks + Args: + model (nn.Layer): Detector of model. + decay (int): The decay used for updating ema parameter. + Ema's parameter are updated with the formula: + `ema_param = decay * ema_param + (1 - decay) * cur_param`. + Defaults is 0.9998. + ema_decay_type (str): type in ['threshold', 'normal', 'exponential'], + 'threshold' as default. + cycle_epoch (int): The epoch of interval to reset ema_param and + step. Defaults is -1, which means not reset. Its function is to + add a regular effect to ema, which is set according to experience + and is effective when the total training epoch is large. + ema_black_list (set|list|tuple, optional): The custom EMA black_list. + Blacklist of weight names that will not participate in EMA + calculation. Default: None. 
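+        ema_filter_no_grad (bool, optional): If True, parameters with
+            stop_gradient=True (except BatchNorm running mean/variance) are
+            also added to the EMA black list. Default: False.
+
+    A minimal training-loop sketch (illustrative only):
+
+        ema = ModelEMA(model, decay=0.9998, ema_decay_type='threshold')
+        for data in loader:
+            ...              # forward / backward / optimizer.step()
+            ema.update()     # refresh shadow weights from the live model
+        ema_weights = ema.apply()   # averaged state_dict for eval / saving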
+ """ + + def __init__(self, + model, + decay=0.9998, + ema_decay_type='threshold', + cycle_epoch=-1, + ema_black_list=None, + ema_filter_no_grad=False): + self.step = 0 + self.epoch = 0 + self.decay = decay + self.ema_decay_type = ema_decay_type + self.cycle_epoch = cycle_epoch + self.ema_black_list = self._match_ema_black_list( + model.state_dict().keys(), ema_black_list) + bn_states_names = get_bn_running_state_names(model) + if ema_filter_no_grad: + for n, p in model.named_parameters(): + if p.stop_gradient and n not in bn_states_names: + self.ema_black_list.add(n) + + self.state_dict = dict() + for k, v in model.state_dict().items(): + if k in self.ema_black_list: + self.state_dict[k] = v + else: + self.state_dict[k] = paddle.zeros_like(v) + + self._model_state = { + k: weakref.ref(p) + for k, p in model.state_dict().items() + } + + def reset(self): + self.step = 0 + self.epoch = 0 + for k, v in self.state_dict.items(): + if k in self.ema_black_list: + self.state_dict[k] = v + else: + self.state_dict[k] = paddle.zeros_like(v) + + def resume(self, state_dict, step=0): + for k, v in state_dict.items(): + if k in self.state_dict: + if self.state_dict[k].dtype == v.dtype: + self.state_dict[k] = v + else: + self.state_dict[k] = v.astype(self.state_dict[k].dtype) + self.step = step + + def update(self, model=None): + if self.ema_decay_type == 'threshold': + decay = min(self.decay, (1 + self.step) / (10 + self.step)) + elif self.ema_decay_type == 'exponential': + decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000)) + else: + decay = self.decay + self._decay = decay + + if model is not None: + model_dict = model.state_dict() + else: + model_dict = {k: p() for k, p in self._model_state.items()} + assert all( + [v is not None for _, v in model_dict.items()]), 'python gc.' + + for k, v in self.state_dict.items(): + if k not in self.ema_black_list: + v = decay * v + (1 - decay) * model_dict[k] + v.stop_gradient = True + self.state_dict[k] = v + self.step += 1 + + def apply(self): + if self.step == 0: + return self.state_dict + state_dict = dict() + for k, v in self.state_dict.items(): + if k in self.ema_black_list: + v.stop_gradient = True + state_dict[k] = v + else: + if self.ema_decay_type != 'exponential': + v = v / (1 - self._decay**self.step) + v.stop_gradient = True + state_dict[k] = v + self.epoch += 1 + if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: + self.reset() + + return state_dict + + def _match_ema_black_list(self, weight_name, ema_black_list=None): + out_list = set() + if ema_black_list: + for name in weight_name: + for key in ema_black_list: + if key in name: + out_list.add(name) + return out_list + + +class SimpleModelEMA(object): + """ + Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + + def __init__(self, model=None, decay=0.9996): + """ + Args: + model (nn.Module): model to apply EMA. + decay (float): ema decay reate. 
+ """ + self.model = deepcopy(model) + self.decay = decay + + def update(self, model, decay=None): + if decay is None: + decay = self.decay + + with paddle.no_grad(): + state = {} + msd = model.state_dict() + for k, v in self.model.state_dict().items(): + if paddle.is_floating_point(v): + v *= decay + v += (1.0 - decay) * msd[k].detach() + state[k] = v + self.model.set_state_dict(state) + + def resume(self, state_dict, step=0): + state = {} + msd = state_dict + for k, v in self.model.state_dict().items(): + if paddle.is_floating_point(v): + v = msd[k].detach() + state[k] = v + self.model.set_state_dict(state) + self.step = step diff --git a/ppdet/optimizer/optimizer.py b/ppdet/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8235ade1f1514096ae920843df4e45af9f775723 --- /dev/null +++ b/ppdet/optimizer/optimizer.py @@ -0,0 +1,385 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import math +import paddle +import paddle.nn as nn + +import paddle.optimizer as optimizer +import paddle.regularizer as regularizer + +from ppdet.core.workspace import register, serializable +import copy + +from .adamw import AdamWDL, build_adamwdl + +__all__ = ['LearningRate', 'OptimizerBuilder'] + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@serializable +class CosineDecay(object): + """ + Cosine learning rate decay + + Args: + max_epochs (int): max epochs for the training process. + if you commbine cosine decay with warmup, it is recommended that + the max_iters is much larger than the warmup iter + use_warmup (bool): whether to use warmup. Default: True. + min_lr_ratio (float): minimum learning rate ratio. Default: 0. + last_plateau_epochs (int): use minimum learning rate in + the last few epochs. Default: 0. 
+ """ + + def __init__(self, + max_epochs=1000, + use_warmup=True, + min_lr_ratio=0., + last_plateau_epochs=0): + self.max_epochs = max_epochs + self.use_warmup = use_warmup + self.min_lr_ratio = min_lr_ratio + self.last_plateau_epochs = last_plateau_epochs + + def __call__(self, + base_lr=None, + boundary=None, + value=None, + step_per_epoch=None): + assert base_lr is not None, "either base LR or values should be provided" + + max_iters = self.max_epochs * int(step_per_epoch) + last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) + min_lr = base_lr * self.min_lr_ratio + if boundary is not None and value is not None and self.use_warmup: + # use warmup + warmup_iters = len(boundary) + for i in range(int(boundary[-1]), max_iters): + boundary.append(i) + if i < max_iters - last_plateau_iters: + decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( + (i - warmup_iters) * math.pi / + (max_iters - warmup_iters - last_plateau_iters)) + 1) + value.append(decayed_lr) + else: + value.append(min_lr) + return optimizer.lr.PiecewiseDecay(boundary, value) + elif last_plateau_iters > 0: + # not use warmup, but set `last_plateau_epochs` > 0 + boundary = [] + value = [] + for i in range(max_iters): + if i < max_iters - last_plateau_iters: + decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( + i * math.pi / (max_iters - last_plateau_iters)) + 1) + value.append(decayed_lr) + else: + value.append(min_lr) + if i > 0: + boundary.append(i) + return optimizer.lr.PiecewiseDecay(boundary, value) + + return optimizer.lr.CosineAnnealingDecay( + base_lr, T_max=max_iters, eta_min=min_lr) + + +@serializable +class PiecewiseDecay(object): + """ + Multi step learning rate decay + + Args: + gamma (float | list): decay factor + milestones (list): steps at which to decay learning rate + """ + + def __init__(self, + gamma=[0.1, 0.01], + milestones=[8, 11], + values=None, + use_warmup=True): + super(PiecewiseDecay, self).__init__() + if type(gamma) is not list: + self.gamma = [] + for i in range(len(milestones)): + self.gamma.append(gamma / 10**i) + else: + self.gamma = gamma + self.milestones = milestones + self.values = values + self.use_warmup = use_warmup + + def __call__(self, + base_lr=None, + boundary=None, + value=None, + step_per_epoch=None): + if boundary is not None and self.use_warmup: + boundary.extend([int(step_per_epoch) * i for i in self.milestones]) + else: + # do not use LinearWarmup + boundary = [int(step_per_epoch) * i for i in self.milestones] + value = [base_lr] # during step[0, boundary[0]] is base_lr + + # self.values is setted directly in config + if self.values is not None: + assert len(self.milestones) + 1 == len(self.values) + return optimizer.lr.PiecewiseDecay(boundary, self.values) + + # value is computed by self.gamma + value = value if value is not None else [base_lr] + for i in self.gamma: + value.append(base_lr * i) + + return optimizer.lr.PiecewiseDecay(boundary, value) + + +@serializable +class YOLOv5LRDecay(object): + def __init__(self, max_epochs=300, min_lr_ratio=0.01, use_warmup=True): + self.max_epochs = max_epochs + self.min_lr_ratio = min_lr_ratio + self.use_warmup = use_warmup + + def __call__(self, + base_lr=None, + boundary=None, + value=None, + step_per_epoch=None): + assert base_lr is not None, "either base LR or values should be provided" + + max_iters = self.max_epochs * int(step_per_epoch) + warmup_iters = int(boundary[-1]) + + for i in range(warmup_iters + 1, max_iters): + boundary.append(i) + epoch_i = i // step_per_epoch - 1 + if epoch_i == 2: 
# TODO + epoch_i = epoch_i + 1 + + decayed_lr = base_lr * ( + (1 - epoch_i / self.max_epochs) * + (1.0 - self.min_lr_ratio) + self.min_lr_ratio) + value.append(decayed_lr) + return optimizer.lr.PiecewiseDecay(boundary, value) + + +@serializable +class LinearWarmup(object): + """ + Warm up learning rate linearly + + Args: + steps (int): warm up steps + start_factor (float): initial learning rate factor + epochs (int|None): use epochs as warm up steps, the priority + of `epochs` is higher than `steps`. Default: None. + """ + + def __init__(self, steps=500, start_factor=1. / 3, epochs=None): + super(LinearWarmup, self).__init__() + self.steps = steps + self.start_factor = start_factor + self.epochs = epochs + + def __call__(self, base_lr, step_per_epoch): + boundary = [] + value = [] + warmup_steps = self.epochs * step_per_epoch \ + if self.epochs is not None else self.steps + warmup_steps = max(warmup_steps, 1) + for i in range(warmup_steps + 1): + if warmup_steps > 0: + alpha = i / warmup_steps + factor = self.start_factor * (1 - alpha) + alpha + lr = base_lr * factor + value.append(lr) + if i > 0: + boundary.append(i) + return boundary, value + + +@serializable +class ExpWarmup(object): + """ + Warm up learning rate in exponential mode + Args: + steps (int): warm up steps. + epochs (int|None): use epochs as warm up steps, the priority + of `epochs` is higher than `steps`. Default: None. + power (int): Exponential coefficient. Default: 2. + """ + + def __init__(self, steps=1000, epochs=None, power=2): + super(ExpWarmup, self).__init__() + self.steps = steps + self.epochs = epochs + self.power = power + + def __call__(self, base_lr, step_per_epoch): + boundary = [] + value = [] + warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps + warmup_steps = max(warmup_steps, 1) + for i in range(warmup_steps + 1): + factor = (i / float(warmup_steps))**self.power + value.append(base_lr * factor) + if i > 0: + boundary.append(i) + return boundary, value + + +@register +class LearningRate(object): + """ + Learning Rate configuration + + Args: + base_lr (float): base learning rate + schedulers (list): learning rate schedulers + """ + __category__ = 'optim' + + def __init__(self, + base_lr=0.01, + schedulers=[PiecewiseDecay(), LinearWarmup()]): + super(LearningRate, self).__init__() + self.base_lr = base_lr + self.schedulers = [] + + schedulers = copy.deepcopy(schedulers) + for sched in schedulers: + if isinstance(sched, dict): + # support dict sched instantiate + module = sys.modules[__name__] + type = sched.pop("name") + scheduler = getattr(module, type)(**sched) + self.schedulers.append(scheduler) + else: + self.schedulers.append(sched) + + def __call__(self, step_per_epoch): + assert len(self.schedulers) >= 1 + if not self.schedulers[0].use_warmup: + return self.schedulers[0](base_lr=self.base_lr, + step_per_epoch=step_per_epoch) + + # TODO: split warmup & decay + # warmup + boundary, value = self.schedulers[1](self.base_lr, step_per_epoch) + # decay + decay_lr = self.schedulers[0](self.base_lr, boundary, value, + step_per_epoch) + return decay_lr + + +@register +class OptimizerBuilder(): + """ + Build optimizer handles + Args: + regularizer (object): an `Regularizer` instance + optimizer (object): an `Optimizer` instance + """ + __category__ = 'optim' + + def __init__(self, + clip_grad_by_norm=None, + clip_grad_by_value=None, + regularizer={'type': 'L2', + 'factor': .0001}, + optimizer={'type': 'Momentum', + 'momentum': .9}): + self.clip_grad_by_norm = 
clip_grad_by_norm + self.clip_grad_by_value = clip_grad_by_value + self.regularizer = regularizer + self.optimizer = optimizer + + def __call__(self, learning_rate, model=None): + if self.clip_grad_by_norm is not None: + grad_clip = nn.ClipGradByGlobalNorm( + clip_norm=self.clip_grad_by_norm) + elif self.clip_grad_by_value is not None: + var = abs(self.clip_grad_by_value) + grad_clip = nn.ClipGradByValue(min=-var, max=var) + else: + grad_clip = None + if self.regularizer and self.regularizer != 'None': + reg_type = self.regularizer['type'] + 'Decay' + reg_factor = self.regularizer['factor'] + regularization = getattr(regularizer, reg_type)(reg_factor) + else: + regularization = None + + optim_args = self.optimizer.copy() + optim_type = optim_args['type'] + del optim_args['type'] + + if optim_type == 'AdamWDL': + return build_adamwdl(model, lr=learning_rate, **optim_args) + + if optim_type != 'AdamW': + optim_args['weight_decay'] = regularization + + op = getattr(optimizer, optim_type) + + if 'param_groups' in optim_args: + assert isinstance(optim_args['param_groups'], list), '' + + param_groups = optim_args.pop('param_groups') + + params, visited = [], [] + for group in param_groups: + assert isinstance(group, + dict) and 'params' in group and isinstance( + group['params'], list), '' + _params = { + n: p + for n, p in model.named_parameters() + if any([k in n + for k in group['params']]) and p.trainable is True + } + _group = group.copy() + _group.update({'params': list(_params.values())}) + + params.append(_group) + visited.extend(list(_params.keys())) + + ext_params = [ + p for n, p in model.named_parameters() + if n not in visited and p.trainable is True + ] + + if len(ext_params) < len(model.parameters()): + params.append({'params': ext_params}) + + elif len(ext_params) > len(model.parameters()): + raise RuntimeError + + else: + _params = model.parameters() + params = [param for param in _params if param.trainable is True] + + return op(learning_rate=learning_rate, + parameters=params, + grad_clip=grad_clip, + **optim_args) diff --git a/ppdet/optimizer/utils.py b/ppdet/optimizer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ce2de49bf5973ee0b69a9ecc62028cca67f4d1e0 --- /dev/null +++ b/ppdet/optimizer/utils.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
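+#
+# Helper for ModelEMA: BatchNorm running mean/variance are buffers rather than
+# parameters, so their full state names are collected here and excluded from
+# the `ema_filter_no_grad` black list in ppdet/optimizer/ema.py.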
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from typing import List + + +def get_bn_running_state_names(model: nn.Layer) -> List[str]: + """Get all bn state full names including running mean and variance + """ + names = [] + for n, m in model.named_sublayers(): + if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)): + assert hasattr(m, '_mean'), f'assert {m} has _mean' + assert hasattr(m, '_variance'), f'assert {m} has _variance' + running_mean = f'{n}._mean' + running_var = f'{n}._variance' + names.extend([running_mean, running_var]) + + return names diff --git a/ppdet/slim/__init__.py b/ppdet/slim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..712919002ff49d9ff503fa8caaed85c954a02104 --- /dev/null +++ b/ppdet/slim/__init__.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import distill_loss +from . import distill_model +from . import ofa +from . import prune +from . import quant +from . import unstructured_prune + +from .distill_loss import * +from .distill_model import * +from .ofa import * +from .prune import * +from .quant import * +from .unstructured_prune import * + +import yaml +from ppdet.core.workspace import load_config +from ppdet.utils.checkpoint import load_pretrain_weight + + +def build_slim_model(cfg, slim_cfg, mode='train'): + with open(slim_cfg) as f: + slim_load_cfg = yaml.load(f, Loader=yaml.Loader) + + if mode != 'train' and slim_load_cfg['slim'] == 'Distill': + return cfg + + if slim_load_cfg['slim'] == 'Distill': + if "slim_method" in slim_load_cfg and slim_load_cfg[ + 'slim_method'] == "FGD": + model = FGDDistillModel(cfg, slim_cfg) + elif "slim_method" in slim_load_cfg and slim_load_cfg[ + 'slim_method'] == "LD": + model = LDDistillModel(cfg, slim_cfg) + elif "slim_method" in slim_load_cfg and slim_load_cfg[ + 'slim_method'] == "CWD": + model = CWDDistillModel(cfg, slim_cfg) + elif "slim_method" in slim_load_cfg and slim_load_cfg[ + 'slim_method'] == "PPYOLOEDistill": + model = PPYOLOEDistillModel(cfg, slim_cfg) + else: + # common distillation model + model = DistillModel(cfg, slim_cfg) + cfg['model'] = model + cfg['slim_type'] = cfg.slim + elif slim_load_cfg['slim'] == 'OFA': + load_config(slim_cfg) + model = create(cfg.architecture) + load_pretrain_weight(model, cfg.weights) + slim = create(cfg.slim) + cfg['slim'] = slim + cfg['model'] = slim(model, model.state_dict()) + cfg['slim_type'] = cfg.slim + elif slim_load_cfg['slim'] == 'DistillPrune': + if mode == 'train': + model = DistillModel(cfg, slim_cfg) + pruner = create(cfg.pruner) + pruner(model.student_model) + else: + model = create(cfg.architecture) + weights = cfg.weights + load_config(slim_cfg) + pruner = create(cfg.pruner) + model = pruner(model) + load_pretrain_weight(model, weights) + cfg['model'] = model + cfg['slim_type'] = cfg.slim + elif slim_load_cfg['slim'] == 
'PTQ': + model = create(cfg.architecture) + load_config(slim_cfg) + load_pretrain_weight(model, cfg.weights) + slim = create(cfg.slim) + cfg['slim_type'] = cfg.slim + cfg['slim'] = slim + cfg['model'] = slim(model) + elif slim_load_cfg['slim'] == 'UnstructuredPruner': + load_config(slim_cfg) + slim = create(cfg.slim) + cfg['slim_type'] = cfg.slim + cfg['slim'] = slim + cfg['unstructured_prune'] = True + else: + load_config(slim_cfg) + model = create(cfg.architecture) + if mode == 'train': + load_pretrain_weight(model, cfg.pretrain_weights) + slim = create(cfg.slim) + cfg['slim_type'] = cfg.slim + # TODO: fix quant export model in framework. + if mode == 'test' and 'QAT' in slim_load_cfg['slim']: + slim.quant_config['activation_preprocess_type'] = None + cfg['model'] = slim(model) + cfg['slim'] = slim + if mode != 'train': + load_pretrain_weight(cfg['model'], cfg.weights) + + return cfg diff --git a/ppdet/slim/distill_loss.py b/ppdet/slim/distill_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d325a5b2ac93983256bf8c07b165354f0b4ffd98 --- /dev/null +++ b/ppdet/slim/distill_loss.py @@ -0,0 +1,919 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ppdet.core.workspace import register +from ppdet.modeling import ops +from ppdet.modeling.losses.iou_loss import GIoULoss +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'DistillYOLOv3Loss', + 'KnowledgeDistillationKLDivLoss', + 'DistillPPYOLOELoss', + 'FGDFeatureLoss', + 'CWDFeatureLoss', + 'PKDFeatureLoss', + 'MGDFeatureLoss', +] + + +def parameter_init(mode="kaiming", value=0.): + if mode == "kaiming": + weight_attr = paddle.nn.initializer.KaimingUniform() + elif mode == "constant": + weight_attr = paddle.nn.initializer.Constant(value=value) + else: + weight_attr = paddle.nn.initializer.KaimingUniform() + + weight_init = ParamAttr(initializer=weight_attr) + return weight_init + + +def feature_norm(feat): + # Normalize the feature maps to have zero mean and unit variances. 
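+    # The (N, C, H, W) tensor is flattened to (C, N*H*W) so that mean/std are
+    # computed per channel over all samples and spatial positions; the
+    # standardized values are then reshaped back to (N, C, H, W).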
+ assert len(feat.shape) == 4 + N, C, H, W = feat.shape + feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1]) + mean = feat.mean(axis=-1, keepdim=True) + std = feat.std(axis=-1, keepdim=True) + feat = (feat - mean) / (std + 1e-6) + return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3]) + + +@register +class DistillYOLOv3Loss(nn.Layer): + def __init__(self, weight=1000): + super(DistillYOLOv3Loss, self).__init__() + self.loss_weight = weight + + def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): + loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) + loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) + loss_w = paddle.abs(sw - tw) + loss_h = paddle.abs(sh - th) + loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) + weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) + return weighted_loss + + def obj_weighted_cls(self, scls, tcls, tobj): + loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) + weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) + return weighted_loss + + def obj_loss(self, sobj, tobj): + obj_mask = paddle.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + loss = paddle.mean( + ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) + return loss + + def forward(self, teacher_model, student_model): + teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs + student_distill_pairs = student_model.yolo_head.loss.distill_pairs + distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] + for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): + distill_reg_loss.append( + self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ + 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) + distill_cls_loss.append( + self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) + distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) + distill_reg_loss = paddle.add_n(distill_reg_loss) + distill_cls_loss = paddle.add_n(distill_cls_loss) + distill_obj_loss = paddle.add_n(distill_obj_loss) + loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss + ) * self.loss_weight + return loss + + +@register +class KnowledgeDistillationKLDivLoss(nn.Layer): + """Loss function for knowledge distilling using KL divergence. + + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + T (int): Temperature for distillation. + """ + + def __init__(self, reduction='mean', loss_weight=1.0, T=10): + super(KnowledgeDistillationKLDivLoss, self).__init__() + assert reduction in ('none', 'mean', 'sum') + assert T >= 1 + self.reduction = reduction + self.loss_weight = loss_weight + self.T = T + + def knowledge_distillation_kl_div_loss(self, + pred, + soft_label, + T, + detach_target=True): + r"""Loss function for knowledge distilling using KL divergence. + + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + T (int): Temperature for distillation. + detach_target (bool): Remove soft_label from automatic differentiation + """ + assert pred.shape == soft_label.shape + target = F.softmax(soft_label / T, axis=1) + if detach_target: + target = target.detach() + + kd_loss = F.kl_div( + F.log_softmax( + pred / T, axis=1), target, reduction='none').mean(1) * (T * T) + + return kd_loss + + def forward(self, + pred, + soft_label, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. 
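+        Applies the temperature-scaled KL divergence between `pred` and
+        `soft_label`, then the optional per-sample `weight`, the selected
+        reduction (or `avg_factor` normalization) and finally `loss_weight`.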
+ + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + + reduction = (reduction_override + if reduction_override else self.reduction) + + loss_kd_out = self.knowledge_distillation_kl_div_loss( + pred, soft_label, T=self.T) + + if weight is not None: + loss_kd_out = weight * loss_kd_out + + if avg_factor is None: + if reduction == 'none': + loss = loss_kd_out + elif reduction == 'mean': + loss = loss_kd_out.mean() + elif reduction == 'sum': + loss = loss_kd_out.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss_kd_out.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError( + 'avg_factor can not be used with reduction="sum"') + + loss_kd = self.loss_weight * loss + return loss_kd + + +@register +class DistillPPYOLOELoss(nn.Layer): + def __init__( + self, + loss_weight={'logits': 4.0, + 'feat': 1.0}, + logits_distill=True, + logits_loss_weight={'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5}, + logits_ld_distill=False, + logits_ld_params={'weight': 20000, + 'T': 10}, + feat_distill=True, + feat_distiller='fgd', + feat_distill_place='neck_feats', + teacher_width_mult=1.0, # L + student_width_mult=0.75, # M + feat_out_channels=[768, 384, 192]): + super(DistillPPYOLOELoss, self).__init__() + self.loss_weight_logits = loss_weight['logits'] + self.loss_weight_feat = loss_weight['feat'] + self.logits_distill = logits_distill + self.logits_ld_distill = logits_ld_distill + self.feat_distill = feat_distill + + if logits_distill and self.loss_weight_logits > 0: + self.bbox_loss_weight = logits_loss_weight['iou'] + self.dfl_loss_weight = logits_loss_weight['dfl'] + self.qfl_loss_weight = logits_loss_weight['class'] + self.loss_bbox = GIoULoss() + + if logits_ld_distill: + self.loss_kd = KnowledgeDistillationKLDivLoss( + loss_weight=logits_ld_params['weight'], T=logits_ld_params['T']) + + if feat_distill and self.loss_weight_feat > 0: + assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic'] + assert feat_distill_place in ['backbone_feats', 'neck_feats'] + self.feat_distill_place = feat_distill_place + self.t_channel_list = [ + int(c * teacher_width_mult) for c in feat_out_channels + ] + self.s_channel_list = [ + int(c * student_width_mult) for c in feat_out_channels + ] + self.distill_feat_loss_modules = [] + for i in range(len(feat_out_channels)): + if feat_distiller == 'cwd': + feat_loss_module = CWDFeatureLoss( + student_channels=self.s_channel_list[i], + teacher_channels=self.t_channel_list[i], + normalize=True) + elif feat_distiller == 'fgd': + feat_loss_module = FGDFeatureLoss( + student_channels=self.s_channel_list[i], + teacher_channels=self.t_channel_list[i], + normalize=True, + alpha_fgd=0.00001, + beta_fgd=0.000005, + gamma_fgd=0.00001, + lambda_fgd=0.00000005) + elif feat_distiller == 'pkd': + feat_loss_module = PKDFeatureLoss( + student_channels=self.s_channel_list[i], + teacher_channels=self.t_channel_list[i], + normalize=True, + resize_stu=True) + elif feat_distiller == 'mgd': + 
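+                    # MGD (Masked Generative Distillation): the student feature is
+                    # regenerated by a small conv block before being matched to the
+                    # teacher feature with an MSE or SSIM objective, see MGDFeatureLoss.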
feat_loss_module = MGDFeatureLoss( + student_channels=self.s_channel_list[i], + teacher_channels=self.t_channel_list[i], + normalize=True, + loss_func='ssim') + elif feat_distiller == 'mimic': + feat_loss_module = MimicFeatureLoss( + student_channels=self.s_channel_list[i], + teacher_channels=self.t_channel_list[i], + normalize=True) + else: + raise ValueError + self.distill_feat_loss_modules.append(feat_loss_module) + + def quality_focal_loss(self, + pred_logits, + soft_target_logits, + beta=2.0, + use_sigmoid=False, + num_total_pos=None): + if use_sigmoid: + func = F.binary_cross_entropy_with_logits + soft_target = F.sigmoid(soft_target_logits) + pred_sigmoid = F.sigmoid(pred_logits) + preds = pred_logits + else: + func = F.binary_cross_entropy + soft_target = soft_target_logits + pred_sigmoid = pred_logits + preds = pred_sigmoid + + scale_factor = pred_sigmoid - soft_target + loss = func( + preds, soft_target, reduction='none') * scale_factor.abs().pow(beta) + loss = loss.sum(1) + + if num_total_pos is not None: + loss = loss.sum() / num_total_pos + else: + loss = loss.mean() + return loss + + def bbox_loss(self, s_bbox, t_bbox, weight_targets=None): + # [x,y,w,h] + if weight_targets is not None: + loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets) + avg_factor = weight_targets.sum() + loss = loss / avg_factor + else: + loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox)) + return loss + + def distribution_focal_loss(self, + pred_corners, + target_corners, + weight_targets=None): + target_corners_label = F.softmax(target_corners, axis=-1) + loss_dfl = F.cross_entropy( + pred_corners, + target_corners_label, + soft_label=True, + reduction='none') + loss_dfl = loss_dfl.sum(1) + + if weight_targets is not None: + loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1])) + loss_dfl = loss_dfl.sum(-1) / weight_targets.sum() + else: + loss_dfl = loss_dfl.mean(-1) + return loss_dfl / 4.0 # 4 direction + + def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes): + num_pos = mask_positive.sum() + if num_pos > 0: + cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes]) + pred_scores_pos = paddle.masked_select( + pred_scores, cls_mask).reshape([-1, num_classes]) + soft_cls_pos = paddle.masked_select( + soft_cls, cls_mask).reshape([-1, num_classes]) + loss_kd = self.loss_kd( + pred_scores_pos, soft_cls_pos, avg_factor=num_pos) + else: + loss_kd = paddle.zeros([1]) + return loss_kd + + def forward(self, teacher_model, student_model): + teacher_distill_pairs = teacher_model.yolo_head.distill_pairs + student_distill_pairs = student_model.yolo_head.distill_pairs + if self.logits_distill and self.loss_weight_logits > 0: + distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], [] + + distill_cls_loss.append( + self.quality_focal_loss( + student_distill_pairs['pred_cls_scores'].reshape( + (-1, student_distill_pairs['pred_cls_scores'].shape[-1] + )), + teacher_distill_pairs['pred_cls_scores'].detach().reshape( + (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1] + )), + num_total_pos=student_distill_pairs['pos_num'], + use_sigmoid=False)) + + distill_bbox_loss.append( + self.bbox_loss(student_distill_pairs['pred_bboxes_pos'], + teacher_distill_pairs['pred_bboxes_pos'].detach(), + weight_targets=student_distill_pairs['bbox_weight'] + ) if 'pred_bboxes_pos' in student_distill_pairs and \ + 'pred_bboxes_pos' in teacher_distill_pairs and \ + 'bbox_weight' in student_distill_pairs + else paddle.zeros([1])) + + distill_dfl_loss.append( + 
self.distribution_focal_loss( + student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])), + teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \ + weight_targets=student_distill_pairs['bbox_weight'] + ) if 'pred_dist_pos' in student_distill_pairs and \ + 'pred_dist_pos' in teacher_distill_pairs and \ + 'bbox_weight' in student_distill_pairs + else paddle.zeros([1])) + + distill_cls_loss = paddle.add_n(distill_cls_loss) + distill_bbox_loss = paddle.add_n(distill_bbox_loss) + distill_dfl_loss = paddle.add_n(distill_dfl_loss) + logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight + + if self.logits_ld_distill: + loss_kd = self.main_kd( + student_distill_pairs['mask_positive_select'], + student_distill_pairs['pred_cls_scores'], + teacher_distill_pairs['pred_cls_scores'], + student_model.yolo_head.num_classes, ) + logits_loss += loss_kd + else: + logits_loss = paddle.zeros([1]) + + if self.feat_distill and self.loss_weight_feat > 0: + feat_loss_list = [] + inputs = student_model.inputs + assert 'gt_bbox' in inputs + assert self.feat_distill_place in student_distill_pairs + assert self.feat_distill_place in teacher_distill_pairs + stu_feats = student_distill_pairs[self.feat_distill_place] + tea_feats = teacher_distill_pairs[self.feat_distill_place] + for i, loss_module in enumerate(self.distill_feat_loss_modules): + feat_loss_list.append( + loss_module(stu_feats[i], tea_feats[i], inputs)) + feat_loss = paddle.add_n(feat_loss_list) + else: + feat_loss = paddle.zeros([1]) + + student_model.yolo_head.distill_pairs.clear() + teacher_model.yolo_head.distill_pairs.clear() + return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat + + +@register +class CWDFeatureLoss(nn.Layer): + def __init__(self, + student_channels, + teacher_channels, + normalize=False, + tau=1.0, + weight=1.0): + super(CWDFeatureLoss, self).__init__() + self.normalize = normalize + self.tau = tau + self.loss_weight = weight + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0) + else: + self.align = None + + def distill_softmax(self, x, tau): + _, _, w, h = paddle.shape(x) + x = paddle.reshape(x, [-1, w * h]) + x /= tau + return F.softmax(x, axis=1) + + def forward(self, preds_s, preds_t, inputs=None): + assert preds_s.shape[-2:] == preds_t.shape[-2:] + N, C, H, W = preds_s.shape + eps = 1e-5 + if self.align is not None: + preds_s = self.align(preds_s) + + if self.normalize: + preds_s = feature_norm(preds_s) + preds_t = feature_norm(preds_t) + + softmax_pred_s = self.distill_softmax(preds_s, self.tau) + softmax_pred_t = self.distill_softmax(preds_t, self.tau) + + loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) + + softmax_pred_t * paddle.log(eps + softmax_pred_t)) + return self.loss_weight * loss / (C * N) + + +@register +class FGDFeatureLoss(nn.Layer): + """ + Focal and Global Knowledge Distillation for Detectors + The code is reference from https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py + + Args: + student_channels (int): The number of channels in the student's FPN feature map. Default to 256. + teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256. + normalize (bool): Whether to normalize the feature maps. 
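+        loss_weight (float): Weight of the total loss. Defaults to 1.0.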
+ temp (float, optional): The temperature coefficient. Defaults to 0.5. + alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001 + beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 + gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 + lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 + """ + + def __init__(self, + student_channels, + teacher_channels, + normalize=False, + loss_weight=1.0, + temp=0.5, + alpha_fgd=0.001, + beta_fgd=0.0005, + gamma_fgd=0.001, + lambda_fgd=0.000005): + super(FGDFeatureLoss, self).__init__() + self.normalize = normalize + self.loss_weight = loss_weight + self.temp = temp + self.alpha_fgd = alpha_fgd + self.beta_fgd = beta_fgd + self.gamma_fgd = gamma_fgd + self.lambda_fgd = lambda_fgd + kaiming_init = parameter_init("kaiming") + zeros_init = parameter_init("constant", 0.0) + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=kaiming_init) + student_channels = teacher_channels + else: + self.align = None + + self.conv_mask_s = nn.Conv2D( + student_channels, 1, kernel_size=1, weight_attr=kaiming_init) + self.conv_mask_t = nn.Conv2D( + teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) + + self.stu_conv_block = nn.Sequential( + nn.Conv2D( + student_channels, + student_channels // 2, + kernel_size=1, + weight_attr=zeros_init), + nn.LayerNorm([student_channels // 2, 1, 1]), + nn.ReLU(), + nn.Conv2D( + student_channels // 2, + student_channels, + kernel_size=1, + weight_attr=zeros_init)) + self.tea_conv_block = nn.Sequential( + nn.Conv2D( + teacher_channels, + teacher_channels // 2, + kernel_size=1, + weight_attr=zeros_init), + nn.LayerNorm([teacher_channels // 2, 1, 1]), + nn.ReLU(), + nn.Conv2D( + teacher_channels // 2, + teacher_channels, + kernel_size=1, + weight_attr=zeros_init)) + + def spatial_channel_attention(self, x, t=0.5): + shape = paddle.shape(x) + N, C, H, W = shape + _f = paddle.abs(x) + spatial_map = paddle.reshape( + paddle.mean( + _f, axis=1, keepdim=True) / t, [N, -1]) + spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W + spatial_att = paddle.reshape(spatial_map, [N, H, W]) + + channel_map = paddle.mean( + paddle.mean( + _f, axis=2, keepdim=False), axis=2, keepdim=False) + channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C + return [spatial_att, channel_att] + + def spatial_pool(self, x, mode="teacher"): + batch, channel, width, height = x.shape + x_copy = x + x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) + x_copy = x_copy.unsqueeze(1) + if mode.lower() == "student": + context_mask = self.conv_mask_s(x) + else: + context_mask = self.conv_mask_t(x) + + context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) + context_mask = F.softmax(context_mask, axis=2) + context_mask = context_mask.unsqueeze(-1) + context = paddle.matmul(x_copy, context_mask) + context = paddle.reshape(context, [batch, channel, 1, 1]) + return context + + def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, + tea_spatial_att): + def _func(a, b): + return paddle.sum(paddle.abs(a - b)) / len(a) + + mask_loss = _func(stu_channel_att, tea_channel_att) + _func( + stu_spatial_att, tea_spatial_att) + return mask_loss + + def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg, + tea_channel_att, tea_spatial_att): + mask_fg = mask_fg.unsqueeze(axis=1) + mask_bg = 
mask_bg.unsqueeze(axis=1) + tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1) + tea_spatial_att = tea_spatial_att.unsqueeze(axis=1) + + fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att)) + fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att)) + fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg)) + bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg)) + + fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att)) + fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att)) + fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg)) + bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg)) + + fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(mask_fg) + bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(mask_bg) + return fg_loss, bg_loss + + def relation_loss(self, stu_feature, tea_feature): + context_s = self.spatial_pool(stu_feature, "student") + context_t = self.spatial_pool(tea_feature, "teacher") + out_s = stu_feature + self.stu_conv_block(context_s) + out_t = tea_feature + self.tea_conv_block(context_t) + rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s) + return rela_loss + + def mask_value(self, mask, xl, xr, yl, yr, value): + mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value) + return mask + + def forward(self, stu_feature, tea_feature, inputs): + assert stu_feature.shape[-2:] == stu_feature.shape[-2:] + assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys() + gt_bboxes = inputs['gt_bbox'] + ins_shape = [ + inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0]) + ] + index_gt = [] + for i in range(len(gt_bboxes)): + if gt_bboxes[i].size > 2: + index_gt.append(i) + # only distill feature with labeled GTbox + if len(index_gt) != len(gt_bboxes): + index_gt_t = paddle.to_tensor(index_gt) + stu_feature = paddle.index_select(stu_feature, index_gt_t) + tea_feature = paddle.index_select(tea_feature, index_gt_t) + + ins_shape = [ins_shape[c] for c in index_gt] + gt_bboxes = [gt_bboxes[c] for c in index_gt] + assert len(gt_bboxes) == tea_feature.shape[0] + + if self.align is not None: + stu_feature = self.align(stu_feature) + + if self.normalize: + stu_feature = feature_norm(stu_feature) + tea_feature = feature_norm(tea_feature) + + tea_spatial_att, tea_channel_att = self.spatial_channel_attention( + tea_feature, self.temp) + stu_spatial_att, stu_channel_att = self.spatial_channel_attention( + stu_feature, self.temp) + + mask_fg = paddle.zeros(tea_spatial_att.shape) + mask_bg = paddle.ones_like(tea_spatial_att) + one_tmp = paddle.ones([*tea_spatial_att.shape[1:]]) + zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]]) + mask_fg.stop_gradient = True + mask_bg.stop_gradient = True + one_tmp.stop_gradient = True + zero_tmp.stop_gradient = True + + wmin, wmax, hmin, hmax = [], [], [], [] + + if len(gt_bboxes) == 0: + loss = self.relation_loss(stu_feature, tea_feature) + return self.lambda_fgd * loss + + N, _, H, W = stu_feature.shape + for i in range(N): + tmp_box = paddle.ones_like(gt_bboxes[i]) + tmp_box.stop_gradient = True + tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W + tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W + tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H + tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H + + zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32") + ones = paddle.ones_like(tmp_box[:, 2], dtype="int32") + zero.stop_gradient = True + ones.stop_gradient = True + wmin.append( + 
paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero)) + wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]), "int32")) + hmin.append( + paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) + hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) + + area_recip = 1.0 / ( + hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( + wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) + + for j in range(len(gt_bboxes[i])): + if gt_bboxes[i][j].sum() > 0: + mask_fg[i] = self.mask_value( + mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j], + wmax[i][j] + 1, area_recip[0][j]) + + mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp) + + if paddle.sum(mask_bg[i]): + mask_bg[i] /= paddle.sum(mask_bg[i]) + + fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg, + mask_bg, tea_channel_att, + tea_spatial_att) + mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, + stu_spatial_att, tea_spatial_att) + rela_loss = self.relation_loss(stu_feature, tea_feature) + loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ + + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss + return loss * self.loss_weight + + +@register +class PKDFeatureLoss(nn.Layer): + """ + PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient. + + Args: + loss_weight (float): Weight of loss. Defaults to 1.0. + resize_stu (bool): If True, we'll down/up sample the features of the + student model to the spatial size of those of the teacher model if + their spatial sizes are different. And vice versa. Defaults to + True. + """ + + def __init__(self, + student_channels=256, + teacher_channels=256, + normalize=True, + loss_weight=1.0, + resize_stu=True): + super(PKDFeatureLoss, self).__init__() + self.normalize = normalize + self.loss_weight = loss_weight + self.resize_stu = resize_stu + + def forward(self, stu_feature, tea_feature, inputs=None): + size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:] + if size_s[0] != size_t[0]: + if self.resize_stu: + stu_feature = F.interpolate( + stu_feature, size_t, mode='bilinear') + else: + tea_feature = F.interpolate( + tea_feature, size_s, mode='bilinear') + assert stu_feature.shape == tea_feature.shape + + if self.normalize: + stu_feature = feature_norm(stu_feature) + tea_feature = feature_norm(tea_feature) + + loss = F.mse_loss(stu_feature, tea_feature) / 2 + return loss * self.loss_weight + + +@register +class MimicFeatureLoss(nn.Layer): + def __init__(self, + student_channels=256, + teacher_channels=256, + normalize=True, + loss_weight=1.0): + super(MimicFeatureLoss, self).__init__() + self.normalize = normalize + self.loss_weight = loss_weight + self.mse_loss = nn.MSELoss() + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0) + else: + self.align = None + + def forward(self, stu_feature, tea_feature, inputs=None): + if self.align is not None: + stu_feature = self.align(stu_feature) + + if self.normalize: + stu_feature = feature_norm(stu_feature) + tea_feature = feature_norm(tea_feature) + + loss = self.mse_loss(stu_feature, tea_feature) + return loss * self.loss_weight + + +@register +class MGDFeatureLoss(nn.Layer): + def __init__(self, + student_channels=256, + teacher_channels=256, + normalize=True, + loss_weight=1.0, + loss_func='mse'): + super(MGDFeatureLoss, self).__init__() + self.normalize = normalize + self.loss_weight = loss_weight + assert loss_func in 
['mse', 'ssim'] + self.loss_func = loss_func + self.mse_loss = nn.MSELoss(reduction='sum') + self.ssim_loss = SSIM(11) + + kaiming_init = parameter_init("kaiming") + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=kaiming_init, + bias_attr=False) + else: + self.align = None + + self.generation = nn.Sequential( + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1)) + + def forward(self, stu_feature, tea_feature, inputs=None): + N = stu_feature.shape[0] + if self.align is not None: + stu_feature = self.align(stu_feature) + stu_feature = self.generation(stu_feature) + + if self.normalize: + stu_feature = feature_norm(stu_feature) + tea_feature = feature_norm(tea_feature) + + if self.loss_func == 'mse': + loss = self.mse_loss(stu_feature, tea_feature) / N + elif self.loss_func == 'ssim': + ssim_loss = self.ssim_loss(stu_feature, tea_feature) + loss = paddle.clip((1 - ssim_loss) / 2, 0, 1) + else: + raise ValueError + return loss * self.loss_weight + + +class SSIM(nn.Layer): + def __init__(self, window_size=11, size_average=True): + super(SSIM, self).__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.window = self.create_window(window_size, self.channel) + + def gaussian(self, window_size, sigma): + gauss = paddle.to_tensor([ + math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def create_window(self, window_size, channel): + _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) + window = _2D_window.expand([channel, 1, window_size, window_size]) + return window + + def _ssim(self, img1, img2, window, window_size, channel, + size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( + 1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean([1, 2, 3]) + + def forward(self, img1, img2): + channel = img1.shape[1] + if channel == self.channel and self.window.dtype == img1.dtype: + window = self.window + else: + window = self.create_window(self.window_size, channel) + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) diff --git a/ppdet/slim/distill_model.py b/ppdet/slim/distill_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4fa3ccc83e06eb6c9f8058fc993446021ac5c45b --- /dev/null +++ b/ppdet/slim/distill_model.py @@ -0,0 +1,352 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from ppdet.core.workspace import register, create, load_config +from ppdet.utils.checkpoint import load_pretrain_weight +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'DistillModel', + 'FGDDistillModel', + 'CWDDistillModel', + 'LDDistillModel', + 'PPYOLOEDistillModel', +] + + +@register +class DistillModel(nn.Layer): + """ + Build common distill model. + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. + """ + + def __init__(self, cfg, slim_cfg): + super(DistillModel, self).__init__() + self.arch = cfg.architecture + + self.stu_cfg = cfg + self.student_model = create(self.stu_cfg.architecture) + if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights: + stu_pretrain = self.stu_cfg.pretrain_weights + else: + stu_pretrain = None + + slim_cfg = load_config(slim_cfg) + self.tea_cfg = slim_cfg + self.teacher_model = create(self.tea_cfg.architecture) + if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights: + tea_pretrain = self.tea_cfg.pretrain_weights + else: + tea_pretrain = None + self.distill_cfg = slim_cfg + + # load pretrain weights + self.is_inherit = False + if stu_pretrain: + if self.is_inherit and tea_pretrain: + load_pretrain_weight(self.student_model, tea_pretrain) + logger.debug( + "Inheriting! loading teacher weights to student model!") + load_pretrain_weight(self.student_model, stu_pretrain) + logger.info("Student model has loaded pretrain weights!") + if tea_pretrain: + load_pretrain_weight(self.teacher_model, tea_pretrain) + logger.info("Teacher model has loaded pretrain weights!") + + self.teacher_model.eval() + for param in self.teacher_model.parameters(): + param.trainable = False + + self.distill_loss = self.build_loss(self.distill_cfg) + + def build_loss(self, distill_cfg): + if 'distill_loss' in distill_cfg and distill_cfg.distill_loss: + return create(distill_cfg.distill_loss) + else: + return None + + def parameters(self): + return self.student_model.parameters() + + def forward(self, inputs): + if self.training: + student_loss = self.student_model(inputs) + with paddle.no_grad(): + teacher_loss = self.teacher_model(inputs) + + loss = self.distill_loss(self.teacher_model, self.student_model) + student_loss['distill_loss'] = loss + student_loss['teacher_loss'] = teacher_loss['loss'] + student_loss['loss'] += student_loss['distill_loss'] + return student_loss + else: + return self.student_model(inputs) + + +@register +class FGDDistillModel(DistillModel): + """ + Build FGD distill model. + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. 
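+
+    Note: distillation is applied to the neck (FPN) features of both models;
+    only the RetinaNet and PicoDet architectures are supported here.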
+ """ + + def __init__(self, cfg, slim_cfg): + super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) + assert self.arch in ['RetinaNet', 'PicoDet' + ], 'Unsupported arch: {}'.format(self.arch) + self.is_inherit = True + + def build_loss(self, distill_cfg): + assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name + assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss + loss_func = dict() + name_list = distill_cfg.distill_loss_name + for name in name_list: + loss_func[name] = create(distill_cfg.distill_loss) + return loss_func + + def forward(self, inputs): + if self.training: + s_body_feats = self.student_model.backbone(inputs) + s_neck_feats = self.student_model.neck(s_body_feats) + with paddle.no_grad(): + t_body_feats = self.teacher_model.backbone(inputs) + t_neck_feats = self.teacher_model.neck(t_body_feats) + + loss_dict = {} + for idx, k in enumerate(self.distill_loss): + loss_dict[k] = self.distill_loss[k](s_neck_feats[idx], + t_neck_feats[idx], inputs) + if self.arch == "RetinaNet": + loss = self.student_model.head(s_neck_feats, inputs) + elif self.arch == "PicoDet": + head_outs = self.student_model.head( + s_neck_feats, self.student_model.export_post_process) + loss_gfl = self.student_model.head.get_loss(head_outs, inputs) + total_loss = paddle.add_n(list(loss_gfl.values())) + loss = {} + loss.update(loss_gfl) + loss.update({'loss': total_loss}) + else: + raise ValueError(f"Unsupported model {self.arch}") + + for k in loss_dict: + loss['loss'] += loss_dict[k] + loss[k] = loss_dict[k] + return loss + else: + body_feats = self.student_model.backbone(inputs) + neck_feats = self.student_model.neck(body_feats) + head_outs = self.student_model.head(neck_feats) + if self.arch == "RetinaNet": + bbox, bbox_num = self.student_model.head.post_process( + head_outs, inputs['im_shape'], inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + elif self.arch == "PicoDet": + head_outs = self.student_model.head( + neck_feats, self.student_model.export_post_process) + scale_factor = inputs['scale_factor'] + bboxes, bbox_num = self.student_model.head.post_process( + head_outs, + scale_factor, + export_nms=self.student_model.export_nms) + return {'bbox': bboxes, 'bbox_num': bbox_num} + else: + raise ValueError(f"Unsupported model {self.arch}") + + +@register +class CWDDistillModel(DistillModel): + """ + Build CWD distill model. + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. 
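+
+    Note: channel-wise distillation is applied to the neck features and, for
+    GFL, additionally to the head classification scores; only the GFL and
+    RetinaNet architectures are supported here.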
+ """ + + def __init__(self, cfg, slim_cfg): + super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) + assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format( + self.arch) + + def build_loss(self, distill_cfg): + assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name + assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss + loss_func = dict() + name_list = distill_cfg.distill_loss_name + for name in name_list: + loss_func[name] = create(distill_cfg.distill_loss) + return loss_func + + def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs): + loss = self.student_model.head(stu_fea_list, inputs) + loss_dict = {} + for idx, k in enumerate(self.distill_loss): + loss_dict[k] = self.distill_loss[k](stu_fea_list[idx], + tea_fea_list[idx]) + + loss['loss'] += loss_dict[k] + loss[k] = loss_dict[k] + return loss + + def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs): + loss = {} + head_outs = self.student_model.head(stu_fea_list) + loss_gfl = self.student_model.head.get_loss(head_outs, inputs) + loss.update(loss_gfl) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + + feat_loss = {} + loss_dict = {} + s_cls_feat, t_cls_feat = [], [] + for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list): + conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f) + cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat) + t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f) + t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat) + s_cls_feat.append(cls_score) + t_cls_feat.append(t_cls_score) + + for idx, k in enumerate(self.distill_loss): + loss_dict[k] = self.distill_loss[k](s_cls_feat[idx], + t_cls_feat[idx]) + feat_loss[f"neck_f_{idx}"] = self.distill_loss[k](stu_fea_list[idx], + tea_fea_list[idx]) + + for k in feat_loss: + loss['loss'] += feat_loss[k] + loss[k] = feat_loss[k] + + for k in loss_dict: + loss['loss'] += loss_dict[k] + loss[k] = loss_dict[k] + return loss + + def forward(self, inputs): + if self.training: + s_body_feats = self.student_model.backbone(inputs) + s_neck_feats = self.student_model.neck(s_body_feats) + with paddle.no_grad(): + t_body_feats = self.teacher_model.backbone(inputs) + t_neck_feats = self.teacher_model.neck(t_body_feats) + + if self.arch == "RetinaNet": + loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats, + inputs) + elif self.arch == "GFL": + loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs) + else: + raise ValueError(f"unsupported arch {self.arch}") + return loss + else: + body_feats = self.student_model.backbone(inputs) + neck_feats = self.student_model.neck(body_feats) + head_outs = self.student_model.head(neck_feats) + if self.arch == "RetinaNet": + bbox, bbox_num = self.student_model.head.post_process( + head_outs, inputs['im_shape'], inputs['scale_factor']) + return {'bbox': bbox, 'bbox_num': bbox_num} + elif self.arch == "GFL": + bbox_pred, bbox_num = head_outs + output = {'bbox': bbox_pred, 'bbox_num': bbox_num} + return output + else: + raise ValueError(f"unsupported arch {self.arch}") + + +@register +class LDDistillModel(DistillModel): + """ + Build LD distill model. + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. 
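+
+    Note: the distillation signal is injected through the student GFL head's
+    get_loss, which receives the teacher's soft labels and soft bbox targets;
+    only the GFL architecture is supported here.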
+ """ + + def __init__(self, cfg, slim_cfg): + super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) + assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch) + + def forward(self, inputs): + if self.training: + s_body_feats = self.student_model.backbone(inputs) + s_neck_feats = self.student_model.neck(s_body_feats) + s_head_outs = self.student_model.head(s_neck_feats) + with paddle.no_grad(): + t_body_feats = self.teacher_model.backbone(inputs) + t_neck_feats = self.teacher_model.neck(t_body_feats) + t_head_outs = self.teacher_model.head(t_neck_feats) + + soft_label_list = t_head_outs[0] + soft_targets_list = t_head_outs[1] + student_loss = self.student_model.head.get_loss( + s_head_outs, inputs, soft_label_list, soft_targets_list) + total_loss = paddle.add_n(list(student_loss.values())) + student_loss['loss'] = total_loss + return student_loss + else: + return self.student_model(inputs) + + +@register +class PPYOLOEDistillModel(DistillModel): + """ + Build PPYOLOE distill model, only used in PPYOLOE + Args: + cfg: The student config. + slim_cfg: The teacher and distill config. + """ + + def __init__(self, cfg, slim_cfg): + super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) + assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format( + self.arch) + + def forward(self, inputs, alpha=0.125): + if self.training: + with paddle.no_grad(): + teacher_loss = self.teacher_model(inputs) + if hasattr(self.teacher_model.yolo_head, "assigned_labels"): + self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \ + self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores + delattr(self.teacher_model.yolo_head, "assigned_labels") + delattr(self.teacher_model.yolo_head, "assigned_bboxes") + delattr(self.teacher_model.yolo_head, "assigned_scores") + student_loss = self.student_model(inputs) + + logits_loss, feat_loss = self.distill_loss(self.teacher_model, + self.student_model) + det_total_loss = student_loss['loss'] + total_loss = alpha * (det_total_loss + logits_loss + feat_loss) + student_loss['loss'] = total_loss + student_loss['det_loss'] = det_total_loss + student_loss['logits_loss'] = logits_loss + student_loss['feat_loss'] = feat_loss + return student_loss + else: + return self.student_model(inputs) diff --git a/ppdet/slim/ofa.py b/ppdet/slim/ofa.py new file mode 100644 index 0000000000000000000000000000000000000000..b75edacdf2b65ad08cc538a0ca0334c03d53838a --- /dev/null +++ b/ppdet/slim/ofa.py @@ -0,0 +1,89 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.utils.checkpoint import load_weight, load_pretrain_weight +from ppdet.utils.logger import setup_logger +from ppdet.core.workspace import register, serializable + +from paddle.utils import try_import + +logger = setup_logger(__name__) + + +@register +@serializable +class OFA(object): + def __init__(self, ofa_config): + super(OFA, self).__init__() + self.ofa_config = ofa_config + + def __call__(self, model, param_state_dict): + + paddleslim = try_import('paddleslim') + from paddleslim.nas.ofa import OFA, RunConfig, utils + from paddleslim.nas.ofa.convert_super import Convert, supernet + task = self.ofa_config['task'] + 
expand_ratio = self.ofa_config['expand_ratio'] + + skip_neck = self.ofa_config['skip_neck'] + skip_head = self.ofa_config['skip_head'] + + run_config = self.ofa_config['RunConfig'] + if 'skip_layers' in run_config: + skip_layers = run_config['skip_layers'] + else: + skip_layers = [] + + # supernet config + sp_config = supernet(expand_ratio=expand_ratio) + # convert to supernet + model = Convert(sp_config).convert(model) + + skip_names = [] + if skip_neck: + skip_names.append('neck.') + if skip_head: + skip_names.append('head.') + + for name, sublayer in model.named_sublayers(): + for n in skip_names: + if n in name: + skip_layers.append(name) + + run_config['skip_layers'] = skip_layers + run_config = RunConfig(**run_config) + + # build ofa model + ofa_model = OFA(model, run_config=run_config) + + ofa_model.set_epoch(0) + ofa_model.set_task(task) + + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + + ofa_model._clear_search_space(input_spec=input_spec) + ofa_model._build_ss = True + check_ss = ofa_model._sample_config('expand_ratio', phase=None) + # tokenize the search space + ofa_model.tokenize() + # check token map, search cands and search space + logger.info('Token map is {}'.format(ofa_model.token_map)) + logger.info('Search candidates is {}'.format(ofa_model.search_cands)) + logger.info('The length of search_space is {}, search_space is {}'. + format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) + # set model state dict into ofa model + utils.set_state_dict(ofa_model.model, param_state_dict) + return ofa_model diff --git a/ppdet/slim/prune.py b/ppdet/slim/prune.py new file mode 100644 index 0000000000000000000000000000000000000000..28ffb7588d1e596e5883072b3bd2b5e6ba80ed7f --- /dev/null +++ b/ppdet/slim/prune.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
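+#
+# The Pruner / PrunerQAT classes below are driven by the slim config.  A
+# minimal sketch of the expected fields (key layout and values are only
+# illustrative, derived from the constructor arguments, not a verified config):
+#
+#     Pruner:
+#       criterion: fpgm              # or 'l1_norm'
+#       pruned_params: ['conv2d_1.w_0']
+#       pruned_ratios: [0.3]
+#       print_params: False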
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle.utils import try_import + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +def print_prune_params(model): + model_dict = model.state_dict() + for key in model_dict.keys(): + weight_name = model_dict[key].name + logger.info('Parameter name: {}, shape: {}'.format( + weight_name, model_dict[key].shape)) + + +@register +@serializable +class Pruner(object): + def __init__(self, + criterion, + pruned_params, + pruned_ratios, + print_params=False): + super(Pruner, self).__init__() + assert criterion in ['l1_norm', 'fpgm'], \ + "unsupported prune criterion: {}".format(criterion) + self.criterion = criterion + self.pruned_params = pruned_params + self.pruned_ratios = pruned_ratios + self.print_params = print_params + + def __call__(self, model): + # FIXME: adapt to network graph when Training and inference are + # inconsistent, now only supports prune inference network graph. + model.eval() + paddleslim = try_import('paddleslim') + from paddleslim.analysis import dygraph_flops as flops + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + if self.print_params: + print_prune_params(model) + + ori_flops = flops(model, input_spec) / (1000**3) + logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) + if self.criterion == 'fpgm': + pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) + elif self.criterion == 'l1_norm': + pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) + + logger.info("pruned params: {}".format(self.pruned_params)) + pruned_ratios = [float(n) for n in self.pruned_ratios] + ratios = {} + for i, param in enumerate(self.pruned_params): + ratios[param] = pruned_ratios[i] + pruner.prune_vars(ratios, [0]) + pruned_flops = flops(model, input_spec) / (1000**3) + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + pruned_flops, (ori_flops - pruned_flops) / ori_flops)) + + return model + + +@register +@serializable +class PrunerQAT(object): + def __init__(self, criterion, pruned_params, pruned_ratios, + print_prune_params, quant_config, print_qat_model): + super(PrunerQAT, self).__init__() + assert criterion in ['l1_norm', 'fpgm'], \ + "unsupported prune criterion: {}".format(criterion) + # Pruner hyperparameter + self.criterion = criterion + self.pruned_params = pruned_params + self.pruned_ratios = pruned_ratios + self.print_prune_params = print_prune_params + # QAT hyperparameter + self.quant_config = quant_config + self.print_qat_model = print_qat_model + + def __call__(self, model): + # FIXME: adapt to network graph when Training and inference are + # inconsistent, now only supports prune inference network graph. 
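+        # Flow: measure FLOPs -> structurally prune the listed filters with the
+        # chosen criterion -> re-measure FLOPs -> wrap the pruned model with a
+        # QAT quantizer (see save_quantized_model below for export).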
+ model.eval() + paddleslim = try_import('paddleslim') + from paddleslim.analysis import dygraph_flops as flops + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + if self.print_prune_params: + print_prune_params(model) + + ori_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) + if self.criterion == 'fpgm': + pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) + elif self.criterion == 'l1_norm': + pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) + + logger.info("pruned params: {}".format(self.pruned_params)) + pruned_ratios = [float(n) for n in self.pruned_ratios] + ratios = {} + for i, param in enumerate(self.pruned_params): + ratios[param] = pruned_ratios[i] + pruner.prune_vars(ratios, [0]) + pruned_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + pruned_flops, (ori_flops - pruned_flops) / ori_flops)) + + self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) + + self.quanter.quantize(model) + + if self.print_qat_model: + logger.info("Quantized model:") + logger.info(model) + + return model + + def save_quantized_model(self, layer, path, input_spec=None, **config): + self.quanter.save_quantized_model( + model=layer, path=path, input_spec=input_spec, **config) diff --git a/ppdet/slim/quant.py b/ppdet/slim/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..44508198c46b77485d61e2b4e4d2804c62f96622 --- /dev/null +++ b/ppdet/slim/quant.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle.utils import try_import + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class QAT(object): + def __init__(self, quant_config, print_model): + super(QAT, self).__init__() + self.quant_config = quant_config + self.print_model = print_model + + def __call__(self, model): + paddleslim = try_import('paddleslim') + self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) + if self.print_model: + logger.info("Model before quant:") + logger.info(model) + + # For PP-YOLOE, convert model to deploy firstly. 
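+        # Any sublayer that exposes convert_to_deploy() (e.g. re-parameterizable
+        # PP-YOLOE blocks) is fused first, presumably so quantization sees the
+        # same graph that will later be used for inference export.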
+ for layer in model.sublayers(): + if hasattr(layer, 'convert_to_deploy'): + layer.convert_to_deploy() + + self.quanter.quantize(model) + + if self.print_model: + logger.info("Quantized model:") + logger.info(model) + + return model + + def save_quantized_model(self, layer, path, input_spec=None, **config): + self.quanter.save_quantized_model( + model=layer, path=path, input_spec=input_spec, **config) + + +@register +@serializable +class PTQ(object): + def __init__(self, + ptq_config, + quant_batch_num=10, + output_dir='output_inference', + fuse=True, + fuse_list=None): + super(PTQ, self).__init__() + self.ptq_config = ptq_config + self.quant_batch_num = quant_batch_num + self.output_dir = output_dir + self.fuse = fuse + self.fuse_list = fuse_list + + def __call__(self, model): + paddleslim = try_import('paddleslim') + self.ptq = paddleslim.PTQ(**self.ptq_config) + model.eval() + quant_model = self.ptq.quantize( + model, fuse=self.fuse, fuse_list=self.fuse_list) + + return quant_model + + def save_quantized_model(self, + quant_model, + quantize_model_path, + input_spec=None): + self.ptq.save_quantized_model(quant_model, quantize_model_path, + input_spec) diff --git a/ppdet/slim/unstructured_prune.py b/ppdet/slim/unstructured_prune.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc876a8cb069700408a5c3f4b341be78e7dd6a3 --- /dev/null +++ b/ppdet/slim/unstructured_prune.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
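+#
+# The epoch-based schedule below (stable -> pruning -> tunning epochs) is
+# converted into iteration counts with steps_per_epoch before being handed to
+# paddleslim's GMPUnstructuredPruner; `ratio` is the target sparsity and
+# `initial_ratio` the sparsity at the start of the pruning phase.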
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle.utils import try_import + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class UnstructuredPruner(object): + def __init__(self, + stable_epochs, + pruning_epochs, + tunning_epochs, + pruning_steps, + ratio, + initial_ratio, + prune_params_type=None): + self.stable_epochs = stable_epochs + self.pruning_epochs = pruning_epochs + self.tunning_epochs = tunning_epochs + self.ratio = ratio + self.prune_params_type = prune_params_type + self.initial_ratio = initial_ratio + self.pruning_steps = pruning_steps + + def __call__(self, model, steps_per_epoch, skip_params_func=None): + paddleslim = try_import('paddleslim') + from paddleslim import GMPUnstructuredPruner + configs = { + 'pruning_strategy': 'gmp', + 'stable_iterations': self.stable_epochs * steps_per_epoch, + 'pruning_iterations': self.pruning_epochs * steps_per_epoch, + 'tunning_iterations': self.tunning_epochs * steps_per_epoch, + 'resume_iteration': 0, + 'pruning_steps': self.pruning_steps, + 'initial_ratio': self.initial_ratio, + } + + pruner = GMPUnstructuredPruner( + model, + ratio=self.ratio, + skip_params_func=skip_params_func, + prune_params_type=self.prune_params_type, + local_sparsity=True, + configs=configs) + + return pruner diff --git a/ppdet/utils/__init__.py b/ppdet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/ppdet/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/utils/check.py b/ppdet/utils/check.py new file mode 100644 index 0000000000000000000000000000000000000000..7690ade9eab0a7d859459a0be74d344446be6938 --- /dev/null +++ b/ppdet/utils/check.py @@ -0,0 +1,156 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +import six +import paddle.version as paddle_version + +from .logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', + 'check_config' +] + + +def check_mlu(use_mlu): + """ + Log error and exit when set use_mlu=true in paddlepaddle + cpu/gpu/xpu/npu version. + """ + err = "Config use_mlu cannot be set as true while you are " \ + "using paddlepaddle cpu/gpu/xpu/npu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ + "\t2. Set use_mlu as false in config file to run " \ + "model on CPU/GPU/XPU/NPU" + + try: + if use_mlu and not paddle.is_compiled_with_mlu(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_npu(use_npu): + """ + Log error and exit when set use_npu=true in paddlepaddle + version without paddle-custom-npu installed. + """ + err = "Config use_npu cannot be set as true while you are " \ + "using paddlepaddle version without paddle-custom-npu " \ + "installed! \nPlease try: \n" \ + "\t1. Install paddle-custom-npu to run model on NPU \n" \ + "\t2. Set use_npu as false in config file to run " \ + "model on other devices supported." + + try: + if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_xpu(use_xpu): + """ + Log error and exit when set use_xpu=true in paddlepaddle + cpu/gpu/npu version. + """ + err = "Config use_xpu cannot be set as true while you are " \ + "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ + "\t2. Set use_xpu as false in config file to run " \ + "model on CPU/GPU/NPU" + + try: + if use_xpu and not paddle.is_compiled_with_xpu(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not paddle.is_compiled_with_cuda(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(version='2.2'): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version {} or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code.".format(version) + + version_installed = [ + paddle_version.major, paddle_version.minor, paddle_version.patch, + paddle_version.rc + ] + + if version_installed == ['0', '0', '0', '0']: + return + + version_split = version.split('.') + + length = min(len(version_installed), len(version_split)) + for i in six.moves.range(length): + if version_installed[i] > version_split[i]: + return + if version_installed[i] < version_split[i]: + raise Exception(err) + + +def check_config(cfg): + """ + Check the correctness of the configuration file. Log error and exit + when Config is not compliant. + """ + err = "'{}' not specified in config file. Please set it in config file." 
+ check_list = ['architecture', 'num_classes'] + try: + for var in check_list: + if not var in cfg: + logger.error(err.format(var)) + sys.exit(1) + except Exception as e: + pass + + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + return cfg diff --git a/ppdet/utils/checkpoint.py b/ppdet/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..f57ef0227c676cdf54cb337cca1c6f49b3a3542f --- /dev/null +++ b/ppdet/utils/checkpoint.py @@ -0,0 +1,278 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import errno +import os +import time +import numpy as np +import paddle +import paddle.nn as nn +from .download import get_weights_path + +from .logger import setup_logger +logger = setup_logger(__name__) + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') \ + or path.startswith('https://') \ + or path.startswith('ppdet://') + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def _strip_postfix(path): + path, ext = os.path.splitext(path) + assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ + "Unknown postfix {} from weights".format(ext) + return path + + +def load_weight(model, weight, optimizer=None, ema=None, exchange=True): + if is_url(weight): + weight = get_weights_path(weight) + + path = _strip_postfix(weight) + pdparam_path = path + '.pdparams' + if not os.path.exists(pdparam_path): + raise ValueError("Model pretrain path {} does not " + "exists.".format(pdparam_path)) + + if ema is not None and os.path.exists(path + '.pdema'): + if exchange: + # Exchange model and ema_model to load + logger.info('Exchange model and ema_model to load:') + ema_state_dict = paddle.load(pdparam_path) + logger.info('Loading ema_model weights from {}'.format(path + + '.pdparams')) + param_state_dict = paddle.load(path + '.pdema') + logger.info('Loading model weights from {}'.format(path + '.pdema')) + else: + ema_state_dict = paddle.load(path + '.pdema') + logger.info('Loading ema_model weights from {}'.format(path + + '.pdema')) + param_state_dict = paddle.load(pdparam_path) + logger.info('Loading model weights from {}'.format(path + + '.pdparams')) + else: + ema_state_dict = None + param_state_dict = paddle.load(pdparam_path) + + model_dict = model.state_dict() + model_weight = {} + incorrect_keys = 0 + + for key, value in model_dict.items(): + if key in param_state_dict.keys(): + if 
isinstance(param_state_dict[key], np.ndarray): + param_state_dict[key] = paddle.to_tensor(param_state_dict[key]) + if value.dtype == param_state_dict[key].dtype: + model_weight[key] = param_state_dict[key] + else: + model_weight[key] = param_state_dict[key].astype(value.dtype) + else: + logger.info('Unmatched key: {}'.format(key)) + incorrect_keys += 1 + + assert incorrect_keys == 0, "Load weight {} incorrectly, \ + {} keys unmatched, please check again.".format(weight, + incorrect_keys) + logger.info('Finish resuming model weights: {}'.format(pdparam_path)) + + model.set_dict(model_weight) + + last_epoch = 0 + if optimizer is not None and os.path.exists(path + '.pdopt'): + optim_state_dict = paddle.load(path + '.pdopt') + # to solve resume bug, will it be fixed in paddle 2.0 + for key in optimizer.state_dict().keys(): + if not key in optim_state_dict.keys(): + optim_state_dict[key] = optimizer.state_dict()[key] + if 'last_epoch' in optim_state_dict: + last_epoch = optim_state_dict.pop('last_epoch') + optimizer.set_state_dict(optim_state_dict) + + if ema_state_dict is not None: + ema.resume(ema_state_dict, + optim_state_dict['LR_Scheduler']['last_epoch']) + elif ema_state_dict is not None: + ema.resume(ema_state_dict) + return last_epoch + + +def match_state_dict(model_state_dict, weight_state_dict): + """ + Match between the model state dict and pretrained weight state dict. + Return the matched state dict. + + The method supposes that all the names in pretrained weight state dict are + subclass of the names in models`, if the prefix 'backbone.' in pretrained weight + keys is stripped. And we could get the candidates for each model key. Then we + select the name with the longest matched size as the final match result. For + example, the model state dict has the name of + 'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight as + name of 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. We + match the 'res2.res2a.branch2a.conv.weight' to the model key. + """ + + model_keys = sorted(model_state_dict.keys()) + weight_keys = sorted(weight_state_dict.keys()) + + def match(a, b): + if b.startswith('backbone.res5'): + # In Faster RCNN, res5 pretrained weights have prefix of backbone, + # however, the corresponding model weights have difficult prefix, + # bbox_head. + b = b[9:] + return a == b or a.endswith("." + b) + + match_matrix = np.zeros([len(model_keys), len(weight_keys)]) + for i, m_k in enumerate(model_keys): + for j, w_k in enumerate(weight_keys): + if match(m_k, w_k): + match_matrix[i, j] = len(w_k) + max_id = match_matrix.argmax(1) + max_len = match_matrix.max(1) + max_id[max_len == 0] = -1 + + load_id = set(max_id) + load_id.discard(-1) + not_load_weight_name = [] + for idx in range(len(weight_keys)): + if idx not in load_id: + not_load_weight_name.append(weight_keys[idx]) + + if len(not_load_weight_name) > 0: + logger.info('{} in pretrained weight is not used in the model, ' + 'and its will not be loaded'.format(not_load_weight_name)) + matched_keys = {} + result_state_dict = {} + for model_id, weight_id in enumerate(max_id): + if weight_id == -1: + continue + model_key = model_keys[model_id] + weight_key = weight_keys[weight_id] + weight_value = weight_state_dict[weight_key] + model_value_shape = list(model_state_dict[model_key].shape) + + if list(weight_value.shape) != model_value_shape: + logger.info( + 'The shape {} in pretrained weight {} is unmatched with ' + 'the shape {} in model {}. 
And the weight {} will not be ' + 'loaded'.format(weight_value.shape, weight_key, + model_value_shape, model_key, weight_key)) + continue + + assert model_key not in result_state_dict + result_state_dict[model_key] = weight_value + if weight_key in matched_keys: + raise ValueError('Ambiguity weight {} loaded, it matches at least ' + '{} and {} in the model'.format( + weight_key, model_key, matched_keys[ + weight_key])) + matched_keys[weight_key] = model_key + return result_state_dict + + +def load_pretrain_weight(model, pretrain_weight): + if is_url(pretrain_weight): + pretrain_weight = get_weights_path(pretrain_weight) + + path = _strip_postfix(pretrain_weight) + if not (os.path.isdir(path) or os.path.isfile(path) or + os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path `{}` does not exists. " + "If you don't want to load pretrain model, " + "please delete `pretrain_weights` field in " + "config file.".format(path)) + + model_dict = model.state_dict() + + weights_path = path + '.pdparams' + param_state_dict = paddle.load(weights_path) + param_state_dict = match_state_dict(model_dict, param_state_dict) + + for k, v in param_state_dict.items(): + if isinstance(v, np.ndarray): + v = paddle.to_tensor(v) + if model_dict[k].dtype != v.dtype: + param_state_dict[k] = v.astype(model_dict[k].dtype) + + model.set_dict(param_state_dict) + logger.info('Finish loading model weights: {}'.format(weights_path)) + + +def save_model(model, + optimizer, + save_dir, + save_name, + last_epoch, + ema_model=None): + """ + save model into disk. + + Args: + model (dict): the model state_dict to save parameters. + optimizer (paddle.optimizer.Optimizer): the Optimizer instance to + save optimizer states. + save_dir (str): the directory to be saved. + save_name (str): the path to be saved. + last_epoch (int): the epoch index. + ema_model (dict|None): the ema_model state_dict to save parameters. + """ + if paddle.distributed.get_rank() != 0: + return + assert isinstance(model, dict), ("model is not a instance of dict, " + "please call model.state_dict() to get.") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = os.path.join(save_dir, save_name) + # save model + if ema_model is None: + paddle.save(model, save_path + ".pdparams") + else: + assert isinstance(ema_model, + dict), ("ema_model is not a instance of dict, " + "please call model.state_dict() to get.") + # Exchange model and ema_model to save + paddle.save(ema_model, save_path + ".pdparams") + paddle.save(model, save_path + ".pdema") + # save optimizer + state_dict = optimizer.state_dict() + state_dict['last_epoch'] = last_epoch + paddle.save(state_dict, save_path + ".pdopt") + logger.info("Save checkpoint: {}".format(save_dir)) diff --git a/ppdet/utils/cli.py b/ppdet/utils/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..2c5acc0e591af4bbd07a1d22e1237656ac47da65 --- /dev/null +++ b/ppdet/utils/cli.py @@ -0,0 +1,158 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +import yaml +import re +from ppdet.core.workspace import get_registered_modules, dump_value + +__all__ = ['ColorTTY', 'ArgsParser'] + + +class ColorTTY(object): + def __init__(self): + super(ColorTTY, self).__init__() + self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'] + + def __getattr__(self, attr): + if attr in self.colors: + color = self.colors.index(attr) + 31 + + def color_message(message): + return "[{}m{}".format(color, message) + + setattr(self, attr, color_message) + return color_message + + def bold(self, message): + return self.with_code('01', message) + + def with_code(self, code, message): + return "[{}m{}".format(code, message) + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument("-c", "--config", help="configuration file to use") + self.add_argument( + "-o", "--opt", nargs='*', help="set configuration options") + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config is not None, \ + "Please specify --config=configure_file_path." + args.opt = self._parse_opt(args.opt) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=', 1) + if '.' not in k: + config[k] = yaml.load(v, Loader=yaml.Loader) + else: + keys = k.split('.') + if keys[0] not in config: + config[keys[0]] = {} + cur = config[keys[0]] + for idx, key in enumerate(keys[1:]): + if idx == len(keys) - 2: + cur[key] = yaml.load(v, Loader=yaml.Loader) + else: + cur[key] = {} + cur = cur[key] + return config + + +def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']): + for k, v in vars(args).items(): + if k not in exclude_args: + config[k] = v + return config + + +def print_total_cfg(config): + modules = get_registered_modules() + color_tty = ColorTTY() + green = '___{}___'.format(color_tty.colors.index('green') + 31) + + styled = {} + for key in config.keys(): + if not config[key]: # empty schema + continue + + if key not in modules and not hasattr(config[key], '__dict__'): + styled[key] = config[key] + continue + elif key in modules: + module = modules[key] + else: + type_name = type(config[key]).__name__ + if type_name in modules: + module = modules[type_name].copy() + module.update({ + k: v + for k, v in config[key].__dict__.items() + if k in module.schema + }) + key += " ({})".format(type_name) + default = module.find_default_keys() + missing = module.find_missing_keys() + mismatch = module.find_mismatch_keys() + extra = module.find_extra_keys() + dep_missing = [] + for dep in module.inject: + if isinstance(module[dep], str) and module[dep] != '': + if module[dep] not in modules: # not a valid module + dep_missing.append(dep) + else: + dep_mod = modules[module[dep]] + # empty dict but mandatory + if not dep_mod and dep_mod.mandatory(): + dep_missing.append(dep) + override = list( + set(module.keys()) - set(default) - set(extra) - set(dep_missing)) + replacement = {} + for name in set(override + default + extra + mismatch + missing): + new_name = name + if name in missing: + value = "" + else: + value = module[name] + + if name in extra: + value = dump_value(value) + " " + elif name in mismatch: + value = dump_value(value) + " " + elif name in dep_missing: + 
value = dump_value(value) + " " + elif name in override and value != '': + mark = green + new_name = mark + name + replacement[new_name] = value + styled[key] = replacement + buffer = yaml.dump(styled, default_flow_style=False, default_style='') + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", r"[33m[0m", buffer)) + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", + r"[31m[0m", buffer)) + buffer = re.sub(r"___(\d+)___(.*?):", r"[\1m\2[0m:", buffer) + print(buffer) diff --git a/ppdet/utils/colormap.py b/ppdet/utils/colormap.py new file mode 100644 index 0000000000000000000000000000000000000000..67c68dc1c67e7de5e658d424a3bce9040e73f48f --- /dev/null +++ b/ppdet/utils/colormap.py @@ -0,0 +1,58 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + + +def colormap(rgb=False): + """ + Get colormap + + The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py + """ + color_list = np.array([ + 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, + 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, + 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, + 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, + 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, + 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, + 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, + 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, + 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, + 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, + 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, + 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, + 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, + 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, + 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 + ]).astype(np.float32) + color_list = color_list.reshape((-1, 3)) * 255 + if not rgb: + color_list 
= color_list[:, ::-1] + return color_list.astype('int32') diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..8fb95afa36602ce9c6964ff05190216d01ffb235 --- /dev/null +++ b/ppdet/utils/download.py @@ -0,0 +1,559 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import sys +import yaml +import time +import shutil +import requests +import tqdm +import hashlib +import base64 +import binascii +import tarfile +import zipfile +import errno + +from paddle.utils.download import _get_unique_endpoints +from ppdet.core.workspace import BASE_KEY +from .logger import setup_logger +from .voc_utils import create_list + +logger = setup_logger(__name__) + +__all__ = [ + 'get_weights_path', 'get_dataset_path', 'get_config_path', + 'download_dataset', 'create_voc_list' +] + +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") +DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") +CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") + +# dict of {dataset_name: (download_info, sub_dirs)} +# download info: [(url, md5sum)] +DATASETS = { + 'coco': ([ + ( + 'http://images.cocodataset.org/zips/train2017.zip', + 'cced6f7f71b7629ddf16f17bbcfab6b2', ), + ( + 'http://images.cocodataset.org/zips/val2017.zip', + '442b8da7639aecaf257c1dceb8ba8c80', ), + ( + 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', + 'f4bbac642086de4f52a3fdda2de5fa2c', ), + ], ["annotations", "train2017", "val2017"]), + 'voc': ([ + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', + '6cd6e144f989b92b3379bac3b3de84fd', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', + 'c52e279531787c972589f7e41ab4ae64', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', + 'b6e924de25625d8de591ea690078ad9f', ), + ( + 'https://paddledet.bj.bcebos.com/data/label_list.txt', + '5ae5d62183cfb6f6d3ac109359d06a1b', ), + ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), + 'wider_face': ([ + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', + '3fedf70df600953d25982bcd13d91ba2', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', + 'dfa7d7e790efa35df3788964cf0bbaea', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', + 'a4a898d6193db4b9ef3260a68bad0dc7', ), + ], ["WIDER_train", "WIDER_val", "wider_face_split"]), + 'fruit': ([( + 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', + 'baa8806617a54ccf3685fa7153388ae6', ), ], + ['Annotations', 'JPEGImages']), + 'roadsign_voc': ([( + 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', + '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), + 'roadsign_coco': ([( + 
'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', + '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), + 'spine_coco': ([( + 'https://paddledet.bj.bcebos.com/data/spine.tar', + '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), + 'coco_ce': ([( + 'https://paddledet.bj.bcebos.com/data/coco_ce.tar', + 'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []) +} + +DOWNLOAD_DATASETS_LIST = DATASETS.keys() + +DOWNLOAD_RETRY_LIMIT = 3 + +PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' + + +# When running unit tests, there could be multiple processes that +# trying to create DATA_HOME directory simultaneously, so we cannot +# use a if condition to check for the existence of the directory; +# instead, we use the filesystem as the synchronization mechanism by +# catching returned errors. +def must_mkdirs(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +def parse_url(url): + url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) + return url + + +def get_weights_path(url): + """Get weights path from WEIGHTS_HOME, if not exists, + download it from url. + """ + url = parse_url(url) + path, _ = get_path(url, WEIGHTS_HOME) + return path + + +def get_config_path(url): + """Get weights path from CONFIGS_HOME, if not exists, + download it from url. + """ + url = parse_url(url) + path = map_path(url, CONFIGS_HOME, path_depth=2) + if os.path.isfile(path): + return path + + # config file not found, try download + # 1. clear configs directory + if osp.isdir(CONFIGS_HOME): + shutil.rmtree(CONFIGS_HOME) + + # 2. get url + try: + from ppdet import __version__ as version + except ImportError: + version = None + + cfg_url = "ppdet://configs/{}/configs.tar".format(version) \ + if version else "ppdet://configs/configs.tar" + cfg_url = parse_url(cfg_url) + + # 3. download and decompress + cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME)) + _decompress_dist(cfg_fullname) + + # 4. check config file existing + if os.path.isfile(path): + return path + else: + logger.error("Get config {} failed after download, please contact us on " \ + "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path)) + sys.exit(1) + + +def get_dataset_path(path, annotation, image_dir): + """ + If path exists, return path. + Otherwise, get dataset path from DATASET_HOME, if not exists, + download it. + """ + if _dataset_exists(path, annotation, image_dir): + return path + + data_name = os.path.split(path.strip().lower())[-1] + if data_name not in DOWNLOAD_DATASETS_LIST: + raise ValueError( + "Dataset {} is not valid for reason above, please check again.". 
+ format(osp.realpath(path))) + else: + logger.warning( + "Dataset {} is not valid for reason above, try searching {} or " + "downloading dataset...".format(osp.realpath(path), DATASET_HOME)) + + for name, dataset in DATASETS.items(): + if data_name == name: + logger.debug("Parse dataset_dir {} as dataset " + "{}".format(path, name)) + data_dir = osp.join(DATASET_HOME, name) + + if name == "spine_coco": + if _dataset_exists(data_dir, annotation, image_dir): + return data_dir + + # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 + if name in ['voc', 'fruit', 'roadsign_voc']: + exists = True + for sub_dir in dataset[1]: + check_dir = osp.join(data_dir, sub_dir) + if osp.exists(check_dir): + logger.info("Found {}".format(check_dir)) + else: + exists = False + if exists: + return data_dir + + # voc exist is checked above, voc is not exist here + check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' + for url, md5sum in dataset[0]: + get_path(url, data_dir, md5sum, check_exist) + + # voc should create list after download + if name == 'voc': + create_voc_list(data_dir) + return data_dir + + raise ValueError("Dataset automaticly downloading Error.") + + +def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): + logger.debug("Create voc file list...") + devkit_dir = osp.join(data_dir, devkit_subdir) + years = ['2007', '2012'] + + # NOTE: since using auto download VOC + # dataset, VOC default label list should be used, + # do not generate label_list.txt here. For default + # label, see ../data/source/voc.py + create_list(devkit_dir, years, data_dir) + logger.debug("Create voc file list finished") + + +def map_path(url, root_dir, path_depth=1): + # parse path after download to decompress under root_dir + assert path_depth > 0, "path_depth should be a positive integer" + dirname = url + for _ in range(path_depth): + dirname = osp.dirname(dirname) + fpath = osp.relpath(url, dirname) + + zip_formats = ['.zip', '.tar', '.gz'] + for zip_format in zip_formats: + fpath = fpath.replace(zip_format, '') + return osp.join(root_dir, fpath) + + +def get_path(url, root_dir, md5sum=None, check_exist=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. 
+ + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + """ + # parse path after download to decompress under root_dir + fullpath = map_path(url, root_dir) + + # For same zip file, decompressed directory name different + # from zip file name, rename by following map + decompress_name_map = { + "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", + "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", + "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", + "annotations_trainval": "annotations" + } + for k, v in decompress_name_map.items(): + if fullpath.find(k) >= 0: + fullpath = osp.join(osp.split(fullpath)[0], v) + + if osp.exists(fullpath) and check_exist: + if not osp.isfile(fullpath) or \ + _check_exist_file_md5(fullpath, md5sum, url): + logger.debug("Found {}".format(fullpath)) + return fullpath, True + else: + os.remove(fullpath) + + fullname = _download_dist(url, root_dir, md5sum) + + # new weights format which postfix is 'pdparams' not + # need to decompress + if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml']: + _decompress_dist(fullname) + + return fullpath, False + + +def download_dataset(path, dataset=None): + if dataset not in DATASETS.keys(): + logger.error("Unknown dataset {}, it should be " + "{}".format(dataset, DATASETS.keys())) + return + dataset_info = DATASETS[dataset][0] + for info in dataset_info: + get_path(info[0], path, info[1], False) + logger.debug("Download dataset {} finished.".format(dataset)) + + +def _dataset_exists(path, annotation, image_dir): + """ + Check if user define dataset exists + """ + if not osp.exists(path): + logger.warning("Config dataset_dir {} is not exits, " + "dataset config is not valid".format(path)) + return False + + if annotation: + annotation_path = osp.join(path, annotation) + if not osp.isfile(annotation_path): + logger.warning("Config annotation {} is not a " + "file, dataset config is not " + "valid".format(annotation_path)) + return False + if image_dir: + image_path = osp.join(path, image_dir) + if not osp.isdir(image_path): + logger.warning("Config image_dir {} is not a " + "directory, dataset config is not " + "valid".format(image_path)) + return False + return True + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + must_mkdirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, + url)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + # NOTE: windows path join may incur \, which is invalid in url + if sys.platform == "win32": + url = url.replace('\\', '/') + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + return fullname + + +def _download_dist(url, path, md5sum=None): + env = os.environ + if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: + # Mainly used to solve the problem of downloading data from + # different machines in the case of multiple machines. + # Different nodes will download data, and the same node + # will only download data once. + # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108 + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) + num_trainers = int(env['PADDLE_TRAINERS_NUM']) + if num_trainers <= 1: + return _download(url, path, md5sum) + else: + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + lock_path = fullname + '.download.lock' + + must_mkdirs(path) + + if not osp.exists(fullname): + with open(lock_path, 'w'): # touch + os.utime(lock_path, None) + if rank_id_curr_node == 0: + _download(url, path, md5sum) + os.remove(lock_path) + else: + while os.path.exists(lock_path): + time.sleep(0.5) + return fullname + else: + return _download(url, path, md5sum) + + +def _check_exist_file_md5(filename, md5sum, url): + # if md5sum is None, and file to check is weights file, + # read md5um from url and check, else check md5sum directly + return _md5check_from_url(filename, url) if md5sum is None \ + and filename.endswith('pdparams') \ + else _md5check(filename, md5sum) + + +def _md5check_from_url(filename, url): + # For weights in bcebos URLs, MD5 value is contained + # in request header as 'content_md5' + req = requests.get(url, stream=True) + content_md5 = req.headers.get('content-md5') + req.close() + if not content_md5 or _md5check( + filename, + binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( + )): + return True + else: + return False + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.debug("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.warning("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+ fpath = osp.split(fname)[0] + fpath_tmp = osp.join(fpath, 'tmp') + if osp.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + elif fname.find('.txt') >= 0: + return + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + for f in os.listdir(fpath_tmp): + src_dir = osp.join(fpath_tmp, f) + dst_dir = osp.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + +def _decompress_dist(fname): + env = os.environ + if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: + trainer_id = int(env['PADDLE_TRAINER_ID']) + num_trainers = int(env['PADDLE_TRAINERS_NUM']) + if num_trainers <= 1: + _decompress(fname) + else: + lock_path = fname + '.decompress.lock' + from paddle.distributed import ParallelEnv + unique_endpoints = _get_unique_endpoints(ParallelEnv() + .trainer_endpoints[:]) + # NOTE(dkp): _decompress_dist always performed after + # _download_dist, in _download_dist sub-trainers is waiting + # for download lock file release with sleeping, if decompress + # prograss is very fast and finished with in the sleeping gap + # time, e.g in tiny dataset such as coco_ce, spine_coco, main + # trainer may finish decompress and release lock file, so we + # only craete lock file in main trainer and all sub-trainer + # wait 1s for main trainer to create lock file, for 1s is + # twice as sleeping gap, this waiting time can keep all + # trainer pipeline in order + # **change this if you have more elegent methods** + if ParallelEnv().current_endpoint in unique_endpoints: + with open(lock_path, 'w'): # touch + os.utime(lock_path, None) + _decompress(fname) + os.remove(lock_path) + else: + time.sleep(1) + while os.path.exists(lock_path): + time.sleep(0.5) + else: + _decompress(fname) + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not osp.exists(dst): + shutil.move(src, dst) + elif osp.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = osp.join(src, fp) + dst_fp = osp.join(dst, fp) + if osp.isdir(src_fp): + if osp.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif osp.isfile(src_fp) and \ + not osp.isfile(dst_fp): + shutil.move(src_fp, dst_fp) diff --git a/ppdet/utils/fuse_utils.py b/ppdet/utils/fuse_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..647fa995da615fcb2bcdca13f4296f73e3204628 --- /dev/null +++ b/ppdet/utils/fuse_utils.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
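For quick reference, a small sketch of the public helpers exported by ppdet/utils/download.py above (the weight URL and dataset paths are illustrative; any `ppdet://` or plain HTTP(S) URL goes through the same parse_url/cache logic):

    from ppdet.utils.download import (get_weights_path, get_dataset_path,
                                      download_dataset)

    # 'ppdet://' is rewritten to the bcebos prefix and the file is cached
    # under ~/.cache/paddle/weights; a local filesystem path is returned.
    weights = get_weights_path('ppdet://models/yolov3_darknet53_270e_coco.pdparams')

    # If dataset/coco is missing or incomplete, the known 'coco' bundle is
    # downloaded into ~/.cache/paddle/dataset and that directory is returned.
    data_dir = get_dataset_path('dataset/coco',
                                'annotations/instances_val2017.json', 'val2017')

    # Explicit download of a whole dataset bundle into a chosen directory.
    download_dataset('data/roadsign_voc', dataset='roadsign_voc')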
+ +import copy +import paddle +import paddle.nn as nn + +__all__ = ['fuse_conv_bn'] + + +def fuse_conv_bn(model): + is_train = False + if model.training: + model.eval() + is_train = True + fuse_list = [] + tmp_pair = [None, None] + for name, layer in model.named_sublayers(): + if isinstance(layer, nn.Conv2D): + tmp_pair[0] = name + if isinstance(layer, nn.BatchNorm2D): + tmp_pair[1] = name + + if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: + fuse_list.append(tmp_pair) + tmp_pair = [None, None] + model = fuse_layers(model, fuse_list) + if is_train: + model.train() + return model + + +def find_parent_layer_and_sub_name(model, name): + """ + Given the model and the name of a layer, find the parent layer and + the sub_name of the layer. + For example, if name is 'block_1/convbn_1/conv_1', the parent layer is + 'block_1/convbn_1' and the sub_name is `conv_1`. + Args: + model(paddle.nn.Layer): the model to be quantized. + name(string): the name of a layer + + Returns: + parent_layer, subname + """ + assert isinstance(model, nn.Layer), \ + "The model must be the instance of paddle.nn.Layer." + assert len(name) > 0, "The input (name) should not be empty." + + last_idx = 0 + idx = 0 + parent_layer = model + while idx < len(name): + if name[idx] == '.': + sub_name = name[last_idx:idx] + if hasattr(parent_layer, sub_name): + parent_layer = getattr(parent_layer, sub_name) + last_idx = idx + 1 + idx += 1 + sub_name = name[last_idx:idx] + return parent_layer, sub_name + + +class Identity(nn.Layer): + '''a layer to replace bn or relu layers''' + + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def fuse_layers(model, layers_to_fuse, inplace=False): + ''' + fuse layers in layers_to_fuse + + Args: + model(nn.Layer): The model to be fused. + layers_to_fuse(list): The layers' names to be fused. For + example,"fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]". + A TypeError would be raised if "fuse" was set as + True but "fuse_list" was None. + Default: None. + inplace(bool): Whether apply fusing to the input model. + Default: False. + + Return + fused_model(paddle.nn.Layer): The fused model. 
+ ''' + if not inplace: + model = copy.deepcopy(model) + for layers_list in layers_to_fuse: + layer_list = [] + for layer_name in layers_list: + parent_layer, sub_name = find_parent_layer_and_sub_name(model, + layer_name) + layer_list.append(getattr(parent_layer, sub_name)) + new_layers = _fuse_func(layer_list) + for i, item in enumerate(layers_list): + parent_layer, sub_name = find_parent_layer_and_sub_name(model, item) + setattr(parent_layer, sub_name, new_layers[i]) + return model + + +def _fuse_func(layer_list): + '''choose the fuser method and fuse layers''' + types = tuple(type(m) for m in layer_list) + fusion_method = types_to_fusion_method.get(types, None) + new_layers = [None] * len(layer_list) + fused_layer = fusion_method(*layer_list) + for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items(): + fused_layer.register_forward_pre_hook(pre_hook_fn) + del layer_list[0]._forward_pre_hooks[handle_id] + for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items(): + fused_layer.register_forward_post_hook(hook_fn) + del layer_list[-1]._forward_post_hooks[handle_id] + new_layers[0] = fused_layer + for i in range(1, len(layer_list)): + identity = Identity() + identity.training = layer_list[0].training + new_layers[i] = identity + return new_layers + + +def _fuse_conv_bn(conv, bn): + '''fuse conv and bn for train or eval''' + assert(conv.training == bn.training),\ + "Conv and BN both must be in the same mode (train or eval)." + if conv.training: + assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' + raise NotImplementedError + else: + return _fuse_conv_bn_eval(conv, bn) + + +def _fuse_conv_bn_eval(conv, bn): + '''fuse conv and bn for eval''' + assert (not (conv.training or bn.training)), "Fusion only for eval!" + fused_conv = copy.deepcopy(conv) + + fused_weight, fused_bias = _fuse_conv_bn_weights( + fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon, + bn.weight, bn.bias) + fused_conv.weight.set_value(fused_weight) + if fused_conv.bias is None: + fused_conv.bias = paddle.create_parameter( + shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype) + fused_conv.bias.set_value(fused_bias) + return fused_conv + + +def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): + '''fuse weights and bias of conv and bn''' + if conv_b is None: + conv_b = paddle.zeros_like(bn_rm) + if bn_w is None: + bn_w = paddle.ones_like(bn_rm) + if bn_b is None: + bn_b = paddle.zeros_like(bn_rm) + bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps) + conv_w = conv_w * \ + (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) + conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b + return conv_w, conv_b + + +types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, } diff --git a/ppdet/utils/logger.py b/ppdet/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..51e296205273f0cc57fc4007758342cddf5210fa --- /dev/null +++ b/ppdet/utils/logger.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys + +import paddle.distributed as dist + +__all__ = ['setup_logger'] + +logger_initialized = [] + + +def setup_logger(name="ppdet", output=None): + """ + Initialize logger and set its verbosity level to INFO. + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + name (str): the root module name of this logger + + Returns: + logging.Logger: a logger + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + logger.setLevel(logging.INFO) + logger.propagate = False + + formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = dist.get_rank() + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, "log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + os.makedirs(os.path.dirname(filename)) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(logging.Formatter()) + logger.addHandler(fh) + logger_initialized.append(name) + return logger diff --git a/ppdet/utils/profiler.py b/ppdet/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..cae3773fade36cd1d55421dc8d8b212d8f5413d7 --- /dev/null +++ b/ppdet/utils/profiler.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. 
+ state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/ppdet/utils/stats.py b/ppdet/utils/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..c070e6544ed9e25b2ed156156ec8a6379717ebc9 --- /dev/null +++ b/ppdet/utils/stats.py @@ -0,0 +1,94 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np + +__all__ = ['SmoothedValue', 'TrainingStats'] + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({avg:.4f})" + self.deque = collections.deque(maxlen=window_size) + self.fmt = fmt + self.total = 0. + self.count = 0 + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + @property + def median(self): + return np.median(self.deque) + + @property + def avg(self): + return np.mean(self.deque) + + @property + def max(self): + return np.max(self.deque) + + @property + def value(self): + return self.deque[-1] + + @property + def global_avg(self): + return self.total / self.count + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, max=self.max, value=self.value) + + +class TrainingStats(object): + def __init__(self, window_size, delimiter=' '): + self.meters = None + self.window_size = window_size + self.delimiter = delimiter + + def update(self, stats): + if self.meters is None: + self.meters = { + k: SmoothedValue(self.window_size) + for k in stats.keys() + } + for k, v in self.meters.items(): + v.update(float(stats[k])) + + def get(self, extras=None): + stats = collections.OrderedDict() + if extras: + for k, v in extras.items(): + stats[k] = v + for k, v in self.meters.items(): + stats[k] = format(v.median, '.6f') + + return stats + + def log(self, extras=None): + d = self.get(extras) + strs = [] + for k, v in d.items(): + strs.append("{}: {}".format(k, str(v))) + return self.delimiter.join(strs) diff --git a/ppdet/utils/visualizer.py b/ppdet/utils/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..f7193306c93e0917ee400df3f76f28a3f436df08 --- /dev/null +++ b/ppdet/utils/visualizer.py @@ -0,0 +1,457 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
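The profiler hook and the statistics helpers above are meant to sit inside the training loop; a minimal self-contained sketch of how they might be combined (the synthetic loss values and the step count are placeholders for a real training step and dataloader):

    from ppdet.utils.profiler import add_profiler_step
    from ppdet.utils.stats import TrainingStats

    profiler_options = None  # or e.g. "batch_range=[10, 20]; profile_path=/tmp/profile"
    stats = TrainingStats(window_size=20)

    for step_id in range(100):
        # No-op while profiler_options is None; with a string it starts and
        # stops paddle's profiler at the configured steps.
        add_profiler_step(profiler_options)

        loss = 1.0 / (step_id + 1)       # stand-in for a real training loss
        stats.update({'loss': loss})

        if step_id % 20 == 0:
            # e.g. "loss: 1.000000" -- window medians via SmoothedValue
            print(stats.log())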
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +from PIL import Image, ImageDraw +import cv2 +import math + +from .colormap import colormap +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['visualize_results'] + + +def visualize_results(image, + bbox_res, + mask_res, + segm_res, + keypoint_res, + pose3d_res, + im_id, + catid2name, + threshold=0.5): + """ + Visualize bbox and mask results + """ + if bbox_res is not None: + image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) + if mask_res is not None: + image = draw_mask(image, im_id, mask_res, threshold) + if segm_res is not None: + image = draw_segm(image, im_id, catid2name, segm_res, threshold) + if keypoint_res is not None: + image = draw_pose(image, keypoint_res, threshold) + if pose3d_res is not None: + pose3d = np.array(pose3d_res[0]['pose3d']) * 1000 + image = draw_pose3d(image, pose3d, visual_thread=threshold) + return image + + +def draw_mask(image, im_id, segms, threshold, alpha=0.7): + """ + Draw mask on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = colormap(rgb=True) + img_array = np.array(image).astype('float32') + for dt in np.array(segms): + if im_id != dt['image_id']: + continue + segm, score = dt['segmentation'], dt['score'] + if score < threshold: + continue + import pycocotools.mask as mask_util + mask = mask_util.decode(segm) * 255 + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + img_array[idx[0], idx[1], :] *= 1.0 - alpha + img_array[idx[0], idx[1], :] += alpha * color_mask + return Image.fromarray(img_array.astype('uint8')) + + +def draw_bbox(image, im_id, catid2name, bboxes, threshold): + """ + Draw bbox on image + """ + draw = ImageDraw.Draw(image) + + catid2color = {} + color_list = colormap(rgb=True)[:40] + for dt in np.array(bboxes): + if im_id != dt['image_id']: + continue + catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] + if score < threshold: + continue + + if catid not in catid2color: + idx = np.random.randint(len(color_list)) + catid2color[catid] = color_list[idx] + color = tuple(catid2color[catid]) + + # draw bbox + if len(bbox) == 4: + # draw bbox + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill=color) + elif len(bbox) == 8: + x1, y1, x2, y2, x3, y3, x4, y4 = bbox + draw.line( + [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], + width=2, + fill=color) + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + else: + logger.error('the shape of bbox must be [M, 4] or [M, 8]!') + + # draw label + text = "{} {:.2f}".format(catid2name[catid], score) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + return image + + +def save_result(save_path, results, catid2name, threshold): + """ + save result as txt + """ + img_id = int(results["im_id"]) + with open(save_path, 'w') as f: + if "bbox_res" in results: + for dt in results["bbox_res"]: + catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] + if score < threshold: + continue + # each bbox result as a line + # for rbox: classname score x1 y1 x2 y2 
x3 y3 x4 y4 + # for bbox: classname score x1 y1 w h + bbox_pred = '{} {} '.format(catid2name[catid], + score) + ' '.join( + [str(e) for e in bbox]) + f.write(bbox_pred + '\n') + elif "keypoint_res" in results: + for dt in results["keypoint_res"]: + kpts = dt['keypoints'] + scores = dt['score'] + keypoint_pred = [img_id, scores, kpts] + print(keypoint_pred, file=f) + else: + print("No valid results found, skip txt save") + + +def draw_segm(image, + im_id, + catid2name, + segms, + threshold, + alpha=0.7, + draw_box=True): + """ + Draw segmentation on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = colormap(rgb=True) + img_array = np.array(image).astype('float32') + for dt in np.array(segms): + if im_id != dt['image_id']: + continue + segm, score, catid = dt['segmentation'], dt['score'], dt['category_id'] + if score < threshold: + continue + import pycocotools.mask as mask_util + mask = mask_util.decode(segm) * 255 + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + img_array[idx[0], idx[1], :] *= 1.0 - alpha + img_array[idx[0], idx[1], :] += alpha * color_mask + + if not draw_box: + center_y, center_x = ndimage.measurements.center_of_mass(mask) + label_text = "{}".format(catid2name[catid]) + vis_pos = (max(int(center_x) - 10, 0), int(center_y)) + cv2.putText(img_array, label_text, vis_pos, + cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255)) + else: + mask = mask_util.decode(segm) * 255 + sum_x = np.sum(mask, axis=0) + x = np.where(sum_x > 0.5)[0] + sum_y = np.sum(mask, axis=1) + y = np.where(sum_y > 0.5)[0] + x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] + cv2.rectangle(img_array, (x0, y0), (x1, y1), + tuple(color_mask.astype('int32').tolist()), 1) + bbox_text = '%s %.2f' % (catid2name[catid], score) + t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] + cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0], + y0 - t_size[1] - 3), + tuple(color_mask.astype('int32').tolist()), -1) + cv2.putText( + img_array, + bbox_text, (x0, y0 - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.3, (0, 0, 0), + 1, + lineType=cv2.LINE_AA) + + return Image.fromarray(img_array.astype('uint8')) + + +def draw_pose(image, + results, + visual_thread=0.6, + save_name='pose.jpg', + save_dir='output', + returnimg=False, + ids=None): + try: + import matplotlib.pyplot as plt + import matplotlib + plt.switch_backend('agg') + except Exception as e: + logger.error('Matplotlib not found, please install matplotlib.' 
+ 'for example: `pip install matplotlib`.') + raise e + + skeletons = np.array([item['keypoints'] for item in results]) + kpt_nums = 17 + if len(skeletons) > 0: + kpt_nums = int(skeletons.shape[1] / 3) + skeletons = skeletons.reshape(-1, kpt_nums, 3) + if kpt_nums == 17: #plot coco keypoint + EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), + (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), + (13, 15), (14, 16), (11, 12)] + else: #plot mpii keypoint + EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8), + (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12), + (8, 13)] + NUM_EDGES = len(EDGES) + + colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ + [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ + [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] + cmap = matplotlib.cm.get_cmap('hsv') + plt.figure() + + img = np.array(image).astype('float32') + + color_set = results['colors'] if 'colors' in results else None + + if 'bbox' in results and ids is None: + bboxs = results['bbox'] + for j, rect in enumerate(bboxs): + xmin, ymin, xmax, ymax = rect + color = colors[0] if color_set is None else colors[color_set[j] % + len(colors)] + cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) + + canvas = img.copy() + for i in range(kpt_nums): + for j in range(len(skeletons)): + if skeletons[j][i, 2] < visual_thread: + continue + if ids is None: + color = colors[i] if color_set is None else colors[color_set[j] + % + len(colors)] + else: + color = get_color(ids[j]) + + cv2.circle( + canvas, + tuple(skeletons[j][i, 0:2].astype('int32')), + 2, + color, + thickness=-1) + + to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) + fig = matplotlib.pyplot.gcf() + + stickwidth = 2 + + for i in range(NUM_EDGES): + for j in range(len(skeletons)): + edge = EDGES[i] + if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[ + 1], 2] < visual_thread: + continue + + cur_canvas = canvas.copy() + X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] + Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), + (int(length / 2), stickwidth), + int(angle), 0, 360, 1) + if ids is None: + color = colors[i] if color_set is None else colors[color_set[j] + % + len(colors)] + else: + color = get_color(ids[j]) + cv2.fillConvexPoly(cur_canvas, polygon, color) + canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + image = Image.fromarray(canvas.astype('uint8')) + plt.close() + return image + + +def draw_pose3d(image, + pose3d, + pose2d=None, + visual_thread=0.6, + save_name='pose3d.jpg', + returnimg=True): + try: + import matplotlib.pyplot as plt + import matplotlib + plt.switch_backend('agg') + except Exception as e: + logger.error('Matplotlib not found, please install matplotlib.' 
+ 'for example: `pip install matplotlib`.') + raise e + + if pose3d.shape[0] == 24: + joints_connectivity_dict = [ + [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1], + [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0], + [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1], + [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0], + [23, 21, 1] + ] + elif pose3d.shape[0] == 14: + joints_connectivity_dict = [ + [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0], + [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1], + [8, 12, 0], [9, 12, 1], [12, 13, 1] + ] + else: + print( + "not defined joints number :{}, cannot visualize because unknown of joint connectivity". + format(pose.shape[0])) + return + + def draw3Dpose(pose3d, + ax, + lcolor="#3498db", + rcolor="#e74c3c", + add_labels=False): + # pose3d = orthographic_projection(pose3d, cam) + for i in joints_connectivity_dict: + x, y, z = [ + np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3) + ] + ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor) + + RADIUS = 1000 + center_xy = 2 if pose3d.shape[0] == 14 else 14 + x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy, + 2] + ax.set_xlim3d([-RADIUS + x, RADIUS + x]) + ax.set_ylim3d([-RADIUS + y, RADIUS + y]) + ax.set_zlim3d([-RADIUS + z, RADIUS + z]) + + ax.set_xlabel("x") + ax.set_ylabel("y") + ax.set_zlabel("z") + + def draw2Dpose(pose2d, + ax, + lcolor="#3498db", + rcolor="#e74c3c", + add_labels=False): + for i in joints_connectivity_dict: + if pose2d[i[0], 2] and pose2d[i[1], 2]: + x, y = [ + np.array([pose2d[i[0], j], pose2d[i[1], j]]) + for j in range(2) + ] + ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor) + + def draw_img_pose(pose3d, + pose2d=None, + frame=None, + figsize=(12, 12), + savepath=None): + fig = plt.figure(figsize=figsize, dpi=80) + # fig.clear() + fig.tight_layout() + + ax = fig.add_subplot(221) + if frame is not None: + ax.imshow(frame, interpolation='nearest') + if pose2d is not None: + draw2Dpose(pose2d, ax) + + ax = fig.add_subplot(222, projection='3d') + ax.view_init(45, 45) + draw3Dpose(pose3d, ax) + ax = fig.add_subplot(223, projection='3d') + ax.view_init(0, 0) + draw3Dpose(pose3d, ax) + ax = fig.add_subplot(224, projection='3d') + ax.view_init(0, 90) + draw3Dpose(pose3d, ax) + + if savepath is not None: + plt.savefig(savepath) + plt.close() + else: + return fig + + def fig2data(fig): + """ + fig = plt.figure() + image = fig2data(fig) + @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it + @param fig a matplotlib figure + @return a numpy 3D array of RGBA values + """ + # draw the renderer + fig.canvas.draw() + + # Get the RGBA buffer from the figure + w, h = fig.canvas.get_width_height() + buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8) + buf.shape = (w, h, 4) + + # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode + buf = np.roll(buf, 3, axis=2) + image = Image.frombytes("RGBA", (w, h), buf.tostring()) + return image.convert("RGB") + + fig = draw_img_pose(pose3d, pose2d, frame=image) + data = fig2data(fig) + if returnimg is False: + data.save(save_name) + else: + return data diff --git a/ppdet/utils/voc_utils.py b/ppdet/utils/voc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6d9f90ea85e355562d9bb8bd30319deb0f7901 --- /dev/null +++ b/ppdet/utils/voc_utils.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import re +import random + +__all__ = ['create_list'] + + +def create_list(devkit_dir, years, output_dir): + """ + create following list: + 1. trainval.txt + 2. test.txt + """ + trainval_list = [] + test_list = [] + for year in years: + trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) + trainval_list.extend(trainval) + test_list.extend(test) + + random.shuffle(trainval_list) + with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: + for item in trainval_list: + ftrainval.write(item[0] + ' ' + item[1] + '\n') + + with open(osp.join(output_dir, 'test.txt'), 'w') as fval: + ct = 0 + for item in test_list: + ct += 1 + fval.write(item[0] + ' ' + item[1] + '\n') + + +def _get_voc_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) + + +def _walk_voc_dir(devkit_dir, year, output_dir): + filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') + img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') + trainval_list = [] + test_list = [] + added = set() + + for _, _, files in os.walk(filelist_dir): + for fname in files: + img_ann_list = [] + if re.match(r'[a-z]+_trainval\.txt', fname): + img_ann_list = trainval_list + elif re.match(r'[a-z]+_test\.txt', fname): + img_ann_list = test_list + else: + continue + fpath = osp.join(filelist_dir, fname) + for line in open(fpath): + name_prefix = line.strip().split()[0] + if name_prefix in added: + continue + added.add(name_prefix) + ann_path = osp.join( + osp.relpath(annotation_dir, output_dir), + name_prefix + '.xml') + img_path = osp.join( + osp.relpath(img_dir, output_dir), name_prefix + '.jpg') + img_ann_list.append((img_path, ann_path)) + + return trainval_list, test_list diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d281b346b328ac407c84846c5a63419eed66f3ad --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +numpy < 1.24 +tqdm +typeguard +visualdl>=2.2.0 +opencv-python <= 4.6.0 +PyYAML +shapely +scipy +terminaltables +Cython +pycocotools +setuptools diff --git a/scripts/analysis.py b/scripts/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..bc59b371536d26fdbc788697ccdc53ab6ef3e0fd --- /dev/null +++ b/scripts/analysis.py @@ -0,0 +1,347 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import json +import os +import re +import traceback + +from numpy import mean, var + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--filename", type=str, help="The name of log which need to analysis.") + parser.add_argument( + "--speed_log_file", type=str, help="json file") + parser.add_argument( + "--log_with_profiler", type=str, help="The path of train log with profiler") + parser.add_argument( + "--profiler_path", type=str, help="The path of profiler timeline log.") + parser.add_argument( + "--keyword", type=str, help="Keyword to specify analysis data") + parser.add_argument( + "--separator", type=str, default=None, help="Separator of different field in log") + parser.add_argument( + '--position', type=int, default=None, help='The position of data field') + parser.add_argument( + '--range', type=str, default="", help='The range of data field to intercept') + parser.add_argument( + '--skip_steps', type=int, default=0, help='The number of steps to be skipped') + parser.add_argument( + '--model_mode', type=int, default=-1, help='Analysis mode, default value is -1') + + parser.add_argument( + '--model_name', type=str, default="model_name", help='training model_name, transformer_base') + parser.add_argument( + '--base_batch_size', type=int, help='base_batch size on gpu') + parser.add_argument( + '--fp_item', type=str, help='fp_item:fp16|fp32') + parser.add_argument( + '--run_mode', type=str, default="DP", help='DP|MP|PP') + parser.add_argument( + '--convergence_key', type=str, default="", help="Keyword to specify loss data") + parser.add_argument( + '--speed_unit', type=str, default="images/s", help='IPS unit') + parser.add_argument( + '--device_num', type=str, default='N1C1', help='device_num:N1C1|N1C8|N4C32') + args = parser.parse_args() + args.separator = None if args.separator == "None" else args.separator + return args + + +def _is_number(num): + pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') + result = pattern.match(num) + if result: + return True + else: + return False + + +class TimeAnalyzer(object): + def __init__(self, filename, keyword=None, separator=None, position=None, range="-1"): + if filename is None: + raise Exception("Please specify the filename!") + + if keyword is None: + raise Exception("Please specify the keyword!") + + self.filename = filename + self.keyword = keyword + self.separator = separator + self.position = position + self.range = range + self.records = None + self._distil() + + def _distil(self): + self.records = [] + with open(self.filename, "r") as f_object: + lines = f_object.readlines() + for line in lines: + if self.keyword not in line: + continue + try: + result = None + + # Distil the string from a line. + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + if args.position: + result = line_words[self.position] + else: + # Distil the string following the keyword. 
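+                            # e.g. with keyword "ips:" and a (hypothetical) log line
+                            # "... ips: 128.4 images/s", the token right after the keyword, "128.4", is picked.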
+ for i in range(len(line_words) - 1): + if line_words[i] == self.keyword: + result = line_words[i + 1] + break + + # Distil the result from the picked string. + if not self.range: + result = result[0:] + elif _is_number(self.range): + result = result[0: int(self.range)] + else: + result = result[int(self.range.split(":")[0]): int(self.range.split(":")[1])] + self.records.append(float(result)) + except Exception as exc: + print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) + + print("Extract {} records: separator={}; position={}".format(len(self.records), self.separator, self.position)) + + def _get_fps(self, mode, base_batch_size, gpu_num, avg_of_records, unit=None): + if mode == -1: + assert unit, "Please set the unit when mode is -1." + fps = gpu_num * avg_of_records + elif mode == 0: + # s/step -> samples/s + fps = (base_batch_size * gpu_num) / avg_of_records + unit = "samples/s" + elif mode == 1: + # steps/s -> steps/s + fps = avg_of_records + unit = "steps/s" + elif mode == 2: + # s/step -> steps/s + fps = 1 / avg_of_records + unit = "steps/s" + elif mode == 3: + # steps/s -> samples/s + fps = base_batch_size * gpu_num * avg_of_records + unit = "samples/s" + elif mode == 4: + # s/epoch -> s/epoch + fps = avg_of_records + unit = "s/epoch" + else: + ValueError("Unsupported analysis mode.") + + return fps, unit + + def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None): + if base_batch_size <= 0: + print("base_batch_size should larger than 0.") + return 0, '' + + if len(self.records) <= (skip_steps + 10): # to address the condition which item of log equals to skip_steps + print("ERROR!!! too few logs printed") + return 0, '' + + sum_of_records = 0 + sum_of_records_skipped = 0 + skip_min = self.records[skip_steps] + skip_max = self.records[skip_steps] + + count = len(self.records) + # 1 计算skip 后平均值 + for i in range(count): + sum_of_records += self.records[i] + if i >= skip_steps: + sum_of_records_skipped += self.records[i] + if self.records[i] < skip_min: + skip_min = self.records[i] + if self.records[i] > skip_max: + skip_max = self.records[i] + avg_of_records = sum_of_records / float(count) + avg_of_records_skipped = sum_of_records_skipped / float(count - skip_steps) + # 2 skip后去掉去除前max(5%,5)和后max(5%,5)个数据再计算平均值 + sorted_records = sorted(self.records[skip_steps:]) + skip_step2 = max(int(len(sorted_records) * 0.05), 5) + try: + del sorted_records[:skip_step2] + del sorted_records[-skip_step2:] + avg_of_sorted_records = mean(sorted_records) + var_of_sorted_records = var(sorted_records) + skip_min = min(sorted_records) + skip_max = max(sorted_records) + except Exception: + print("no records") + return 0, '' + + fps, fps_unit = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records, unit) + # fps_skipped, _ = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records_skipped, unit) + Fips, _ = self._get_fps(mode, base_batch_size, gpu_num, avg_of_sorted_records, unit) + + if mode == -1: + print("average ips of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average ips of %d steps, skip %d steps, valid steps %d :" % (count, \ + skip_steps, + count - skip_steps - 2 * skip_step2)) + print("\tvar: %.3f " % (var_of_sorted_records)) + print("\tAvg: %.3f %s" % (avg_of_sorted_records, fps_unit)) + print("\tMin: %.3f %s" % (skip_min, fps_unit)) + print("\tMax: %.3f %s" % (skip_max, fps_unit)) + 
print("\tdiff_min_max: %.3f %s" % ((skip_min - skip_max) * 100 / skip_max, "%")) + print("\tFPS: %.3f %s" % (Fips, fps_unit)) + elif mode == 1 or mode == 3: + print("average latency of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f steps/s" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f steps/s" % avg_of_records_skipped) + print("\tMin: %.3f steps/s" % skip_min) + print("\tMax: %.3f steps/s" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + elif mode == 0 or mode == 2: + print("average latency of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f s/step" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f s/step" % avg_of_records_skipped) + print("\tMin: %.3f s/step" % skip_min) + print("\tMax: %.3f s/step" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + + return round(Fips, 3), fps_unit + + +class ExceptionTest(Exception): + pass + + +class LossAnalyzer(object): + def __init__(self, filename, convergence_key=None, separator=None): + if filename is None: + raise Exception("Please specify the filename!") + if convergence_key is None: + raise Exception("Please specify the keyword of loss!") + self.filename = filename + self.convergence_key = convergence_key + self.separator = separator + + def get_loss(self): + with open(self.filename, "r") as f_object: + lines = f_object.readlines() + lines.reverse() + result_loss = 0 + for line in lines: + if self.convergence_key not in line: + continue + try: + result_loss = 0 + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + for i in range(len(line_words) - 1): + if line_words[i] == self.convergence_key: + result_loss = line_words[i + 1] + result_loss = result_loss.replace(',', '') + raise ExceptionTest() + except ExceptionTest: + break + print("\tLoss: {}".format(result_loss)) + return result_loss + + def get_loss_avg(self): + # for gpt3 CE loss + with open(self.filename, "r") as f_object: + lines = f_object.readlines() + result_loss_list = [] + for line in lines: + if self.convergence_key in line and "tokens/s" in line: + result_loss = 0 + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + for i in range(len(line_words) - 1): + if line_words[i] == self.convergence_key: + result_loss = line_words[i + 1] + result_loss = result_loss.replace(',', '') + try: + result_loss_list.append(float(result_loss)) + except: + result_loss_list.append(-1) + result_loss_avg = sum(result_loss_list[-100:]) / 100 + print("\tLoss: {}".format(result_loss_avg)) + return result_loss_avg + + +if __name__ == "__main__": + args = parse_args() + run_info = dict() + run_info["model_branch"] = os.getenv("model_branch") + run_info["model_commit"] = os.getenv("model_commit") + run_info["model_name"] = args.model_name + run_info["batch_size"] = args.base_batch_size + run_info["fp_item"] = args.fp_item + if re.match(r'DP.-MP.-PP.', args.run_mode) or 'DP_MoE_C' in args.run_mode or 'Sharding_MoE_C' in args.run_mode \ + or re.match(r'DP._MP.', args.run_mode): + run_info["run_mode"] = 'Collective' + else: + run_info["run_mode"] = args.run_mode + run_info["convergence_value"] = 0 + run_info["convergence_key"] = args.convergence_key + run_info["ips"] = 0 + run_info["speed_unit"] = 
args.speed_unit + run_info["device_num"] = args.device_num + run_info["model_run_time"] = os.getenv('model_run_time') + run_info["frame_commit"] = os.getenv('frame_commit') + run_info["frame_version"] = os.getenv('frame_version') + device_num = args.device_num + print("---device_num:-", device_num) + index_c = device_num.index('C') + print("---index_c:-", index_c) + gpu_num = int(device_num[index_c + 1:len(device_num)]) + print("-----gpu_num:", gpu_num) + if "pwgan" in args.model_name: + print("------analysis ", args.model_name) + args.keyword = "avg_ips:" + + try: + analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, args.position, args.range) + run_info["ips"], run_info["speed_unit"] = analyzer.analysis( + base_batch_size=args.base_batch_size, + gpu_num=gpu_num, + skip_steps=args.skip_steps, + mode=args.model_mode, + unit=args.speed_unit) + if "CE_gpt3" in run_info["model_name"]: + loss_analyzer = LossAnalyzer(args.filename, args.convergence_key) + run_info["convergence_value"] = loss_analyzer.get_loss_avg() + elif args.convergence_key != "": + loss_analyzer = LossAnalyzer(args.filename, args.convergence_key) + run_info["convergence_value"] = loss_analyzer.get_loss() + except Exception: + traceback.print_exc() + print("{}".format(json.dumps(run_info))) # it's required, for the log file path insert to the database + with open(args.speed_log_file, "w") as f: + f.write(json.dumps(run_info)) diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b6ea06836b30328e2b54e916c4f6ba70834264c --- /dev/null +++ b/scripts/build_wheel.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#================================================= +# Utils +#================================================= + + +# directory config +DIST_DIR="dist" +BUILD_DIR="build" +EGG_DIR="paddledet.egg-info" + +CFG_DIR="configs" +TEST_DIR=".tests" +DATA_DIR="dataset" + +# command line log config +RED='\033[0;31m' +BLUE='\033[0;34m' +GREEN='\033[1;32m' +BOLD='\033[1m' +NONE='\033[0m' + +function python_version_check() { + PY_MAIN_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'` + PY_SUB_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'` + echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}" + if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "5" ]; then + echo -e "${RED}FAIL:${NONE} please use Python >= 3.5 !" + exit 1 + fi +} + +function init() { + echo -e "${BLUE}[init]${NONE} removing building directory..." + rm -rf $DIST_DIR $BUILD_DIR $EGG_DIR $TEST_DIR + if [ `pip list | grep paddledet | wc -l` -gt 0 ]; then + echo -e "${BLUE}[init]${NONE} uninstalling paddledet..." + pip uninstall -y paddledet + fi + echo -e "${BLUE}[init]${NONE} ${GREEN}init success\n" +} + +function build_and_install() { + echo -e "${BLUE}[build]${NONE} building paddledet wheel..." 
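+    # sdist/bdist_wheel place the source tarball and the wheel under ./dist ($DIST_DIR)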
+ python setup.py sdist bdist_wheel + if [ $? -ne 0 ]; then + echo -e "${RED}[FAIL]${NONE} build paddledet wheel failed !" + exit 1 + fi + echo -e "${BLUE}[build]${NONE} ${GREEN}build paddldet wheel success\n" + + echo -e "${BLUE}[install]${NONE} installing paddledet..." + cd $DIST_DIR + find . -name "paddledet*.whl" | xargs pip install + if [ $? -ne 0 ]; then + cd .. + echo -e "${RED}[FAIL]${NONE} install paddledet wheel failed !" + exit 1 + fi + echo -e "${BLUE}[install]${NONE} ${GREEN}paddledet install success\n" + cd .. +} + +function unittest() { + if [ -d $TEST_DIR ]; then + rm -rf $TEST_DIR + fi; + + echo -e "${BLUE}[unittest]${NONE} run unittests..." + + # NOTE: perform unittests under TEST_DIR to + # make sure installed paddledet is used + mkdir $TEST_DIR + cp -r $CFG_DIR $TEST_DIR + cp -r $DATA_DIR $TEST_DIR + cd $TEST_DIR + + if [ $? != 0 ]; then + exit 1 + fi + find "../ppdet" -wholename '*tests/test_*' -type f -print0 | \ + xargs -0 -I{} -n1 -t bash -c 'python -u -s {}' + + # clean TEST_DIR + cd .. + rm -rf $TEST_DIR + echo -e "${BLUE}[unittest]${NONE} ${GREEN}unittests success\n${NONE}" +} + +function cleanup() { + if [ -d $TEST_DIR ]; then + rm -rf $TEST_DIR + fi + + rm -rf $BUILD_DIR $EGG_DIR + pip uninstall -y paddledet +} + +function abort() { + echo -e "${RED}[FAIL]${NONE} build wheel and unittest failed ! + please check your code" 1>&2 + + cur_dir=`basename "$pwd"` + if [ cur_dir==$TEST_DIR -o cur_dir==$DIST_DIR ]; then + cd .. + fi + + rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR $TEST_DIR + pip uninstall -y paddledet +} + +python_version_check + +trap 'abort' 0 +set -e + +init +build_and_install +unittest +cleanup + +# get Paddle version +PADDLE_VERSION=`python -c "import paddle; print(paddle.version.full_version)"` +PADDLE_COMMIT=`python -c "import paddle; print(paddle.version.commit)"` +PADDLE_COMMIT=`git rev-parse --short $PADDLE_COMMIT` + +# get PaddleDetection branch +PPDET_BRANCH=`git rev-parse --abbrev-ref HEAD` +PPDET_COMMIT=`git rev-parse --short HEAD` + +# get Python version +PYTHON_VERSION=`python -c "import platform; print(platform.python_version())"` + +echo -e "\n${GREEN}paddledet wheel compiled and checked success !${NONE} + ${BLUE}Python version:${NONE} $PYTHON_VERSION + ${BLUE}Paddle version:${NONE} $PADDLE_VERSION ($PADDLE_COMMIT) + ${BLUE}PaddleDetection branch:${NONE} $PPDET_BRANCH ($PPDET_COMMIT)\n" + +echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist" + +trap : 0 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..bc057d393857177d717e51136a900926b39cf7bb --- /dev/null +++ b/setup.py @@ -0,0 +1,133 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import os.path as osp +import glob +import shutil +import subprocess +from setuptools import find_packages, setup + +# ============== version definition ============== + +PPDET_VERSION = "2.6.0" + + +def parse_version(): + return PPDET_VERSION.replace('-', '') + + +def git_commit(): + try: + cmd = ['git', 'rev-parse', 'HEAD'] + git_commit = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, ).communicate()[0].strip() + git_commit = git_commit.decode() + except: + git_commit = 'Unknown' + + return str(git_commit) + + +def write_version_py(filename='ppdet/version.py'): + ver_str = """# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +full_version = '%(version)s' +commit = '%(commit)s' +""" + + _git_commit = git_commit() + with open(filename, 'w') as f: + f.write(ver_str % {'version': PPDET_VERSION, 'commit': _git_commit}) + + +write_version_py() + +# ============== version definition ============== + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +def parse_requirements(fname): + with open(fname, encoding="utf-8-sig") as f: + requirements = f.readlines() + return requirements + + +def package_model_zoo(): + cur_dir = osp.dirname(osp.realpath(__file__)) + cfg_dir = osp.join(cur_dir, "configs") + cfgs = glob.glob(osp.join(cfg_dir, '*/*.yml')) + + valid_cfgs = [] + for cfg in cfgs: + # exclude dataset base config + if osp.split(osp.split(cfg)[0])[1] not in ['datasets']: + valid_cfgs.append(cfg) + model_names = [ + osp.relpath(cfg, cfg_dir).replace(".yml", "") for cfg in valid_cfgs + ] + + model_zoo_file = osp.join(cur_dir, 'ppdet', 'model_zoo', 'MODEL_ZOO') + with open(model_zoo_file, 'w') as wf: + for model_name in model_names: + wf.write("{}\n".format(model_name)) + + return [model_zoo_file] + + +packages = [ + 'ppdet', + 'ppdet.core', + 'ppdet.data', + 'ppdet.engine', + 'ppdet.metrics', + 'ppdet.modeling', + 'ppdet.model_zoo', + 'ppdet.slim', + 'ppdet.utils', +] + +if __name__ == "__main__": + setup( + name='paddledet', + packages=find_packages(exclude=("configs", "tools", "deploy")), + package_data={'ppdet.model_zoo': package_model_zoo()}, + author='PaddlePaddle', + version=parse_version(), + install_requires=parse_requirements('./requirements.txt'), + description='Object detection and instance segmentation toolkit based on PaddlePaddle', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/PaddlePaddle/PaddleDetection', + download_url='https://github.com/PaddlePaddle/PaddleDetection.git', + keywords=['ppdet paddle ppyolo'], + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Topic :: Utilities' + ], + license='Apache License 2.0', + ext_modules=[]) diff --git a/test_tipc/README.md b/test_tipc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..42b1b7458b85eadef47a1c92533aff90d97ebd85 --- /dev/null +++ b/test_tipc/README.md @@ -0,0 +1,113 @@ + +# 飞桨训推一体认证 + +## 1. 简介 + +飞桨除了基本的模型训练和预测,还提供了支持多端多平台的高性能推理部署工具。 +本文档提供了PaddleDetection中所有模型的飞桨训推一体认证 (Training and Inference Pipeline Certification(TIPC)) 信息和测试工具, +方便用户查阅每种模型的训练推理部署打通情况,并可以进行一键测试。 + +
+ +
+
+
+## 2. 汇总信息
+
+已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。
+
+**字段说明:**
+- 基础训练预测:包括模型训练、Paddle Inference Python预测。
+- 更多训练方式:包括多机多卡、混合精度。
+- 模型压缩:包括裁剪、离线/在线量化、蒸馏。
+- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署等。
+
+更详细的mkldnn、Tensorrt等预测加速相关功能的支持情况可以查看各测试工具的[更多教程](#more)。
+
+| 算法论文 | 模型名称 | 模型类型 | 基础<br>训练预测 | 更多<br>训练方式 | 模型压缩 | 其他预测部署 |
+| :--- |:-----------------------------------------------------------------------------------| :----: | :--------: | :---- | :---- | :---- |
+| [PPYOLO](https://arxiv.org/abs/2007.12099) | [ppyolo_mbv3_large_coco](../configs/ppyolo/ppyolo_mbv3_large_coco.yml) | 目标检测 | 支持 | 混合精度 | FPGM裁剪 <br> PACT量化 <br> 离线量化 | Paddle Inference: C++ |
+| [PPYOLOv2](https://arxiv.org/abs/2104.10419) | [ppyolov2_r50vd_dcn_365e_coco](../configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml) | 目标检测 | 支持 | 多机多卡<br>混合精度 | | Paddle Inference: C++ |
+| [PP-PicoDet](https://arxiv.org/abs/2111.00902) | [picodet_s_320_coco_lcnet](../configs/picodet/picodet_s_320_coco_lcnet.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ |
+
+更详细的汇总信息可以查看[更多模型](docs/more_models.md)
+
+## 3. 测试工具简介
+### 目录介绍
+
+```shell
+test_tipc/
+├── configs/ # 配置文件目录
+│   ├── ppyolo # ppyolo参数目录
+│   │   ├──ppyolo_mbv3_large_coco.txt
+│   │   ├──ppyolo_r50vd_dcn_1x_coco.txt
+│   │   ├──ppyolov2_r50vd_dcn_365e_coco.txt
+│   ├── yolov3 # yolov3参数目录
+│   │   ├──yolov3_darknet53_270e_coco.txt
+│   ├── ...
+├── docs/ # 相关说明文档目录
+│   ├── ...
+├── results/ # 预先保存的预测结果,用于和实际预测结果进行精度比对
+│   ├── xxx.txt
+│   ├── ...
+├── compare_results.py # 用于对比log中的预测结果与results中的预存结果精度误差是否在限定范围内
+├── prepare.sh # 完成test_*.sh运行所需要的数据和模型下载
+├── README.md # 使用文档
+├── test_inference_cpp.sh # 测试c++预测的主程序
+├── test_lite.sh # 测试lite部署预测的主程序
+├── test_serving.sh # 测试serving部署预测的主程序
+├── test_train_inference_python.sh # 测试python训练预测的主程序
+└── utils_func.sh # test_*.sh中需要用到的工具类函数
+```
+
+### 测试流程概述
+使用本工具,可以测试不同功能的支持情况,以及预测结果是否对齐,测试流程概括如下:
+
+ +
+ +1. 运行prepare.sh准备测试所需数据和模型; +2. 运行要测试的功能对应的测试脚本`test_*.sh`,产出log,由log可以看到不同配置是否运行成功; +3. 用`compare_results.py`对比log中的预测结果和预存在results目录下的结果,判断预测精度是否符合预期(在误差范围内)。 + +测试单项功能仅需两行命令,**如需测试不同模型/功能,替换配置文件即可**,命令格式如下: +```shell +# 功能:准备数据 +# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择 +bash test_tipc/prepare.sh configs/[model_name]/[params_file_name] [Mode] + +# 功能:运行测试 +# 格式:bash + 运行脚本 + 参数1: 配置文件选择 + 参数2: 模式选择 +bash test_tipc/test_train_inference_python.sh configs/[model_name]/[params_file_name] [Mode] +``` + +例如,测试基本训练预测功能的`lite_train_lite_infer`模式,运行: +```shell +# 准备数据 +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_lite_infer' +# 运行测试 +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_lite_infer' +``` +关于本示例命令的更多信息可查看[基础训练预测使用文档](docs/test_train_inference_python.md)。 + +### 配置文件命名规范 +在`configs`目录下,**按模型名称划分为子目录**,子目录中存放所有该模型测试需要用到的配置文件,配置文件的命名遵循如下规范: + +1. 基础训练预测配置简单命名为:`train_infer_python.txt`,表示**Linux环境下单机、不使用混合精度训练+python预测**,其完整命名对应`train_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt`,由于本配置文件使用频率较高,这里进行了名称简化。 + +2. 其他带训练配置命名格式为:`train_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_预测模式(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`。如,linux gpu下多机多卡+混合精度链条测试对应配置 `train_linux_gpu_fleet_amp_infer_python_linux_gpu_cpu.txt`,linux dcu下基础训练预测对应配置 `train_linux_dcu_normal_normal_infer_python_linux_dcu.txt`。 + +3. 仅预测的配置(如serving、lite等)命名格式:`model_训练硬件环境(linux_gpu/linux_dcu/…)_是否多机(fleet/normal)_是否混合精度(amp/normal)_(infer/lite/serving/js)_语言(cpp/python/java)_预测硬件环境(linux_gpu/mac/jetson/opencl_arm_gpu/...).txt`,即,与2相比,仅第一个字段从train换为model,测试时模型直接下载获取,这里的“训练硬件环境”表示所测试的模型是在哪种环境下训练得到的。 + +**根据上述命名规范,可以直接从子目录名称和配置文件名找到需要测试的场景和功能对应的配置文件。** + + + +## 4. 
开始测试 +各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程: +- [test_train_inference_python 使用](docs/test_train_inference_python.md) :测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 +- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能。 +- [test_inference_cpp 使用](docs/test_inference_cpp.md):测试基于C++的模型推理。 +- [test_serving 使用](docs/test_serving.md):测试基于Paddle Serving的服务化部署功能,包括Python、C++。 +- test_lite_arm_cpu_cpp 使用(待开发):测试基于Paddle-Lite的ARM CPU端c++预测部署功能。 +- [test_paddle2onnx 使用](docs/test_paddle2onnx.md):测试Paddle2ONNX的模型转化功能,并验证正确性。 +- [test_ptq_inference_python 使用](docs/test_ptq_inference_python.md):测试基于Python的离线量化功能。 diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..41ff748c68edda622a63e3fc1e0147291bf01171 --- /dev/null +++ b/test_tipc/benchmark_train.sh @@ -0,0 +1,306 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function set_dynamic_epoch(){ + string=$1 + num=$2 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + ep=`expr $num \* $P` + echo $ep +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取benchmark_params所在的行数 +line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +repeat=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` 
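+# "flags" holds ';'-separated environment variables (e.g. FLAGS_* switches) exported before training; "null" means none are set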
+flags_value=$(func_parser_value "${lines[line_num]}") +if [ ${flags_value} != "null" ];then + # set flags + IFS=";" + flags_list=(${flags_value}) + for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd + done +fi + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "${python}" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num="N1C4" + device_num_list=($device_num) + run_mode="DP" +elif [[ ${PARAMS} = "dynamicTostatic" ]] ;then + IFS="|" + model_type=$PARAMS + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num="N1C4" + device_num_list=($device_num) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +if [[ ${model_name} =~ "yolov5" ]];then + echo "${model_name} run unset MosaicPerspective and RandomHSV" + eval "sed -i '10c 10c - MosaicPerspective: {mosaic_prob: 0.0, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1}' configs/yolov5/_base_/yolov5_reader_high_aug.yml" + eval "sed -i 's/10c//' configs/yolov5/_base_/yolov5_reader_high_aug.yml" + eval "sed -i 's/^ - RandomHSV: /#&/' configs/yolov5/_base_/yolov5_reader_high_aug.yml" +fi + +# for log name +to_static="" +# parse "to_static" options and modify trainer into "to_static_trainer" +if [[ ${model_type} = "dynamicTostatic" ]];then + to_static="d2sT_" + sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME + #yolov5 and yolov7 static need MosaicPerspective + eval "sed -i '10c 10c - MosaicPerspective: {mosaic_prob: 1.0, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1}' configs/yolov5/_base_/yolov5_reader_high_aug.yml" + eval "sed -i 's/10c//' configs/yolov5/_base_/yolov5_reader_high_aug.yml" + eval "sed -i '10c 10c - MosaicPerspective: {mosaic_prob: 1.0, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1}' configs/yolov7/_base_/yolov7_reader.yml" + eval "sed -i 's/10c//' configs/yolov7/_base_/yolov7_reader.yml" +fi + + + +if [[ ${model_name} =~ "higherhrnet" ]] || [[ ${model_name} =~ "hrnet" ]] || [[ ${model_name} =~ "tinypose" ]] || [[ ${model_name} =~ "ppyoloe_r_crn_s_3x_spine_coco" ]] ;then + echo "${model_name} run on full coco dataset" + epoch=$(set_dynamic_epoch $device_num $epoch) +else + epoch=1 + repeat=$(set_dynamic_epoch $device_num $repeat) + eval "sed -i '10c\ repeat: ${repeat}' configs/datasets/coco_detection.yml" + eval "sed -i '10c\ repeat: ${repeat}' configs/datasets/coco_instance.yml" + eval "sed -i '10c\ repeat: ${repeat}' 
configs/datasets/mot.yml" +fi + + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 4 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" "${model_name}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode 
${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 4 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" "${model_name}" + fi + done + done +done diff --git a/test_tipc/benchmark_train.txt b/test_tipc/benchmark_train.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed57cb59c143db49919587c5f2913fe732ba16e3 --- /dev/null +++ b/test_tipc/benchmark_train.txt @@ -0,0 +1,60 @@ +===========================train_params=========================== +model_name:ppyolo_mbv3_large_coco +python:python +gpu_list:0,1,2,3,4,5,6,7 +use_gpu:True +auto_cast:fp16 +epoch:benchmark_train=1 +save_dir:null +TrainReader.batch_size:benchmark_train=24 +pretrain_weights:https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_large_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml -o +pact_train:tools/train.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml --slim_config configs/slim/quant/ppyolo_mbv3_large_qat.yml -o +fpgm_train:tools/train.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml --slim_config configs/slim/prune/ppyolo_mbv3_large_prune_fpgm.yml -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_large_coco.pdparams +norm_export:null +pact_export:tools/export_model.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml --slim_config configs/slim/quant/ppyolo_mbv3_large_qat.yml -o +fpgm_export:tools/export_model.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml --slim_config configs/slim/prune/ppyolo_mbv3_large_prune_fpgm.yml -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/ppyolo/ppyolo_mbv3_large_coco.yml --slim_config configs/slim/post_quant/ppyolo_mbv3_large_ptq.yml -o +## +infer_mode:norm|kl_quant +infer_quant:False|True +inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +null:null +===========================train_benchmark_params========================== +batch_size:24 +fp_items:fp32|fp16 +epoch:1 +repeat:10 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:null +===========================infer_benchmark_params=========================== +numpy_infer_input:3x320x320.npy \ No newline at end of file diff --git a/test_tipc/compare_results.py b/test_tipc/compare_results.py new file mode 100644 index 0000000000000000000000000000000000000000..e28410ed6cb26aab7557025c06b2541a7d27c2c1 --- /dev/null +++ b/test_tipc/compare_results.py @@ -0,0 +1,140 @@ +import numpy as np +import os +import subprocess +import json +import argparse +import glob + + +def init_args(): + parser = argparse.ArgumentParser() + # params for testing assert allclose + parser.add_argument("--atol", type=float, default=1e-3) + parser.add_argument("--rtol", type=float, default=1e-3) + parser.add_argument("--gt_file", type=str, default="") + 
parser.add_argument("--log_file", type=str, default="") + parser.add_argument("--precision", type=str, default="fp32") + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def run_shell_command(cmd): + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + out, err = p.communicate() + + if p.returncode == 0: + return out.decode('utf-8') + else: + return None + + +def parser_results_from_log_by_name(log_path, names_list): + if not os.path.exists(log_path): + raise ValueError("The log file {} does not exists!".format(log_path)) + + if names_list is None or len(names_list) < 1: + return [] + + parser_results = {} + for name in names_list: + cmd = "grep {} {}".format(name, log_path) + outs = run_shell_command(cmd) + outs = outs.split("\n")[0] + result = outs.split("{}".format(name))[-1] + try: + result = json.loads(result) + except: + result = np.array([int(r) for r in result.split()]).reshape(-1, 4) + parser_results[name] = result + return parser_results + + +def load_gt_from_file(gt_file): + if not os.path.exists(gt_file): + raise ValueError("The log file {} does not exists!".format(gt_file)) + with open(gt_file, 'r') as f: + data = f.readlines() + f.close() + parser_gt = {} + for line in data: + image_name, result = line.strip("\n").split("\t") + image_name = image_name.split('/')[-1] + try: + result = json.loads(result) + except: + result = np.array([int(r) for r in result.split()]).reshape(-1, 4) + parser_gt[image_name] = result + return parser_gt + + +def load_gt_from_txts(gt_file): + gt_list = glob.glob(gt_file) + gt_collection = {} + for gt_f in gt_list: + gt_dict = load_gt_from_file(gt_f) + basename = os.path.basename(gt_f) + if "fp32" in basename: + gt_collection["fp32"] = [gt_dict, gt_f] + elif "fp16" in basename: + gt_collection["fp16"] = [gt_dict, gt_f] + elif "int8" in basename: + gt_collection["int8"] = [gt_dict, gt_f] + else: + continue + return gt_collection + + +def collect_predict_from_logs(log_path, key_list): + log_list = glob.glob(log_path) + pred_collection = {} + for log_f in log_list: + pred_dict = parser_results_from_log_by_name(log_f, key_list) + key = os.path.basename(log_f) + pred_collection[key] = pred_dict + + return pred_collection + + +def testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7): + for k in dict_x: + np.testing.assert_allclose( + np.array(dict_x[k]), np.array(dict_y[k]), atol=atol, rtol=rtol) + + +if __name__ == "__main__": + # Usage: + # python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt --log_file=./tests/output/infer_*.log + + args = parse_args() + + gt_collection = load_gt_from_txts(args.gt_file) + key_list = gt_collection["fp32"][0].keys() + + pred_collection = collect_predict_from_logs(args.log_file, key_list) + for filename in pred_collection.keys(): + if "fp32" in filename: + gt_dict, gt_filename = gt_collection["fp32"] + elif "fp16" in filename: + gt_dict, gt_filename = gt_collection["fp16"] + elif "int8" in filename: + gt_dict, gt_filename = gt_collection["int8"] + else: + continue + pred_dict = pred_collection[filename] + + try: + testing_assert_allclose( + gt_dict, pred_dict, atol=args.atol, rtol=args.rtol) + print( + "Assert allclose passed! The results of {} and {} are consistent!". + format(filename, gt_filename)) + except Exception as E: + print(E) + raise ValueError( + "The results of {} and the results of {} are inconsistent!". 
+ format(filename, gt_filename)) diff --git a/test_tipc/configs/yolov5/yolov5_l_300e_coco_train_infer_python.txt b/test_tipc/configs/yolov5/yolov5_l_300e_coco_train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8d67bf8dfa48e8909680d36f7021fa763e66f9e --- /dev/null +++ b/test_tipc/configs/yolov5/yolov5_l_300e_coco_train_infer_python.txt @@ -0,0 +1,62 @@ +===========================train_params=========================== +model_name:yolov5_l_300e_coco +python:python3.7 +gpu_list:0|0,1 +use_gpu:True +auto_cast:null +epoch:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=300 +save_dir:null +TrainReader.batch_size:lite_train_lite_infer=2|lite_train_whole_infer=2|whole_train_whole_infer=2 +pretrain_weights:https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/yolov5/yolov5_l_300e_coco.yml -o +pact_train:tools/train.py -c configs/yolov5/yolov5_l_300e_coco.yml --slim_config _template_pact -o +fpgm_train:tools/train.py -c configs/yolov5/yolov5_l_300e_coco.yml --slim_config _template_fpgm -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c configs/yolov5/yolov5_l_300e_coco.yml -o +null:null +## +===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/yolov5_l_300e_coco.pdparams +norm_export:tools/export_model.py -c configs/yolov5/yolov5_l_300e_coco.yml -o +pact_export:tools/export_model.py -c configs/yolov5/yolov5_l_300e_coco.yml --slim_config _template_pact -o +fpgm_export:tools/export_model.py -c configs/yolov5/yolov5_l_300e_coco.yml --slim_config _template_fpgm -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/yolov5/yolov5_l_300e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o +## +infer_mode:norm|kl_quant +infer_quant:False|True +inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +--trt_max_shape:1600 +===========================train_benchmark_params========================== +batch_size:16 +fp_items:fp32|fp16 +epoch:1 +repeat:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:null +===========================infer_benchmark_params=========================== +numpy_infer_input:3x640x640_2.npy +===========================to_static_train_benchmark_params================= +to_static_train:--to_static \ No newline at end of file diff --git a/test_tipc/configs/yolov5/yolov5_s_300e_coco_train_infer_python.txt b/test_tipc/configs/yolov5/yolov5_s_300e_coco_train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3b5506f8b04a2a824de50096cec3c6307fb93f2 --- /dev/null +++ b/test_tipc/configs/yolov5/yolov5_s_300e_coco_train_infer_python.txt @@ -0,0 +1,60 @@ +===========================train_params=========================== +model_name:yolov5_s_300e_coco +python:python3.7 +gpu_list:0|0,1 +use_gpu:True +auto_cast:null +epoch:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=300 +save_dir:null 
+TrainReader.batch_size:lite_train_lite_infer=2|lite_train_whole_infer=2|whole_train_whole_infer=2 +pretrain_weights:https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/yolov5/yolov5_s_300e_coco.yml -o +pact_train:tools/train.py -c configs/yolov5/yolov5_s_300e_coco.yml --slim_config _template_pact -o +fpgm_train:tools/train.py -c configs/yolov5/yolov5_s_300e_coco.yml --slim_config _template_fpgm -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c configs/yolov5/yolov5_s_300e_coco.yml -o +null:null +## +===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams +norm_export:tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml -o +pact_export:tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml --slim_config _template_pact -o +fpgm_export:tools/export_model.py -c configs/yolov5/yolov5_s_300e_coco.yml --slim_config _template_fpgm -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/yolov5/yolov5_s_300e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o +## +infer_mode:norm|kl_quant +infer_quant:False|True +inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +--trt_max_shape:1600 +===========================train_benchmark_params========================== +batch_size:8 +fp_items:fp32|fp16 +epoch:1 +repeat:12 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:null +===========================infer_benchmark_params=========================== +numpy_infer_input:3x640x640_2.npy \ No newline at end of file diff --git a/test_tipc/configs/yolov7/yolov7_l_300e_coco_train_infer_python.txt b/test_tipc/configs/yolov7/yolov7_l_300e_coco_train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8580eace1a73be1f13cf49b6212c509f609fa26 --- /dev/null +++ b/test_tipc/configs/yolov7/yolov7_l_300e_coco_train_infer_python.txt @@ -0,0 +1,60 @@ +===========================train_params=========================== +model_name:yolov7_l_300e_coco +python:python3.7 +gpu_list:0|0,1 +use_gpu:True +auto_cast:null +epoch:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=300 +save_dir:null +TrainReader.batch_size:lite_train_lite_infer=8|lite_train_whole_infer=8|whole_train_whole_infer=8 +pretrain_weights:https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/yolov7/yolov7_l_300e_coco.yml -o +pact_train:tools/train.py -c configs/yolov7/yolov7_l_300e_coco.yml --slim_config _template_pact -o +fpgm_train:tools/train.py -c configs/yolov7/yolov7_l_300e_coco.yml --slim_config _template_fpgm -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c configs/yolov7/yolov7_l_300e_coco.yml -o +null:null +## 
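The training and evaluation commands in these config files end in a bare `-o` so that the test scripts can append `key=value` overrides at run time; the success logs quoted later in this diff (for example in `test_tipc/docs/test_train_inference_python.md`) show overrides such as `weights=...` and `use_gpu=True` being added after it. A schematic sketch of that convention, not the actual harness code, with an illustrative weights path:

```shell
# Schematic only: the harness appends "key=value" overrides after the trailing -o.
eval_cmd="tools/eval.py -c configs/yolov7/yolov7_l_300e_coco.yml -o"
overrides="weights=./output/yolov7_l_300e_coco/model_final.pdparams use_gpu=True"
echo "python3.7 ${eval_cmd} ${overrides}"
```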
+===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/yolov7_l_300e_coco.pdparams +norm_export:tools/export_model.py -c configs/yolov7/yolov7_l_300e_coco.yml -o +pact_export:tools/export_model.py -c configs/yolov7/yolov7_l_300e_coco.yml --slim_config _template_pact -o +fpgm_export:tools/export_model.py -c configs/yolov7/yolov7_l_300e_coco.yml --slim_config _template_fpgm -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/yolov7/yolov7_l_300e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o +## +infer_mode:norm|kl_quant +infer_quant:False|True +inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +--trt_max_shape:1600 +===========================train_benchmark_params========================== +batch_size:8 +fp_items:fp32|fp16 +epoch:1 +repeat:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:null +===========================infer_benchmark_params=========================== +numpy_infer_input:3x640x640_2.npy \ No newline at end of file diff --git a/test_tipc/configs/yolov7/yolov7_tiny_300e_coco_train_infer_python.txt b/test_tipc/configs/yolov7/yolov7_tiny_300e_coco_train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..04fe5199307b9e9cd56e9f6a543e6fbe06a956d8 --- /dev/null +++ b/test_tipc/configs/yolov7/yolov7_tiny_300e_coco_train_infer_python.txt @@ -0,0 +1,60 @@ +===========================train_params=========================== +model_name:yolov7_tiny_300e_coco +python:python3.7 +gpu_list:0|0,1 +use_gpu:True +auto_cast:null +epoch:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=300 +save_dir:null +TrainReader.batch_size:lite_train_lite_infer=8|lite_train_whole_infer=8|whole_train_whole_infer=8 +pretrain_weights:https://paddledet.bj.bcebos.com/models/yolov7_tiny_300e_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/yolov7/yolov7_tiny_300e_coco.yml -o +pact_train:tools/train.py -c configs/yolov7/yolov7_tiny_300e_coco.yml --slim_config _template_pact -o +fpgm_train:tools/train.py -c configs/yolov7/yolov7_tiny_300e_coco.yml --slim_config _template_fpgm -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c configs/yolov7/yolov7_tiny_300e_coco.yml -o +null:null +## +===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/yolov7_tiny_300e_coco.pdparams +norm_export:tools/export_model.py -c configs/yolov7/yolov7_tiny_300e_coco.yml -o +pact_export:tools/export_model.py -c configs/yolov7/yolov7_tiny_300e_coco.yml --slim_config _template_pact -o +fpgm_export:tools/export_model.py -c configs/yolov7/yolov7_tiny_300e_coco.yml --slim_config _template_fpgm -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/yolov7/yolov7_tiny_300e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o +## +infer_mode:norm|kl_quant +infer_quant:False|True 
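The `infer_mode` and `infer_quant` entries just above are both `|`-separated lists and appear to be consumed in parallel, so `norm` would pair with `False` and `kl_quant` with `True`. A small sketch of that assumed positional pairing (illustration only, not the actual test script):

```shell
# Assumption: infer_mode and infer_quant are parallel lists matched by position.
IFS='|' read -ra modes <<< "norm|kl_quant"
IFS='|' read -ra quants <<< "False|True"
for i in "${!modes[@]}"; do
    echo "export mode: ${modes[$i]} -> quantized inference: ${quants[$i]}"
done
```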
+inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +--trt_max_shape:1600 +===========================train_benchmark_params========================== +batch_size:8 +fp_items:fp32|fp16 +epoch:1 +repeat:12 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:null +===========================infer_benchmark_params=========================== +numpy_infer_input:3x640x640_2.npy \ No newline at end of file diff --git a/test_tipc/docs/benchmark_train.md b/test_tipc/docs/benchmark_train.md new file mode 100644 index 0000000000000000000000000000000000000000..82c3b07abe9deb621c7a36c459785f68db1b984e --- /dev/null +++ b/test_tipc/docs/benchmark_train.md @@ -0,0 +1,50 @@ +# TIPC Linux端Benchmark测试文档 + +该文档为Benchmark测试说明,Benchmark预测功能测试的主程序为`benchmark_train.sh`,用于验证监控模型训练的性能。 + +# 1. 测试流程 +## 1.1 准备数据和环境安装 +运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 + +```shell +# 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode +bash test_tipc/prepare.sh test_tipc/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_train_infer_python.txt benchmark_train +``` + +## 1.2 功能测试 +执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 + +```shell +# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode +bash test_tipc/benchmark_train.sh test_tipc/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_train_infer_python.txt benchmark_train +``` + +`test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: +```shell +# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode +bash test_tipc/benchmark_train.sh test_tipc/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_train_infer_python.txt benchmark_train dynamic_bs2_fp32_DP_N1C1 +``` +dynamic_bs2_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: +`${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` +包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 + + +## 2. 
日志输出 + +运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_train_infer_python.txt` 参数文件的训练日志解析结果是: + +``` +{"model_branch": "tipc_fuse_benchmark", "model_commit": "4cce901d231f7954468045cf96302505bd6be495", "model_name": "faster_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP", "batch_size": 2, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "0.556966", "convergence_key": "loss:", "ips": 4.857, "speed_unit": "images/s", "device_num": "N1C1", "model_run_time": "590", "frame_commit": "6b0c57cf65945e97d87a8fba89c0a2fc18dd8544", "frame_version": "0.0.0"} +``` + +训练日志和日志解析结果保存在benchmark_log目录下,文件组织格式如下: +``` +train_log/ +├── index +│ └── PaddleDetection_faster_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP_N1C1_speed +├── profiling_log +│ └── PaddleDetection_faster_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP_N1C1_profiling +└── train_log + ├── PaddleDetection_faster_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP_N1C1_log + └── PaddleDetection_faster_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP_N1C4_log +``` diff --git a/test_tipc/docs/guide.png b/test_tipc/docs/guide.png new file mode 100644 index 0000000000000000000000000000000000000000..319ac819daff38ed77e84cdff2b122e8bc4a8e5f Binary files /dev/null and b/test_tipc/docs/guide.png differ diff --git a/test_tipc/docs/install.md b/test_tipc/docs/install.md new file mode 100644 index 0000000000000000000000000000000000000000..eaac6908dbb9f5c6f394644da438c0a4f8ca60f3 --- /dev/null +++ b/test_tipc/docs/install.md @@ -0,0 +1,149 @@ +## 1. 环境准备 + +本教程适用于test_tipc目录下基础功能测试的运行环境搭建。 + +推荐环境: +- CUDA 10.1/10.2 +- CUDNN 7.6/cudnn8.1 +- TensorRT 6.1.0.5 / 7.1 / 7.2 + +环境配置可以选择docker镜像安装,或者在本地环境Python搭建环境。推荐使用docker镜像安装,避免不必要的环境配置。 + +## 2. Docker 镜像安装 + +推荐docker镜像安装,按照如下命令创建镜像,当前目录映射到镜像中的`/paddle`目录下 +``` +# 启动docker镜像 +nvidia-docker run --name paddle -it -v $PWD:/paddle paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7-gcc82-dev /bin/bash +cd /paddle +``` + +``` +# 编译安装Paddle +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build && cd build +cmake .. 
\ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_GPU=ON \ + -DWITH_DISTRIBUTE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDA_ARCH_NAME=Auto \ + -DPY_VERSION=3.7 \ + -DON_INFER=ON \ + -DWITH_TENSORRT=ON \ + -DTENSORRT_ROOT=/usr/local/TensorRT6-cuda10.1-cudnn7 +make -j 20 +pip3.7 install python/dist/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl +cd ../../ +``` +or +``` +# 下载安装Paddle-2.2 +wget https://paddle-inference-lib.bj.bcebos.com/2.2.0/python/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddlepaddle_gpu-2.2.0.post101-cp37-cp37m-linux_x86_64.whl +pip3.7 install paddlepaddle_gpu-2.2.0.post101-cp37-cp37m-linux_x86_64.whl +# 下载C++预测库用于C++ inference +wget https://paddle-inference-lib.bj.bcebos.com/2.2.0/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz +tar -xvf paddle_inference.tgz +export PADDLE_DIR=/paddle/paddle_inference +``` + +## 3 Python 环境构建 + +如果您已经通过docker方式构建环境,跳过该部分内容。非docker环境下,环境配置比较灵活,推荐环境组合配置: +- CUDA10.1 + CUDNN7.6 + TensorRT 6 +- CUDA10.2 + CUDNN8.1 + TensorRT 7 +- CUDA11.1 + CUDNN8.1 + TensorRT 7 + +下面以 CUDA10.2 + CUDNN8.1 + TensorRT 7 配置为例,介绍环境配置的流程。 + +### 3.1 安装CUDNN + +如果当前环境满足CUDNN版本的要求,可以跳过此步骤。 + +以CUDNN8.1 安装安装为例,安装步骤如下,首先下载CUDNN,从[Nvidia官网](https://developer.nvidia.com/rdp/cudnn-archive)下载CUDNN8.1版本,下载符合当前系统版本的三个deb文件,分别是: +- cuDNN Runtime Library ,如:libcudnn8_8.1.0.77-1+cuda10.2_amd64.deb +- cuDNN Developer Library ,如:libcudnn8-dev_8.1.0.77-1+cuda10.2_amd64.deb +- cuDNN Code Samples,如:libcudnn8-samples_8.1.0.77-1+cuda10.2_amd64.deb + +deb安装可以参考[官方文档](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-deb),安装方式如下 +``` +# x.x.x表示下载的版本号 +# $HOME为工作目录 +sudo dpkg -i libcudnn8_x.x.x-1+cudax.x_arm64.deb +sudo dpkg -i libcudnn8-dev_8.x.x.x-1+cudax.x_arm64.deb +sudo dpkg -i libcudnn8-samples_8.x.x.x-1+cudax.x_arm64.deb + +# 验证是否正确安装 +cp -r /usr/src/cudnn_samples_v8/ $HOME +cd $HOME/cudnn_samples_v8/mnistCUDNN + +# 编译 +make clean && make +./mnistCUDNN +``` +如果运行mnistCUDNN完后提示运行成功,则表示安装成功。如果运行后出现freeimage相关的报错,需要按照提示安装freeimage库: +``` +sudo apt-get install libfreeimage-dev +sudo apt-get install libfreeimage +``` + +### 3.2 安装TensorRT + +首先,从[Nvidia官网TensorRT板块](https://developer.nvidia.com/tensorrt-getting-started)下载TensorRT,这里选择7.1.3.4版本的TensorRT,注意选择适合自己系统版本和CUDA版本的TensorRT,另外建议下载TAR package的安装包。 + +以Ubuntu16.04+CUDA10.2为例,下载并解压后可以参考[官方文档](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-713/install-guide/index.html#installing-tar)的安装步骤,按照如下步骤安装: +``` +# 以下安装命令中 '${version}' 为下载的TensorRT版本,如7.1.3.4 +# 设置环境变量, 为解压后的TensorRT的lib目录 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: + +# 安装TensorRT +cd TensorRT-${version}/python +pip3.7 install tensorrt-*-cp3x-none-linux_x86_64.whl + +# 安装graphsurgeon +cd TensorRT-${version}/graphsurgeon +``` + + +### 3.3 安装PaddlePaddle + +下载支持TensorRT版本的Paddle安装包,注意安装包的TensorRT版本需要与本地TensorRT一致,下载[链接](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html#python) +选择下载 linux-cuda10.2-trt7-gcc8.2 Python3.7版本的Paddle: +``` +# 从下载链接中可以看到是paddle2.1.1-cuda10.2-cudnn8.1版本 +wget https://paddle-wheel.bj.bcebos.com/with-trt/2.1.1-gpu-cuda10.2-cudnn8.1-mkl-gcc8.2/paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl +pip3.7 install -U paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl +``` + +## 4. 
安装PaddleDetection依赖 +``` +# 安装AutoLog +git clone https://github.com/LDOUBLEV/AutoLog +cd AutoLog +pip3.7 install -r requirements.txt +python3.7 setup.py bdist_wheel +pip3.7 install ./dist/auto_log-1.0.0-py3-none-any.whl + +# 下载PaddleDetection代码 +cd ../ +git clone https://github.com/PaddlePaddle/PaddleDetection +``` + +安装PaddleDetection依赖: +``` +cd PaddleDetection +pip3.7 install -r ./requirements.txt +``` + +## FAQ : +Q. You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found. Ignore this if TensorRT is not needed. + +A. 问题一般是当前安装paddle版本带TRT,但是本地环境找不到TensorRT的预测库,需要下载TensorRT库,解压后设置环境变量LD_LIBRARY_PATH; +如: +``` +export LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/paddle/package/TensorRT-6.0.1.5/lib +``` +或者问题是下载的TensorRT版本和当前paddle中编译的TRT版本不匹配,需要下载版本相符的TensorRT重新安装。 diff --git a/test_tipc/docs/more_models.md b/test_tipc/docs/more_models.md new file mode 100644 index 0000000000000000000000000000000000000000..60cada7027b457e78a37e44d79a15fae6e65595c --- /dev/null +++ b/test_tipc/docs/more_models.md @@ -0,0 +1,73 @@ +## 汇总信息 + +已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。 + +**字段说明:** +- 基础训练预测:包括模型训练、Paddle Inference Python预测。 +- 更多训练方式:包括多机多卡、混合精度。 +- 模型压缩:包括裁剪、离线/在线量化、蒸馏。 +- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署等。 + +| 算法论文 | 模型名称 | 模型类型 | 基础
训练预测 | 更多
训练方式 | 模型压缩 | 其他预测部署 | +| :--- |:----------------------------------------------------------------------------------------------------------------------| :----: | :--------: | :---- | :---- | :---- | +| [YOLOv3](https://arxiv.org/abs/1804.02767) | [yolov3_darknet53_270e_coco](../../configs/yolov3/yolov3_darknet53_270e_coco.yml) | 目标检测 | 支持 | 混合精度 | FPGM裁剪
PACT量化
离线量化 | Paddle Inference: C++ | +| YOLOv3 | [yolov3_mobilenet_v1_270e_coco](../../configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| YOLOv3 | [yolov3_mobilenet_v3_large_270e_coco](../../configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| YOLOv3 | [yolov3_r34_270e_coco](../../configs/yolov3/yolov3_r34_270e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| YOLOv3 | [yolov3_r50vd_dcn_270e_coco](../../configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [PPYOLO](https://arxiv.org/abs/2007.12099) | [ppyolo_mbv3_large_coco](../../configs/ppyolo/ppyolo_mbv3_large_coco.yml) | 目标检测 | 支持 | 混合精度 | FPGM裁剪
PACT量化
离线量化 | Paddle Inference: C++ | +| PPYOLO | [ppyolo_r50vd_dcn_1x_coco](../../configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | FPGM裁剪
PACT量化
离线量化 | Paddle Inference: C++ | +| PPYOLO | [ppyolo_mbv3_small_coco](../../configs/ppyolo/ppyolo_mbv3_small_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PPYOLO | [ppyolo_r18vd_coco](../../configs/ppyolo/ppyolo_r18vd_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PPYOLO-tiny | [ppyolo_tiny_650e_coco](../../configs/ppyolo/ppyolo_tiny_650e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [PPYOLOv2](https://arxiv.org/abs/2104.10419) | [ppyolov2_r50vd_dcn_365e_coco](../../configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml) | 目标检测 | 支持 | 多机多卡
混合精度 | | Paddle Inference: C++ | +| PPYOLOv2 | [ppyolov2_r50vd_dcn_365e_coco](../../configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PPYOLOv2 | [ppyolov2_r101vd_dcn_365e_coco](../../configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [PP-PicoDet](https://arxiv.org/abs/2111.00902) | picodet_s_320_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_m_416_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_l_640_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_lcnet_1_5x_416_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_mobilenetv3_large_1x_416_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_r18_640_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| PP-PicoDet | picodet_shufflenetv2_1x_416_coco | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [SSD](https://arxiv.org/abs/1512.02325) | [ssdlite_mobilenet_v1_300_coco](../../configs/ssd/ssdlite_mobilenet_v1_300_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [Faster R-CNN](https://arxiv.org/abs/1506.01497) | [faster_rcnn_r50_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r34_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r34_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r34_vd_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r34_vd_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r50_1x_coco](../../configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r50_vd_1x_coco](../../configs/faster_rcnn/faster_rcnn_r50_vd_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r50_vd_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r50_vd_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r101_1x_coco](../../configs/faster_rcnn/faster_rcnn_r101_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r101_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_r101_vd_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_r101_vd_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_x101_vd_64x4d_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Faster R-CNN | [faster_rcnn_swin_tiny_fpn_1x_coco](../../configs/faster_rcnn/faster_rcnn_swin_tiny_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [Cascade Faster R-CNN](https://arxiv.org/abs/1712.00726) | [cascade_rcnn_r50_fpn_1x_coco](../../configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Cascade Faster R-CNN | [cascade_rcnn_r50_vd_fpn_ssld_1x_coco](../../configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [FCOS](https://arxiv.org/abs/1904.01355) | [fcos_r50_fpn_1x_coco](../../configs/fcos/fcos_r50_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| FCOS | 
[fcos_dcn_r50_fpn_1x_coco](../../configs/fcos/fcos_dcn_r50_fpn_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [TTFNet](https://arxiv.org/abs/1909.00700) | [ttfnet_darknet53_1x_coco](../../configs/ttfnet/ttfnet_darknet53_1x_coco.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [S2ANet](https://arxiv.org/abs/2008.09397) | [s2anet_conv_2x_dota](../../configs/dota/s2anet_conv_2x_dota.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| S2ANet | [s2anet_1x_spine](../../configs/dota/s2anet_1x_spine.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| S2ANet | [s2anet_alignconv_2x_dota](../../configs/dota/s2anet_alignconv_2x_dota.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [BlazeFace](https://arxiv.org/abs/1907.05047) | [blazeface_1000e](../../configs/face_detection/blazeface_1000e.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| BlazeFace | [blazeface_fpn_ssh_1000e](../../configs/face_detection/blazeface_fpn_ssh_1000e.yml) | 目标检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [Mask R-CNN](https://arxiv.org/abs/1703.06870) | [mask_rcnn_r50_fpn_1x_coco](../../configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Mask R-CNN | [mask_rcnn_r50_1x_coco](../../configs/mask_rcnn/mask_rcnn_r50_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Mask R-CNN | [mask_rcnn_r50_vd_fpn_1x_coco](../../configs/mask_rcnn/mask_rcnn_r50_vd_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Mask R-CNN | [mask_rcnn_r101_fpn_1x_coco](../../configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Mask R-CNN | [mask_rcnn_r101_vd_fpn_1x_coco](../../configs/mask_rcnn/mask_rcnn_r101_vd_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Mask R-CNN | [mask_rcnn_x101_vd_64x4d_fpn_1x_coco](../../configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [Cascade Mask R-CNN](https://arxiv.org/abs/1906.09756) | [cascade_mask_rcnn_r50_fpn_1x_coco](../../configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| Cascade Mask R-CNN | [cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco](../../configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [SOLOv2](https://arxiv.org/abs/2003.10152) | [solov2_r50_fpn_1x_coco](../../configs/solov2/solov2_r50_fpn_1x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| SOLOv2 | [solov2_r50_enhance_coco](../../configs/solov2/solov2_r50_enhance_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| SOLOv2 | [solov2_r101_vd_fpn_3x_coco](../../configs/solov2/solov2_r101_vd_fpn_3x_coco.yml) | 实例分割 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [PP-Tinypose] | [tinypose_128x96](../../configs/keypoint/tiny_pose/tinypose_128x96.yml) | 关键点检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [HRNet](https://arxiv.org/abs/1902.09212) | [hrnet_w32_256x192](../../configs/keypoint/hrnet/hrnet_w32_256x192.yml) | 关键点检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| HRNet | [dark_hrnet_w32_256x192](../../configs/keypoint/hrnet/dark_hrnet_w32_256x192.yml) | 关键点检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| HRNet | [dark_hrnet_w48_256x192](../../configs/keypoint/hrnet/dark_hrnet_w48_256x192.yml) | 关键点检测 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [HigherHRNet](https://arxiv.org/abs/1908.10357) | [higherhrnet_hrnet_w32_512](../../configs/keypoint/higherhrnet/higherhrnet_hrnet_w32_512.yml) | 关键点检测 | 支持 | 
混合精度 | | Paddle Inference: C++ | +| [FairMot](https://arxiv.org/abs/2004.01888) | [fairmot_dla34_30e_576x320](../../configs/mot/fairmot/fairmot_dla34_30e_576x320.yml) | 目标跟踪 | 支持 | 混合精度 | | Paddle Inference: C++ | +| FairMot | [fairmot_hrnetv2_w18_dlafpn_30e_576x320](../../configs/mot/fairmot/fairmot_hrnetv2_w18_dlafpn_30e_576x320.yml) | 目标跟踪 | 支持 | 混合精度 | | Paddle Inference: C++ | +| [JDE](https://arxiv.org/abs/1909.12605) | [jde_darknet53_30e_576x320](../../configs/mot/jde/jde_darknet53_30e_576x320.yml) | 目标跟踪 | 支持 | 混合精度 | | Paddle Inference: C++ | diff --git a/test_tipc/docs/test.png b/test_tipc/docs/test.png new file mode 100644 index 0000000000000000000000000000000000000000..f99f23d7050eb61879cf317c0d7728ef14531b08 Binary files /dev/null and b/test_tipc/docs/test.png differ diff --git a/test_tipc/docs/test_inference_cpp.md b/test_tipc/docs/test_inference_cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..01847af3b694ecc4dfd99d3c686cb31542448adc --- /dev/null +++ b/test_tipc/docs/test_inference_cpp.md @@ -0,0 +1,99 @@ +# C++预测功能测试 + +C++预测功能测试的主程序为`test_inference_cpp.sh`,可以测试基于C++预测库的模型推理功能。 + +## 1. 测试结论汇总 + +基于训练是否使用量化,进行本测试的模型可以分为`正常模型`和`量化模型`,这两类模型对应的C++预测功能汇总如下: + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- | ---- | :----: | :----: | :----: | +| 正常模型 | GPU | 1/8 | fp32/fp16 | - | - | +| 正常模型 | CPU | 1/8 | - | fp32 | 支持 | +| 量化模型 | GPU | 1/8 | int8 | - | - | +| 量化模型 | CPU | 1/8 | - | int8 | 支持 | + +## 2. 测试流程 +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 +``` +# 请设置paddle_inference环境变量,如: +export PADDLE_INFER_DIR=/path/to/paddle_inference +# 若不设置paddle_inference环境变量,也可通过指定参数的方式使脚本自动下载paddle_inference.tgz,如: +bash test_tipc/test_inference_cpp.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt 'https://paddle-inference-lib.bj.bcebos.com/2.3.0/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz' + +# 若未使用docker镜像: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7-gcc82-dev +# 请设置TensorRT环境变量,如: +export TENSORRT_ROOT=/usr/local/TensorRT6-cuda10.1-cudnn7 +``` + +### 2.1 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_inference_cpp.sh`进行测试,最终在```test_tipc/output```目录下生成`cpp_infer_*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt "cpp_infer" +# 用法1: +bash test_tipc/test_inference_cpp.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt +# 用法2: 指定下载paddle_inference.tgz链接,第二个传入参数为下载链接 +bash test_tipc/test_inference_cpp.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt 'https://paddle-inference-lib.bj.bcebos.com/2.3.0/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz' +# 用法3: 同时指定下载paddle_inference.tgz链接和指定GPU卡预测,第三个传入参数为GPU卡号 +bash test_tipc/test_inference_cpp.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt 'https://paddle-inference-lib.bj.bcebos.com/2.3.0/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz' '1' +``` + +运行预测指令后,在`test_tipc/output`文件夹下自动会保存运行日志,包括以下文件: + +```shell +test_tipc/output/ +|- results_cpp.log # 运行指令状态的日志 +|- cpp_infer_cpu_usemkldnn_False_threads_1_precision_fluid_batchsize_1.log # CPU上不开启Mkldnn,线程数设置为1,测试batch_size=1条件下的预测运行日志 +|- 
cpp_infer_cpu_usemkldnn_False_threads_6_precision_fluid_batchsize_1.log # CPU上不开启Mkldnn,线程数设置为6,测试batch_size=1条件下的预测运行日志 +|- cpp_infer_gpu_precision_fluid_batchsize_1.log # GPU上不开启TensorRT,测试batch_size=1的fp32精度预测日志 +|- cpp_infer_gpu_precision_trt_fp16_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的fp16精度预测日志 +...... +``` +其中results_cpp.log中包含了每条指令的运行状态,如果运行成功会输出: + +``` +Run successfully with command - python3.7 tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams filename=yolov3_darknet53_270e_coco --output_dir=./output_inference ! +Run successfully with command - ./deploy/cpp/build/main --device=gpu --run_mode=fluid --model_dir=./output_inference/yolov3_darknet53_270e_coco --batch_size=8 --image_dir=./dataset/coco/test2017/ --run_benchmark=True > ./test_tipc/output/cpp_infer_gpu_precision_fluid_batchsize_8.log 2>&1 ! +...... +``` +如果运行失败,会输出: +``` +Run failed with command - python3.7 tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams filename=yolov3_darknet53_270e_coco --output_dir=./output_inference ! +Run failed with command - ./deploy/cpp/build/main --device=gpu --run_mode=fluid --model_dir=./output_inference/yolov3_darknet53_270e_coco --batch_size=8 --image_dir=./dataset/coco/test2017/ --run_benchmark=True > ./test_tipc/output/cpp_infer_gpu_precision_fluid_batchsize_8.log 2>&1 ! +...... +``` +可以很方便的根据results_cpp.log中的内容判定哪一个指令运行错误。 + + +### 2.2 精度测试 + +使用compare_results.py脚本比较模型预测的结果是否符合预期,主要步骤包括: +- 提取日志中的预测坐标; +- 从本地文件中提取保存好的坐标结果; +- 比较上述两个结果是否符合精度预期,误差大于设置阈值时会报错。 + +#### 使用方式 +运行命令: +```shell +python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/cpp_*.txt --log_file=./test_tipc/output/cpp_*.log --atol=1e-3 --rtol=1e-3 +``` + +参数介绍: +- gt_file: 指向事先保存好的预测结果路径,支持*.txt 结尾,会自动索引*.txt格式的文件,文件默认保存在test_tipc/result/ 文件夹下 +- log_file: 指向运行test_tipc/test_inference_cpp.sh 脚本的infer模式保存的预测日志,预测日志中打印的有预测结果,比如:文本框,预测文本,类别等等,同样支持cpp_infer_*.log格式传入 +- atol: 设置的绝对误差 +- rtol: 设置的相对误差 + +#### 运行结果 + +正常运行效果如下图: + + +出现不一致结果时的运行输出: + + + +## 3. 更多教程 + +本文档为功能测试用,更详细的c++预测使用教程请参考:[C++预测](../../deploy/cpp/README.md) diff --git a/test_tipc/docs/test_paddle2onnx.md b/test_tipc/docs/test_paddle2onnx.md new file mode 100644 index 0000000000000000000000000000000000000000..373bdb2cb5fb20e93806599b75672e3210e13297 --- /dev/null +++ b/test_tipc/docs/test_paddle2onnx.md @@ -0,0 +1,47 @@ +# Paddle2onnx预测功能测试 + +PaddleServing预测功能测试的主程序为`test_paddle2onnx.sh`,可以测试Paddle2ONNX的模型转化功能,并验证正确性。 + +## 1. 测试结论汇总 + +基于训练是否使用量化,进行本测试的模型可以分为`正常模型`和`量化模型`,这两类模型对应的Paddle2ONNX预测功能汇总如下: + +| 模型类型 |device | +| ---- | ---- | +| 正常模型 | GPU | +| 正常模型 | CPU | +| 量化模型 | GPU | +| 量化模型 | CPU | + +## 2. 
测试流程 +### 2.1 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_paddle2onnx.sh`进行测试,最终在```test_tipc/output```目录下生成`paddle2onnx_infer_*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt "paddle2onnx_infer" + +# 用法: +bash test_tipc/test_paddle2onnx.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt +``` + +#### 运行结果 + +各测试的运行情况会打印在 `test_tipc/output/results_paddle2onnx.log` 中: +运行成功时会输出: + +``` +Run successfully with command - yolov3_darknet53_270e_coco - paddle2onnx --model_dir=./output_inference/yolov3_darknet53_270e_coco --model_filename=model.pdmodel --params_filename=model.pdiparams --save_file=./output_inference/yolov3_darknet53_270e_coco/model.onnx --opset_version=11 --enable_onnx_checker=True ! +Run successfully with command - yolov3_darknet53_270e_coco - python3.7 ./deploy/third_engine/onnx/infer.py --infer_cfg=./output_inference/yolov3_darknet53_270e_coco/infer_cfg.yml --onnx_file=./output_inference/yolov3_darknet53_270e_coco/model.onnx --image_file=./demo/000000014439.jpg > ./test_tipc/output/paddle2onnx_infer_cpu.log 2>&1 ! +``` + +运行失败时会输出: + +``` +Run failed with command - yolov3_darknet53_270e_coco - paddle2onnx --model_dir=./output_inference/yolov3_darknet53_270e_coco --model_filename=model.pdmodel --params_filename=model.pdiparams --save_file=./output_inference/yolov3_darknet53_270e_coco/model.onnx --opset_version=11 --enable_onnx_checker=True ! +... +``` + + +## 3. 更多教程 + +本文档为功能测试用,更详细的Paddle2onnx预测使用教程请参考:[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) diff --git a/test_tipc/docs/test_ptq_inference_python.md b/test_tipc/docs/test_ptq_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..7b1c04c5b01b5d67ba285e88b1b8c9e3361c2b82 --- /dev/null +++ b/test_tipc/docs/test_ptq_inference_python.md @@ -0,0 +1,44 @@ +# Linux GPU/CPU 离线量化功能测试 + +Linux GPU/CPU 离线量化功能测试的主程序为`test_ptq_inference_python.sh`,可以测试基于Python的离线量化功能。 + +## 1. 测试结论汇总 + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- |-----------| :----: | :----: | :----: | +| 量化模型 | GPU | 1/2 | int8 | - | - | +| 量化模型 | CPU | 1/2 | - | int8 | 支持 | + +## 2. 测试流程 +### 2.1 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_ptq_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_ptq_infer_python.txt "whole_infer" + +# 用法: +bash test_tipc/test_ptq_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_ptq_infer_python.txt +``` + +#### 运行结果 + +各测试的运行情况会打印在 `test_tipc/output/results_ptq_python.log` 中: +运行成功时会输出: + +``` +Run successfully with command - yolov3_darknet53_270e_coco - python3.7 tools/post_quant.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams filename=yolov3_darknet53_270e_coco --output_dir=./output_inference ! +Run successfully with command - yolov3_darknet53_270e_coco - python3.7 ./deploy/python/infer.py --device=gpu --run_mode=paddle --model_dir=./output_inference/yolov3_darknet53_270e_coco --batch_size=2 --image_dir=./dataset/coco/test2017/ --run_benchmark=False > ./test_tipc/output/yolov3_darknet53_270e_coco/whole_infer/python_infer_gpu_mode_paddle_batchsize_2.log 2>&1 ! +... 
+``` + +运行失败时会输出: + +``` +Run failed with command - yolov3_darknet53_270e_coco - python3.7 tools/post_quant.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --slim_config configs/slim/post_quant/yolov3_darknet53_ptq.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams filename=yolov3_darknet53_270e_coco --output_dir=./output_inference! +... +``` + + +## 3. 更多教程 + +本文档为功能测试用,更详细的离线量化功能使用教程请参考:[Paddle 离线量化官网教程](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/static/quant/quantization_api.rst#quant_post_static) diff --git a/test_tipc/docs/test_serving.md b/test_tipc/docs/test_serving.md new file mode 100644 index 0000000000000000000000000000000000000000..593c7beabd7c9a535128d3dc201c2341e9880051 --- /dev/null +++ b/test_tipc/docs/test_serving.md @@ -0,0 +1,91 @@ +# PaddleServing预测功能测试 + +PaddleServing预测功能测试的主程序为`test_serving_infer_python.sh`和`test_serving_infer_cpp.sh`,可以测试基于PaddleServing的部署功能。 + +## 1. 测试结论汇总 + +基于训练是否使用量化,进行本测试的模型可以分为`正常模型`和`量化模型`,这两类模型对应的Serving预测功能汇总如下: + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- |-----------| :----: | :----: | :----: | +| 正常模型 | GPU | 1/2 | fp32/fp16 | - | - | +| 正常模型 | CPU | 1/2 | - | fp32 | 支持 | +| 量化模型 | GPU | 1/2 | int8 | - | - | +| 量化模型 | CPU | 1/2 | - | int8 | 支持 | + +## 2. 测试流程 +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +### 2.1 功能测试 +**python serving** +先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_python.sh`进行测试,最终在```test_tipc/output```目录下生成`serving_infer_python*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" + +# 用法1: +bash test_tipc/test_serving_infer_python.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt +# 用法2: 指定GPU卡预测,第二个传入参数为GPU卡号 +bash test_tipc/test_serving_infer_python.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "1" +``` +**cpp serving** +先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_cpp.sh`进行测试,最终在```test_tipc/output```目录下生成`serving_infer_cpp*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_serving_cpp_linux_gpu_cpu.txt "serving_infer" + +# 用法: +bash test_tipc/test_serving_infer_cpp.sh test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_model_linux_gpu_normal_normal_serving_cpp_linux_gpu_cpu.txt +``` + +#### 运行结果 + +各测试的运行情况会打印在 `test_tipc/output/results_serving.log` 中: +运行成功时会输出: + +``` +Run successfully with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! +Run successfully with command - xxxxx +... +``` + +运行失败时会输出: + +``` +Run failed with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! +Run failed with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_6_batchsize_1.log 2>&1 ! +Run failed with command - xxxxx +... +``` + +详细的预测结果会存在 test_tipc/output/ 文件夹下,例如`server_infer_gpu_usetrt_True_precision_fp32_batchsize_1.log`中会返回检测框的坐标: + +``` +{'err_no': 0, 'err_msg': '', 'key': ['dt_boxes'], 'value': ['[[[ 78. 642.]\n [409. 640.]\n [409. 657.]\n +[ 78. 659.]]\n\n [[ 75. 614.]\n [211. 
614.]\n [211. 635.]\n [ 75. 635.]]\n\n +[[103. 554.]\n [135. 554.]\n [135. 575.]\n [103. 575.]]\n\n [[ 75. 531.]\n +[347. 531.]\n [347. 549.]\n [ 75. 549.] ]\n\n [[ 76. 503.]\n [309. 498.]\n +[309. 521.]\n [ 76. 526.]]\n\n [[163. 462.]\n [317. 462.]\n [317. 493.]\n +[163. 493.]]\n\n [[324. 431.]\n [414. 431.]\n [414. 452.]\n [324. 452.]]\n\n +[[ 76. 412.]\n [208. 408.]\n [209. 424.]\n [ 76. 428.]]\n\n [[307. 409.]\n +[428. 409.]\n [428. 426.]\n [307 . 426.]]\n\n [[ 74. 385.]\n [217. 382.]\n +[217. 400.]\n [ 74. 403.]]\n\n [[308. 381.]\n [427. 380.]\n [427. 400.]\n +[308. 401.]]\n\n [[ 74. 363.]\n [195. 362.]\n [195. 378.]\n [ 74. 379.]]\n\n +[[303. 359.]\n [423. 357.]\n [423. 375.]\n [303. 377.]]\n\n [[ 70. 336.]\n +[239. 334.]\n [239. 354.]\ n [ 70. 356.]]\n\n [[ 70. 312.]\n [204. 310.]\n +[204. 327.]\n [ 70. 330.]]\n\n [[303. 308.]\n [419. 306.]\n [419. 326.]\n +[303. 328.]]\n\n [[113. 2 72.]\n [246. 270.]\n [247. 299.]\n [113. 301.]]\n\n + [[361. 269.]\n [384. 269.]\n [384. 296.]\n [361. 296.]]\n\n [[ 70. 250.]\n + [243. 246.]\n [243. 265.]\n [ 70. 269.]]\n\n [[ 65. 221.]\n [187. 220.]\n +[187. 240.]\n [ 65. 241.]]\n\n [[337. 216.]\n [382. 216.]\n [382. 240.]\n +[337. 240.]]\n\n [ [ 65. 196.]\n [247. 193.]\n [247. 213.]\n [ 65. 216.]]\n\n +[[296. 197.]\n [423. 191.]\n [424. 209.]\n [296. 215.]]\n\n [[ 65. 167.]\n [244. 167.]\n +[244. 186.]\n [ 65. 186.]]\n\n [[ 67. 139.]\n [290. 139.]\n [290. 159.]\n [ 67. 159.]]\n\n +[[ 68. 113.]\n [410. 113.]\n [410. 128.]\n [ 68. 129.] ]\n\n [[277. 87.]\n [416. 87.]\n +[416. 108.]\n [277. 108.]]\n\n [[ 79. 28.]\n [132. 28.]\n [132. 62.]\n [ 79. 62.]]\n\n +[[163. 17.]\n [410. 14.]\n [410. 50.]\n [163. 53.]]]']} +``` + +## 3. 更多教程 + +本文档为功能测试用,更详细的Serving预测使用教程请参考:[PaddleDetection 服务化部署](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/deploy/serving) diff --git a/test_tipc/docs/test_train_fleet_inference_python.md b/test_tipc/docs/test_train_fleet_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..98b0bbd3cb3cf7eba96162f653155185f7b32bc1 --- /dev/null +++ b/test_tipc/docs/test_train_fleet_inference_python.md @@ -0,0 +1,76 @@ +# Linux GPU/CPU 多机多卡训练推理测试 + +Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_fleet_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能。 + +## 1. 测试结论汇总 + +- 训练相关: + +| 算法名称 | 模型名称 | 多机多卡 | +|:--------:| :----: | :----: | +| PP-YOLOE | ppyoloe_crn_s_300e_coco | 分布式训练 | + + +- 推理相关: + +| 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize | +|:--------:|:------------------------:| :----: | :----: |:---------:| +| PP-YOLOE | ppyoloe_crn_s_300e_coco | 支持 | 支持 | 1, 2 | + + +## 2. 
测试流程 + +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +### 2.1 功能测试 + +#### 2.1.1 修改配置文件 + +首先,修改配置文件中的`ip`设置: 假设两台机器的`ip`地址分别为`192.168.0.1`和`192.168.0.2`,则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`; `ip`地址查看命令为`ifconfig`。 + + +#### 2.1.2 准备数据 + +运行`prepare.sh`准备数据和模型,以配置文件`test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt`为例,数据准备命令如下所示。 + +```shell +bash test_tipc/prepare.sh test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt lite_train_lite_infer +``` + +**注意:** 由于是多机训练,这里需要在所有的节点上均运行启动上述命令,准备数据。 + +#### 2.1.3 修改起始端口并开始测试 + +在多机的节点上使用下面的命令设置分布式的起始端口(否则后面运行的时候会由于无法找到运行端口而hang住),一般建议设置在`10000~20000`之间。 + +```shell +export FLAGS_START_PORT=17000 +``` + +以配置文件`test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt`为例,测试方法如下所示。 + +```shell +bash test_tipc/test_train_inference_python.sh test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt lite_train_lite_infer +``` + +**注意:** 由于是多机训练,这里需要在所有的节点上均运行启动上述命令进行测试。 + + +#### 2.1.4 输出结果 + +输出结果如下,表示命令运行成功。 + +```bash + Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 + tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml -o log_iter=1 use_gpu=True save_dir=./test_tipc/outpu +t/ppyoloe_crn_s_300e_coco/norm_train_gpus_0,1_autocast_null_nodes_2 epoch=1 pretrain_weights=https://paddledet.bj.bc +ebos.com/models/ppyoloe_crn_s_300e_coco.pdparams TrainReader.batch_size=2 filename=ppyoloe_crn_s_300e_coco ! + + ...... + Run successfully with command - python3.7 ./deploy/python/infer.py --device=cpu --enable_mkldnn=False --cpu_threads +=4 --model_dir=./test_tipc/output/ppyoloe_crn_s_300e_coco/norm_train_gpus_0,1_autocast_null_nodes_2/ppyoloe_crn_s_30 +0e_coco --batch_size=2 --image_dir=./dataset/coco/test2017/ --run_benchmark=False --trt_max_shape=1600 > ./test_tipc +/output/ppyoloe_crn_s_300e_coco/python_infer_cpu_usemkldnn_False_threads_4_precision_fluid_batchsize_2.log 2>&1 ! +``` + +**注意:** 由于分布式训练时,仅在`trainer_id=0`所在的节点中保存模型,因此其他的节点中在运行模型导出与推理时会报错,为正常现象。 diff --git a/test_tipc/docs/test_train_inference_python.md b/test_tipc/docs/test_train_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..10459b84346352e1f13c846919574f908b6be2da --- /dev/null +++ b/test_tipc/docs/test_train_inference_python.md @@ -0,0 +1,152 @@ +# Linux端基础训练预测功能测试 + +Linux端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 + +- Mac端基础训练预测功能测试参考[链接](./) +- Windows端基础训练预测功能测试参考[链接](./) + +## 1. 测试结论汇总 + +- 训练相关: + +| 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩(单机多卡) | +| :---- | :---- | :---- | :---- | :---- | :---- | +| PPYOLO | ppyolo_mbv3_large_coco | 正常训练
混合精度 | 正常训练
混合精度 | 正常训练
混合精度 | 正常训练:FPGM裁剪、PACT量化
离线量化(无需训练) | +| PPYOLO | ppyolo_r50vd_dcn_1x_coco | 正常训练
混合精度 | 正常训练
混合精度 | 正常训练
混合精度 | 正常训练:FPGM裁剪、PACT量化
离线量化(无需训练) | + + +- 预测相关:基于训练是否使用量化,可以将训练产出的模型可以分为`正常模型`和`量化模型`,这两类模型对应的预测功能汇总如下, + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- | ---- | :----: | :----: | :----: | +| 正常模型 | GPU | 1/8 | fp32/fp16 | - | - | +| 正常模型 | CPU | 1/8 | - | fp32/fp16 | 支持 | +| 量化模型 | GPU | 1/8 | int8 | - | - | +| 量化模型 | CPU | 1/8 | - | int8 | 支持 | + + +## 2. 测试流程 + +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +### 2.1 安装依赖 +- 安装PaddlePaddle >= 2.2 +- 安装PaddleDetection依赖 + ``` + pip install -r ./requirements.txt + pip install -r ./test_tipc/requirements.txt + ``` +- 安装autolog(规范化日志输出工具) + ``` + git clone https://github.com/LDOUBLEV/AutoLog + cd AutoLog + pip install -r ./requirements.txt + python setup.py bdist_wheel + pip install ./dist/auto_log-1.0.0-py3-none-any.whl + ``` +- 安装PaddleSlim (可选) + ``` + # 如果要测试量化、裁剪等功能,需要安装PaddleSlim + pip install paddleslim + ``` + + +### 2.2 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件, +以yolov3_darknet53_270e_coco为例。 + + +`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: + +- 模式1:lite_train_lite_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_lite_infer' +``` + +- 模式2:lite_train_whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'lite_train_whole_infer' +``` + +- 模式3:whole_infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'whole_infer' +# 用法1: +bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'whole_infer' +# 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'whole_infer' '1' +``` + +- 模式4:whole_train_whole_infer,CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'whole_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'whole_train_whole_infer' +``` + +- 模式5:klquant_whole_infer,测试离线量化; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'klquant_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt 'klquant_whole_infer' +``` + +运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式下,会运行训练+推理的链条,因此,在`test_tipc/output`文件夹有以下文件: +``` +test_tipc/output/ +|- results_python.log # 运行指令状态的日志 +|- norm_train_gpus_0_autocast_null/ # GPU 0号卡上正常训练的训练日志和模型保存文件夹 +|- pact_train_gpus_0_autocast_null/ # GPU 0号卡上量化训练的训练日志和模型保存文件夹 +...... 
+|- python_infer_cpu_usemkldnn_True_threads_1_precision_fluid_batchsize_1.log # CPU上开启Mkldnn线程数设置为1,测试batch_size=1条件下的预测运行日志 +|- python_infer_gpu_precision_trt_fp16_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的半精度预测日志 +...... +``` + +其中`results_python.log`中包含了每条指令的运行状态,如果运行成功会输出: +``` +Run successfully with command - python3.7 tools/train.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o use_gpu=True save_dir=./test_tipc/output/norm_train_gpus_0_autocast_null epoch=1 pretrain_weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams TrainReader.batch_size=2 filename=yolov3_darknet53_270e_coco ! +Run successfully with command - python3.7 tools/eval.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=./test_tipc/output/norm_train_gpus_0_autocast_null/yolov3_darknet53_270e_coco/model_final.pdparams use_gpu=True ! +...... +``` +如果运行失败,会输出: +``` +Run failed with command - python3.7 tools/train.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o use_gpu=True save_dir=./test_tipc/output/norm_train_gpus_0_autocast_null epoch=1 pretrain_weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams TrainReader.batch_size=2 filename=yolov3_darknet53_270e_coco ! +Run failed with command - python3.7 tools/eval.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=./test_tipc/output/norm_train_gpus_0_autocast_null/yolov3_darknet53_270e_coco/model_final.pdparams use_gpu=True ! +...... +``` +可以很方便的根据`results_python.log`中的内容判定哪一个指令运行错误。 + + +### 2.3 精度测试 + +使用compare_results.py脚本比较模型预测的结果是否符合预期,主要步骤包括: +- 提取日志中的预测坐标; +- 从本地文件中提取保存好的坐标结果; +- 比较上述两个结果是否符合精度预期,误差大于设置阈值时会报错。 + +#### 使用方式 +运行命令: +```shell +python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/python_*.txt --log_file=./test_tipc/output/python_*.log --atol=1e-3 --rtol=1e-3 +``` + +参数介绍: +- gt_file: 指向事先保存好的预测结果路径,支持*.txt 结尾,会自动索引*.txt格式的文件,文件默认保存在test_tipc/result/ 文件夹下 +- log_file: 指向运行test_tipc/test_train_inference_python.sh 脚本的infer模式保存的预测日志,预测日志中打印的有预测结果,比如:文本框,预测文本,类别等等,同样支持python_infer_*.log格式传入 +- atol: 设置的绝对误差 +- rtol: 设置的相对误差 + +#### 运行结果 + +正常运行效果如下图: + + +出现不一致结果时的运行输出: + + + +## 3. 更多教程 +本文档为功能测试用,更丰富的训练预测使用教程请参考: +[模型训练](../../docs/tutorials/GETTING_STARTED_cn.md) +[PaddleDetection预测部署](../../deploy/README.md) diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff5d2f74c7138fbd7d59a1e7e6005018a890e98f --- /dev/null +++ b/test_tipc/prepare.sh @@ -0,0 +1,169 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' +# 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer', +# 'cpp_infer', 'serving_infer', 'lite_infer', 'paddle2onnx_infer'] +MODE=$2 + +# parse params +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") + +if [ ${MODE} = "whole_train_whole_infer" ];then + mv ./dataset/coco/download_coco.py . && rm -rf ./dataset/coco/* && mv ./download_coco.py ./dataset/coco/ + # prepare whole training data + eval "${python} ./dataset/coco/download_coco.py" +elif [ ${MODE} = "cpp_infer" ];then + # download coco lite data + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar --no-check-certificate + cd ./dataset/coco/ && tar -xvf coco_tipc.tar && mv -n coco_tipc/* . 
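+    # The wget -nc / tar -xvf / mv -n / rm -rf pattern used for the coco lite data here
+    # is repeated below for the wider_face, spine and mot lite data: -nc skips an
+    # archive that was already downloaded and mv -n never overwrites files that are
+    # already in place, so the script can be re-run safely.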
+ rm -rf coco_tipc/ && cd ../../ + # download wider_face lite data + wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar --no-check-certificate + cd ./dataset/wider_face/ && tar -xvf wider_tipc.tar && mv -n wider_tipc/* . + rm -rf wider_tipc/ && cd ../../ + # download spine lite data + wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_tipc.tar --no-check-certificate + cd ./dataset/spine_coco/ && tar -xvf spine_tipc.tar && mv -n spine_tipc/* . + rm -rf spine_tipc/ && cd ../../ + if [[ ${model_name} =~ "s2anet" ]]; then + cd ./ppdet/ext_op && eval "${python} setup.py install" + cd ../../ + elif [[ ${model_name} =~ "tinypose" ]]; then + wget -nc -P ./output_inference/ https://bj.bcebos.com/v1/paddledet/models/keypoint/picodet_s_320_pedestrian.tar --no-check-certificate + cd ./output_inference/ && tar -xvf picodet_s_320_pedestrian.tar + cd ../ + fi + # download KL model + if [[ ${model_name} = "picodet_lcnet_1_5x_416_coco_KL" ]]; then + wget -nc -P ./output_inference/picodet_lcnet_1_5x_416_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/picodet_lcnet_1_5x_416_coco_ptq.tar --no-check-certificate + cd ./output_inference/picodet_lcnet_1_5x_416_coco_KL/ && tar -xvf picodet_lcnet_1_5x_416_coco_ptq.tar && mv -n picodet_lcnet_1_5x_416_coco_ptq/* . + cd ../../ + elif [[ ${model_name} = "ppyoloe_crn_s_300e_coco_KL" ]]; then + wget -nc -P ./output_inference/ppyoloe_crn_s_300e_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/ppyoloe_crn_s_300e_coco_ptq.tar --no-check-certificate + cd ./output_inference/ppyoloe_crn_s_300e_coco_KL/ && tar -xvf ppyoloe_crn_s_300e_coco_ptq.tar && mv -n ppyoloe_crn_s_300e_coco_ptq/* . + cd ../../ + elif [[ ${model_name} = "ppyolo_mbv3_large_coco_KL" ]]; then + wget -nc -P ./output_inference/ppyolo_mbv3_large_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/ppyolo_mbv3_large_ptq.tar --no-check-certificate + cd ./output_inference/ppyolo_mbv3_large_coco_KL/ && tar -xvf ppyolo_mbv3_large_ptq.tar && mv -n ppyolo_mbv3_large_ptq/* . + cd ../../ + elif [[ ${model_name} = "mask_rcnn_r50_fpn_1x_coco_KL" ]]; then + wget -nc -P ./output_inference/mask_rcnn_r50_fpn_1x_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/mask_rcnn_r50_fpn_1x_coco_ptq.tar --no-check-certificate + cd ./output_inference/mask_rcnn_r50_fpn_1x_coco_KL/ && tar -xvf mask_rcnn_r50_fpn_1x_coco_ptq.tar && mv -n mask_rcnn_r50_fpn_1x_coco_ptq/* . + cd ../../ + elif [[ ${model_name} = "tinypose_128x96_KL" ]]; then + wget -nc -P ./output_inference/tinypose_128x96_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/tinypose_128x96_ptq.tar --no-check-certificate + cd ./output_inference/tinypose_128x96_KL/ && tar -xvf tinypose_128x96_ptq.tar && mv -n tinypose_128x96_ptq/* . + cd ../../ + fi + # download mot lite data + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar --no-check-certificate + cd ./dataset/mot/ && tar -xvf mot_tipc.tar && mv -n mot_tipc/* . + rm -rf mot_tipc/ && cd ../../ + + opencv_dir=$(func_parser_value "${lines[15]}") + # prepare opencv + cd ./deploy/cpp + if [ ${opencv_dir} = "default" ] || [ ${opencv_dir} = "null" ]; then + if [ -d "deps/opencv-3.4.16_gcc8.2_ffmpeg/" ]; then + echo "################### Opencv already exists, skip downloading. 
###################" + else + mkdir -p $(pwd)/deps && cd $(pwd)/deps + wget -c https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz --no-check-certificate + tar -xvf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz && cd ../ + echo "################### Finish downloading opencv. ###################" + fi + fi + cd ../../ +elif [ ${MODE} = "benchmark_train" ];then + pip install -U pip + pip install Cython + pip install -r requirements.txt + if [[ ${model_name} =~ "higherhrnet" ]] || [[ ${model_name} =~ "hrnet" ]] || [[ ${model_name} =~ "tinypose" ]];then + wget -nc -P ./dataset/ https://bj.bcebos.com/v1/paddledet/data/coco.tar --no-check-certificate + cd ./dataset/ && tar -xf coco.tar + ls ./coco/ + cd ../ + else + # prepare lite benchmark coco data + wget -nc -P ./dataset/coco/ https://bj.bcebos.com/v1/paddledet/data/cocomini.zip --no-check-certificate + cd ./dataset/coco/ && unzip cocomini.zip + mv -u cocomini/* ./ + ls ./ + cd ../../ + # prepare lite benchmark mot data + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/mot_benchmark.tar --no-check-certificate + cd ./dataset/mot/ && tar -xf mot_benchmark.tar + mv -u mot_benchmark/* ./ + ls ./ + cd ../../ + fi +elif [ ${MODE} = "paddle2onnx_infer" ];then + # install paddle2onnx + ${python} -m pip install paddle2onnx + ${python} -m pip install onnx onnxruntime +elif [ ${MODE} = "serving_infer" ];then + unset https_proxy http_proxy + # download coco lite data + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar --no-check-certificate + cd ./dataset/coco/ && tar -xvf coco_tipc.tar && mv -n coco_tipc/* . + rm -rf coco_tipc/ && cd ../../ + # download KL model + if [[ ${model_name} = "picodet_lcnet_1_5x_416_coco_KL" ]]; then + wget -nc -P ./output_inference/picodet_lcnet_1_5x_416_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/picodet_lcnet_1_5x_416_coco_ptq.tar --no-check-certificate + cd ./output_inference/picodet_lcnet_1_5x_416_coco_KL/ && tar -xvf picodet_lcnet_1_5x_416_coco_ptq.tar && mv -n picodet_lcnet_1_5x_416_coco_ptq/* . + cd ../../ + eval "${python} -m paddle_serving_client.convert --dirname output_inference/picodet_lcnet_1_5x_416_coco_KL/ --model_filename model.pdmodel --params_filename model.pdiparams --serving_server output_inference/picodet_lcnet_1_5x_416_coco_KL/serving_server --serving_client output_inference/picodet_lcnet_1_5x_416_coco_KL/serving_client" + elif [[ ${model_name} = "ppyoloe_crn_s_300e_coco_KL" ]]; then + wget -nc -P ./output_inference/ppyoloe_crn_s_300e_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/ppyoloe_crn_s_300e_coco_ptq.tar --no-check-certificate + cd ./output_inference/ppyoloe_crn_s_300e_coco_KL/ && tar -xvf ppyoloe_crn_s_300e_coco_ptq.tar && mv -n ppyoloe_crn_s_300e_coco_ptq/* . + cd ../../ + eval "${python} -m paddle_serving_client.convert --dirname output_inference/ppyoloe_crn_s_300e_coco_KL/ --model_filename model.pdmodel --params_filename model.pdiparams --serving_server output_inference/ppyoloe_crn_s_300e_coco_KL/serving_server --serving_client output_inference/ppyoloe_crn_s_300e_coco_KL/serving_client" + elif [[ ${model_name} = "ppyolo_mbv3_large_coco_KL" ]]; then + wget -nc -P ./output_inference/ppyolo_mbv3_large_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/ppyolo_mbv3_large_ptq.tar --no-check-certificate + cd ./output_inference/ppyolo_mbv3_large_coco_KL/ && tar -xvf ppyolo_mbv3_large_ptq.tar && mv -n ppyolo_mbv3_large_ptq/* . 
+ cd ../../ + eval "${python} -m paddle_serving_client.convert --dirname output_inference/ppyolo_mbv3_large_coco_KL/ --model_filename model.pdmodel --params_filename model.pdiparams --serving_server output_inference/ppyolo_mbv3_large_coco_KL/serving_server --serving_client output_inference/ppyolo_mbv3_large_coco_KL/serving_client" + elif [[ ${model_name} = "mask_rcnn_r50_fpn_1x_coco_KL" ]]; then + wget -nc -P ./output_inference/mask_rcnn_r50_fpn_1x_coco_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/mask_rcnn_r50_fpn_1x_coco_ptq.tar --no-check-certificate + cd ./output_inference/mask_rcnn_r50_fpn_1x_coco_KL/ && tar -xvf mask_rcnn_r50_fpn_1x_coco_ptq.tar && mv -n mask_rcnn_r50_fpn_1x_coco_ptq/* . + cd ../../ + eval "${python} -m paddle_serving_client.convert --dirname output_inference/mask_rcnn_r50_fpn_1x_coco_KL/ --model_filename model.pdmodel --params_filename model.pdiparams --serving_server output_inference/mask_rcnn_r50_fpn_1x_coco_KL/serving_server --serving_client output_inference/mask_rcnn_r50_fpn_1x_coco_KL/serving_client" + elif [[ ${model_name} = "tinypose_128x96_KL" ]]; then + wget -nc -P ./output_inference/tinypose_128x96_KL/ https://bj.bcebos.com/v1/paddledet/data/tipc/models/tinypose_128x96_ptq.tar --no-check-certificate + cd ./output_inference/tinypose_128x96_KL/ && tar -xvf tinypose_128x96_ptq.tar && mv -n tinypose_128x96_ptq/* . + cd ../../ + eval "${python} -m paddle_serving_client.convert --dirname output_inference/tinypose_128x96_KL/ --model_filename model.pdmodel --params_filename model.pdiparams --serving_server output_inference/tinypose_128x96_KL/serving_server --serving_client output_inference/tinypose_128x96_KL/serving_client" + fi +else + # download coco lite data + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar --no-check-certificate + cd ./dataset/coco/ && tar -xvf coco_tipc.tar && mv -n coco_tipc/* . + rm -rf coco_tipc/ && cd ../../ + # download wider_face lite data + wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar --no-check-certificate + cd ./dataset/wider_face/ && tar -xvf wider_tipc.tar && mv -n wider_tipc/* . + rm -rf wider_tipc/ && cd ../../ + # download spine_coco lite data + wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_coco_tipc.tar --no-check-certificate + cd ./dataset/spine_coco/ && tar -xvf spine_coco_tipc.tar && mv -n spine_coco_tipc/* . + rm -rf spine_tipc/ && cd ../../ + if [[ ${model_name} =~ "s2anet" ]]; then + cd ./ppdet/ext_op && eval "${python} setup.py install" + cd ../../ + elif [[ ${model_name} =~ "ppyoloe_r_crn_s_3x_spine_coco" ]]; then + cd ./ppdet/ext_op && eval "${python} setup.py install" + cd ../../ + fi + # download mot lite data + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar --no-check-certificate + cd ./dataset/mot/ && tar -xvf mot_tipc.tar && mv -n mot_tipc/* . 
+ rm -rf mot_tipc/ && cd ../../ +fi diff --git a/test_tipc/requirements.txt b/test_tipc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a1aa828406da3c5e884799339564e5c1672edec --- /dev/null +++ b/test_tipc/requirements.txt @@ -0,0 +1,7 @@ +pynvml +psutil +GPUtil +paddleslim +onnx +onnxruntime +paddle2onnx diff --git a/test_tipc/test_inference_cpp.sh b/test_tipc/test_inference_cpp.sh new file mode 100644 index 0000000000000000000000000000000000000000..270ee70397f006ab2460f5e29503586abcd2fa64 --- /dev/null +++ b/test_tipc/test_inference_cpp.sh @@ -0,0 +1,221 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +MODE="cpp_infer" + +# parser model_name +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet cpp_infer: ${model_name}" +python=$(func_parser_value "${lines[2]}") +filename_key=$(func_parser_key "${lines[3]}") +filename_value=$(func_parser_value "${lines[3]}") + +# export params +save_export_key=$(func_parser_key "${lines[5]}") +save_export_value=$(func_parser_value "${lines[5]}") +export_weight_key=$(func_parser_key "${lines[6]}") +export_weight_value=$(func_parser_value "${lines[6]}") +norm_export=$(func_parser_value "${lines[7]}") +pact_export=$(func_parser_value "${lines[8]}") +fpgm_export=$(func_parser_value "${lines[9]}") +distill_export=$(func_parser_value "${lines[10]}") +export_key1=$(func_parser_key "${lines[11]}") +export_value1=$(func_parser_value "${lines[11]}") +export_key2=$(func_parser_key "${lines[12]}") +export_value2=$(func_parser_value "${lines[12]}") +kl_quant_export=$(func_parser_value "${lines[13]}") + +# parser cpp inference model +opencv_dir=$(func_parser_value "${lines[15]}") +cpp_infer_mode_list=$(func_parser_value "${lines[16]}") +cpp_infer_is_quant_list=$(func_parser_value "${lines[17]}") +# parser cpp inference +inference_cmd=$(func_parser_value "${lines[18]}") +cpp_use_gpu_key=$(func_parser_key "${lines[19]}") +cpp_use_gpu_list=$(func_parser_value "${lines[19]}") +cpp_use_mkldnn_key=$(func_parser_key "${lines[20]}") +cpp_use_mkldnn_list=$(func_parser_value "${lines[20]}") +cpp_cpu_threads_key=$(func_parser_key "${lines[21]}") +cpp_cpu_threads_list=$(func_parser_value "${lines[21]}") +cpp_batch_size_key=$(func_parser_key "${lines[22]}") +cpp_batch_size_list=$(func_parser_value "${lines[22]}") +cpp_use_trt_key=$(func_parser_key "${lines[23]}") +cpp_use_trt_list=$(func_parser_value "${lines[23]}") +cpp_precision_key=$(func_parser_key "${lines[24]}") +cpp_precision_list=$(func_parser_value "${lines[24]}") +cpp_infer_model_key=$(func_parser_key "${lines[25]}") +cpp_image_dir_key=$(func_parser_key "${lines[26]}") +cpp_infer_img_dir=$(func_parser_value "${lines[26]}") +cpp_benchmark_key=$(func_parser_key "${lines[27]}") +cpp_benchmark_value=$(func_parser_value "${lines[27]}") +cpp_infer_key1=$(func_parser_key "${lines[28]}") +cpp_infer_value1=$(func_parser_value "${lines[28]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_cpp.log" + +function func_cpp_inference(){ + IFS='|' + _script=$1 + _model_dir=$2 + _log_path=$3 + _img_dir=$4 + _flag_quant=$5 + # inference + for use_gpu in ${cpp_use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpp_cpu_threads_list[*]}; do + for batch_size in 
${cpp_batch_size_list[*]}; do + _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_mode_paddle_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpp_cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for precision in ${cpp_precision_list[*]}; do + if [[ ${precision} != "paddle" ]]; then + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} = "trt_int8" ]]; then + continue + fi + if [[ ${_flag_quant} = "True" ]] && [[ ${precision} != "trt_int8" ]]; then + continue + fi + fi + for batch_size in ${cpp_batch_size_list[*]}; do + _save_log_path="${_log_path}/cpp_infer_gpu_mode_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_precision=$(func_set_params "${cpp_precision_key}" "${precision}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +cd ./deploy/cpp +# set OPENCV_DIR +if [ ${opencv_dir} = "default" ] || [ ${opencv_dir} = "null" ]; then + OPENCV_DIR=$(pwd)/deps/opencv-3.4.16_gcc8.2_ffmpeg +else + OPENCV_DIR=${opencv_dir} +fi + +# build program +# TODO: set PADDLE_INFER_DIR and TENSORRT_ROOT +if [ -z $PADDLE_INFER_DIR ]; then + Paddle_Infer_Link=$2 + if [ "" = "$Paddle_Infer_Link" ];then + wget -nc https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz --no-check-certificate + tar zxf paddle_inference.tgz + PADDLE_INFER_DIR=$(pwd)/paddle_inference + else + wget -nc $Paddle_Infer_Link --no-check-certificate + tar zxf paddle_inference.tgz + PADDLE_INFER_DIR=$(pwd)/paddle_inference + if [ ! 
-d "paddle_inference" ]; then + PADDLE_INFER_DIR=$(pwd)/paddle_inference_install_dir + fi + fi +fi +if [ -z $TENSORRT_ROOT ]; then + TENSORRT_ROOT=/usr/local/TensorRT6-cuda10.1-cudnn7 +fi +CUDA_LIB=$(dirname `find /usr -name libcudart.so`) +CUDNN_LIB=$(dirname `find /usr -name libcudnn.so`) +TENSORRT_LIB_DIR="${TENSORRT_ROOT}/lib" +TENSORRT_INC_DIR="${TENSORRT_ROOT}/include" + +rm -rf build +mkdir -p build +cd ./build +cmake .. \ + -DWITH_GPU=ON \ + -DWITH_MKL=ON \ + -DWITH_TENSORRT=OFF \ + -DPADDLE_LIB_NAME=libpaddle_inference \ + -DPADDLE_DIR=${PADDLE_INFER_DIR} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DTENSORRT_LIB_DIR=${TENSORRT_LIB_DIR} \ + -DTENSORRT_INC_DIR=${TENSORRT_INC_DIR} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DWITH_KEYPOINT=ON \ + -DWITH_MOT=ON + +make -j8 +cd ../../../ +echo "################### build finished! ###################" + + +# set cuda device +GPUID=$3 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +eval $env + +# run cpp infer +Count=0 +IFS="|" +infer_quant_flag=(${cpp_infer_is_quant_list}) +for infer_mode in ${cpp_infer_mode_list[*]}; do + if [ ${infer_mode} != "null" ]; then + # run export + case ${infer_mode} in + norm) run_export=${norm_export} ;; + quant) run_export=${pact_export} ;; + fpgm) run_export=${fpgm_export} ;; + distill) run_export=${distill_export} ;; + kl_quant) run_export=${kl_quant_export} ;; + *) echo "Undefined infer_mode!"; exit 1; + esac + set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") + set_filename=$(func_set_params "${filename_key}" "${model_name}") + export_log_path="${LOG_PATH}/export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_save_export_dir} " + echo $export_cmd + eval "${export_cmd} > ${export_log_path} 2>&1" + status_export=$? 
+ cat ${export_log_path} + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + fi + + #run inference + save_export_model_dir="${save_export_value}/${model_name}" + is_quant=${infer_quant_flag[Count]} + func_cpp_inference "${inference_cmd}" "${save_export_model_dir}" "${LOG_PATH}" "${cpp_infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) +done +eval "unset CUDA_VISIBLE_DEVICES" diff --git a/test_tipc/test_lite.sh b/test_tipc/test_lite.sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test_tipc/test_paddle2onnx.sh b/test_tipc/test_paddle2onnx.sh new file mode 100644 index 0000000000000000000000000000000000000000..df4e7a0dc8c0a4f6af844a9fa2edd95f6070073e --- /dev/null +++ b/test_tipc/test_paddle2onnx.sh @@ -0,0 +1,130 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +MODE="paddle2onnx_infer" + +# parser model_name +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet onnx_infer: ${model_name}" +python=$(func_parser_value "${lines[2]}") +filename_key=$(func_parser_key "${lines[3]}") +filename_value=$(func_parser_value "${lines[3]}") + +# export params +save_export_key=$(func_parser_key "${lines[5]}") +save_export_value=$(func_parser_value "${lines[5]}") +export_weight_key=$(func_parser_key "${lines[6]}") +export_weight_value=$(func_parser_value "${lines[6]}") +norm_export=$(func_parser_value "${lines[7]}") +pact_export=$(func_parser_value "${lines[8]}") +fpgm_export=$(func_parser_value "${lines[9]}") +distill_export=$(func_parser_value "${lines[10]}") +export_key1=$(func_parser_key "${lines[11]}") +export_value1=$(func_parser_value "${lines[11]}") +export_param_key=$(func_parser_key "${lines[12]}") +export_param_value=$(func_parser_value "${lines[12]}") +kl_quant_export=$(func_parser_value "${lines[13]}") + +# parser paddle2onnx params +infer_mode_list=$(func_parser_value "${lines[15]}") +infer_is_quant_list=$(func_parser_value "${lines[16]}") + +padlle2onnx_cmd=$(func_parser_value "${lines[17]}") +model_dir_key=$(func_parser_key "${lines[18]}") +model_filename_key=$(func_parser_key "${lines[19]}") +model_filename_value=$(func_parser_value "${lines[19]}") +params_filename_key=$(func_parser_key "${lines[20]}") +params_filename_value=$(func_parser_value "${lines[20]}") +save_file_key=$(func_parser_key "${lines[21]}") +save_file_value=$(func_parser_value "${lines[21]}") +opset_version_key=$(func_parser_key "${lines[22]}") +opset_version_value=$(func_parser_value "${lines[22]}") +enable_onnx_checker_key=$(func_parser_key "${lines[23]}") +enable_onnx_checker_value=$(func_parser_value "${lines[23]}") +paddle2onnx_params1_key=$(func_parser_key "${lines[24]}") +paddle2onnx_params1_value=$(func_parser_value "${lines[24]}") + +# parser onnx inference +inference_py=$(func_parser_value "${lines[25]}") +infer_cfg_key=$(func_parser_key "${lines[26]}") +onnx_file_key=$(func_parser_key "${lines[27]}") +infer_image_key=$(func_parser_key "${lines[28]}") +infer_image_value=$(func_parser_value "${lines[28]}") +infer_param1_key=$(func_parser_key "${lines[29]}") +infer_param1_value=$(func_parser_value "${lines[29]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_paddle2onnx.log" + +function func_paddle2onnx_inference(){ + IFS='|' + _python=$1 + _log_path=$2 + _export_model_dir=$3 + + # paddle2onnx + echo "################### run paddle2onnx 
###################" + set_dirname=$(func_set_params "${model_dir_key}" "${_export_model_dir}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_save_model=$(func_set_params "${save_file_key}" "${_export_model_dir}/${save_file_value}") + set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") + set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") + set_paddle2onnx_params1=$(func_set_params "${paddle2onnx_params1_key}" "${paddle2onnx_params1_value}") + trans_log_path="${_log_path}/trans_model.log" + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} ${set_paddle2onnx_params1}" + eval "${trans_model_cmd} > ${trans_log_path} 2>&1" + last_status=${PIPESTATUS[0]} + cat ${trans_log_path} + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_log_path}" + + # python inference + echo "################### run onnx infer ###################" + set_infer_cfg=$(func_set_params "${infer_cfg_key}" "${_export_model_dir}/infer_cfg.yml") + set_onnx_file=$(func_set_params "${onnx_file_key}" "${_export_model_dir}/${save_file_value}") + set_infer_image_file=$(func_set_params "${infer_image_key}" "${infer_image_value}") + set_infer_param1=$(func_set_params "${infer_param1_key}" "${infer_param1_value}") + _save_log_path="${_log_path}/paddle2onnx_infer_cpu.log" + infer_model_cmd="${python} ${inference_py} ${set_infer_cfg} ${set_onnx_file} ${set_infer_image_file} ${set_infer_param1}" + eval "${infer_model_cmd} > ${_save_log_path} 2>&1" + last_status=${PIPESTATUS[0]} + cat ${_save_log_path} + status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" +} + +export Count=0 +IFS="|" +for infer_mode in ${infer_mode_list[*]}; do + if [ ${infer_mode} != "null" ]; then + # run export + case ${infer_mode} in + norm) run_export=${norm_export} ;; + quant) run_export=${pact_export} ;; + fpgm) run_export=${fpgm_export} ;; + distill) run_export=${distill_export} ;; + kl_quant) run_export=${kl_quant_export} ;; + *) echo "Undefined infer_mode!"; exit 1; + esac + set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") + set_filename=$(func_set_params "${filename_key}" "${model_name}") + set_export_param=$(func_set_params "${export_param_key}" "${export_param_value}") + export_log_path="${LOG_PATH}/export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_export_param} ${set_save_export_dir} " + echo $export_cmd + eval "${export_cmd} > ${export_log_path} 2>&1" + status_export=$? 
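+        # once the export succeeds, the model under ${save_export_value}/${model_name} is
+        # converted to ONNX and run with ${inference_py} by func_paddle2onnx_inference below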
+ cat ${export_log_path} + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + fi + + #run inference + export_model_dir="${save_export_value}/${model_name}" + func_paddle2onnx_inference "${python}" "${LOG_PATH}" "${export_model_dir}" + Count=$(($Count + 1)) +done diff --git a/test_tipc/test_ptq_inference_python.sh b/test_tipc/test_ptq_inference_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..6371d2ab3e15f0cc94babb2ff57c6e94026c7fba --- /dev/null +++ b/test_tipc/test_ptq_inference_python.sh @@ -0,0 +1,115 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +MODE="whole_infer" + +# parser model_name +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet ptq: ${model_name}" +python=$(func_parser_value "${lines[2]}") +filename_key=$(func_parser_key "${lines[3]}") + +# parser export params +save_export_key=$(func_parser_key "${lines[5]}") +save_export_value=$(func_parser_value "${lines[5]}") +export_weight_key=$(func_parser_key "${lines[6]}") +export_weight_value=$(func_parser_value "${lines[6]}") +kl_quant_export=$(func_parser_value "${lines[7]}") +export_param1_key=$(func_parser_key "${lines[8]}") +export_param1_value=$(func_parser_value "${lines[8]}") + +# parser infer params +inference_py=$(func_parser_value "${lines[10]}") +device_key=$(func_parser_key "${lines[11]}") +device_list=$(func_parser_value "${lines[11]}") +use_mkldnn_key=$(func_parser_key "${lines[12]}") +use_mkldnn_list=$(func_parser_value "${lines[12]}") +cpu_threads_key=$(func_parser_key "${lines[13]}") +cpu_threads_list=$(func_parser_value "${lines[13]}") +batch_size_key=$(func_parser_key "${lines[14]}") +batch_size_list=$(func_parser_value "${lines[14]}") +run_mode_key=$(func_parser_key "${lines[15]}") +run_mode_list=$(func_parser_value "${lines[15]}") +model_dir_key=$(func_parser_key "${lines[16]}") +image_dir_key=$(func_parser_key "${lines[17]}") +image_dir_value=$(func_parser_value "${lines[17]}") +run_benchmark_key=$(func_parser_key "${lines[18]}") +run_benchmark_value=$(func_parser_value "${lines[18]}") +infer_param1_key=$(func_parser_key "${lines[19]}") +infer_param1_value=$(func_parser_value "${lines[19]}") + + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_ptq_python.log" + +function func_ptq_inference(){ + IFS='|' + _python=$1 + _log_path=$2 + _script=$3 + _set_model_dir=$4 + + set_image_dir=$(func_set_params "${image_dir_key}" "${image_dir_value}") + set_run_benchmark=$(func_set_params "${run_benchmark_key}" "${run_benchmark_value}") + set_infer_param1=$(func_set_params "${infer_param1_key}" "${infer_param1_value}") + # inference + for device in ${device_list[*]}; do + set_device=$(func_set_params "${device_key}" "${device}") + if [ ${device} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + set_use_mkldnn=$(func_set_params "${use_mkldnn_key}" "${use_mkldnn}") + for threads in ${cpu_threads_list[*]}; do + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_mode_paddle_batchsize_${batch_size}.log" + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + command="${_python} ${_script} ${set_device} ${set_use_mkldnn} ${set_cpu_threads} ${_set_model_dir} ${set_batchsize} ${set_image_dir} ${set_run_benchmark} 
${set_infer_param1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + done + elif [ ${device} = "gpu" ]; then + for run_mode in ${run_mode_list[*]}; do + if [[ ${run_mode} = "paddle" ]] || [[ ${run_mode} = "trt_int8" ]]; then + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_mode_${run_mode}_batchsize_${batch_size}.log" + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_run_mode=$(func_set_params "${run_mode_key}" "${run_mode}") + command="${_python} ${_script} ${set_device} ${set_run_mode} ${_set_model_dir} ${set_batchsize} ${set_image_dir} ${set_run_benchmark} ${set_infer_param1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + fi + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +IFS="|" +# run ptq +set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") +set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") +set_filename=$(func_set_params "${filename_key}" "${model_name}") +export_log_path="${LOG_PATH}/export.log" +ptq_cmd="${python} ${kl_quant_export} ${set_export_weight} ${set_filename} ${set_save_export_dir}" +echo $ptq_cmd +eval "${ptq_cmd} > ${export_log_path} 2>&1" +status_export=$? +cat ${export_log_path} +status_check $status_export "${ptq_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + +#run inference +set_export_model_dir=$(func_set_params "${model_dir_key}" "${save_export_value}/${model_name}") +func_ptq_inference "${python}" "${LOG_PATH}" "${inference_py}" "${set_export_model_dir}" diff --git a/test_tipc/test_serving_infer_cpp.sh b/test_tipc/test_serving_infer_cpp.sh new file mode 100644 index 0000000000000000000000000000000000000000..4be299c16d3cabf39683d906cdd4b34bc8171351 --- /dev/null +++ b/test_tipc/test_serving_infer_cpp.sh @@ -0,0 +1,131 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +MODE="serving_infer" + +# parser model_name +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet serving_cpp_infer: ${model_name}" +python=$(func_parser_value "${lines[2]}") +filename_key=$(func_parser_key "${lines[3]}") +filename_value=$(func_parser_value "${lines[3]}") + +# parser export params +save_export_key=$(func_parser_key "${lines[5]}") +save_export_value=$(func_parser_value "${lines[5]}") +export_weight_key=$(func_parser_key "${lines[6]}") +export_weight_value=$(func_parser_value "${lines[6]}") +norm_export=$(func_parser_value "${lines[7]}") +pact_export=$(func_parser_value "${lines[8]}") +fpgm_export=$(func_parser_value "${lines[9]}") +distill_export=$(func_parser_value "${lines[10]}") +export_key1=$(func_parser_key "${lines[11]}") +export_value1=$(func_parser_value "${lines[11]}") +export_key2=$(func_parser_key "${lines[12]}") +export_value2=$(func_parser_value "${lines[12]}") +kl_quant_export=$(func_parser_value "${lines[13]}") + +# parser serving params +infer_mode_list=$(func_parser_value "${lines[15]}") +infer_is_quant_list=$(func_parser_value "${lines[16]}") + +model_key=$(func_parser_key "${lines[17]}") +op_key=$(func_parser_key "${lines[18]}") +op_value=$(func_parser_value 
"${lines[18]}") +port_key=$(func_parser_key "${lines[19]}") +port_value=$(func_parser_value "${lines[19]}") +gpu_ids_key=$(func_parser_key "${lines[20]}") +gpu_ids_value=$(func_parser_value "${lines[20]}") +web_service_key1=$(func_parser_key "${lines[21]}") +web_service_value1=$(func_parser_value "${lines[21]}") +http_client_py=$(func_parser_value "${lines[22]}") +serving_client_key=$(func_parser_key "${lines[23]}") +infer_image_key=$(func_parser_key "${lines[24]}") +infer_image_value=$(func_parser_value "${lines[24]}") +http_client_key1=$(func_parser_key "${lines[25]}") +http_client_value1=$(func_parser_value "${lines[25]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_serving_cpp.log" + +function func_serving_inference(){ + IFS='|' + _python=$1 + _log_path=$2 + _set_server_model_dir=$3 + _set_client_model_dir=$4 + _set_image_file=$5 + + set_op=$(func_set_params "${op_key}" "${op_value}") + set_port=$(func_set_params "${port_key}" "${port_value}") + set_web_service_params1=$(func_set_params "${web_service_key1}" "${web_service_value1}") + set_http_client_params1=$(func_set_params "${http_client_key1}" "${http_client_value1}") + # inference + for gpu_ids in ${gpu_ids_value[*]}; do + if [ ${gpu_ids} = "null" ];then + server_log_path="${_log_path}/cpp_server_cpu.log" + client_log_path="${_log_path}/cpp_client_cpu.log" + else + server_log_path="${_log_path}/cpp_server_gpu.log" + client_log_path="${_log_path}/cpp_client_gpu.log" + fi + set_gpu_ids=$(func_set_params "${gpu_ids_key}" "${gpu_ids}") + # run web service + web_service_cmd="${_python} -m paddle_serving_server.serve ${_set_server_model_dir} ${set_op} ${set_port} ${set_gpu_ids} ${set_web_service_params1} > ${server_log_path} 2>&1 &" + eval $web_service_cmd + last_status=${PIPESTATUS[0]} + cat ${server_log_path} + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" + sleep 5s + # run http client + http_client_cmd="${_python} ${http_client_py} ${_set_client_model_dir} ${_set_image_file} ${set_http_client_params1} > ${client_log_path} 2>&1" + eval $http_client_cmd + last_status=${PIPESTATUS[0]} + cat ${client_log_path} + status_check $last_status "${http_client_cmd}" "${status_log}" "${model_name}" "${client_log_path}" + ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + sleep 2s + done +} + +# run serving infer +Count=0 +IFS="|" +infer_quant_flag=(${infer_is_quant_list}) +for infer_mode in ${infer_mode_list[*]}; do + if [ ${infer_mode} != "null" ]; then + # run export + case ${infer_mode} in + norm) run_export=${norm_export} ;; + quant) run_export=${pact_export} ;; + fpgm) run_export=${fpgm_export} ;; + distill) run_export=${distill_export} ;; + kl_quant) run_export=${kl_quant_export} ;; + *) echo "Undefined infer_mode!"; exit 1; + esac + set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") + set_filename=$(func_set_params "${filename_key}" "${model_name}") + export_log_path="${LOG_PATH}/export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_save_export_dir} " + echo $export_cmd + eval "${export_cmd} > ${export_log_path} 2>&1" + status_export=$? 
+ cat ${export_log_path} + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + fi + + #run inference + set_server_model_dir=$(func_set_params "${model_key}" "${save_export_value}/${model_name}/serving_server") + set_client_model_dir=$(func_set_params "${serving_client_key}" "${save_export_value}/${model_name}/serving_client") + set_infer_image_file=$(func_set_params "${infer_image_key}" "${infer_image_value}") + is_quant=${infer_quant_flag[Count]} + func_serving_inference "${python}" "${LOG_PATH}" "${set_server_model_dir}" "${set_client_model_dir}" ${set_infer_image_file} + Count=$(($Count + 1)) +done +eval "unset CUDA_VISIBLE_DEVICES" diff --git a/test_tipc/test_serving_infer_python.sh b/test_tipc/test_serving_infer_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd7cc07b1449793e4ab52b2cd3e2bc7d78cb7433 --- /dev/null +++ b/test_tipc/test_serving_infer_python.sh @@ -0,0 +1,130 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +MODE="serving_infer" + +# parser model_name +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet serving_python_infer: ${model_name}" +python=$(func_parser_value "${lines[2]}") +filename_key=$(func_parser_key "${lines[3]}") +filename_value=$(func_parser_value "${lines[3]}") + +# parser export params +save_export_key=$(func_parser_key "${lines[5]}") +save_export_value=$(func_parser_value "${lines[5]}") +export_weight_key=$(func_parser_key "${lines[6]}") +export_weight_value=$(func_parser_value "${lines[6]}") +norm_export=$(func_parser_value "${lines[7]}") +pact_export=$(func_parser_value "${lines[8]}") +fpgm_export=$(func_parser_value "${lines[9]}") +distill_export=$(func_parser_value "${lines[10]}") +export_key1=$(func_parser_key "${lines[11]}") +export_value1=$(func_parser_value "${lines[11]}") +export_key2=$(func_parser_key "${lines[12]}") +export_value2=$(func_parser_value "${lines[12]}") +kl_quant_export=$(func_parser_value "${lines[13]}") + +# parser serving params +infer_mode_list=$(func_parser_value "${lines[15]}") +infer_is_quant_list=$(func_parser_value "${lines[16]}") + +web_service_py=$(func_parser_value "${lines[17]}") +model_dir_key=$(func_parser_key "${lines[18]}") +opt_key=$(func_parser_key "${lines[19]}") +opt_use_gpu_list=$(func_parser_value "${lines[19]}") +web_service_key1=$(func_parser_key "${lines[20]}") +web_service_value1=$(func_parser_value "${lines[20]}") +http_client_py=$(func_parser_value "${lines[21]}") +infer_image_key=$(func_parser_key "${lines[22]}") +infer_image_value=$(func_parser_value "${lines[22]}") +http_client_key1=$(func_parser_key "${lines[23]}") +http_client_value1=$(func_parser_value "${lines[23]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_serving_python.log" + +function func_serving_inference(){ + IFS='|' + _python=$1 + _log_path=$2 + _service_script=$3 + _client_script=$4 + _set_model_dir=$5 + _set_image_file=$6 + set_web_service_params1=$(func_set_params "${web_service_key1}" "${web_service_value1}") + set_http_client_params1=$(func_set_params "${http_client_key1}" "${http_client_value1}") + # inference + for opt in ${opt_use_gpu_list[*]}; do + device_type=$(func_parser_key "${opt}") + server_log_path="${_log_path}/python_server_${device_type}.log" + client_log_path="${_log_path}/python_client_${device_type}.log" + opt_value=$(func_parser_value "${opt}") + _set_opt=$(func_set_params "${opt_key}" 
"${opt_value}") + # run web service + web_service_cmd="${_python} ${_service_script} ${_set_model_dir} ${_set_opt} ${set_web_service_params1} > ${server_log_path} 2>&1 &" + eval $web_service_cmd + last_status=${PIPESTATUS[0]} + cat ${server_log_path} + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" + sleep 5s + # run http client + http_client_cmd="${_python} ${_client_script} ${_set_image_file} ${set_http_client_params1} > ${client_log_path} 2>&1" + eval $http_client_cmd + last_status=${PIPESTATUS[0]} + cat ${client_log_path} + status_check $last_status "${http_client_cmd}" "${status_log}" "${model_name}" "${client_log_path}" + ps ux | grep -E 'web_service' | awk '{print $2}' | xargs kill -s 9 + sleep 2s + done +} + +# set cuda device +GPUID=$3 +if [ ${#GPUID} -le 0 ];then + env="export CUDA_VISIBLE_DEVICES=0" +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +eval $env + +# run serving infer +Count=0 +IFS="|" +infer_quant_flag=(${infer_is_quant_list}) +for infer_mode in ${infer_mode_list[*]}; do + if [ ${infer_mode} != "null" ]; then + # run export + case ${infer_mode} in + norm) run_export=${norm_export} ;; + quant) run_export=${pact_export} ;; + fpgm) run_export=${fpgm_export} ;; + distill) run_export=${distill_export} ;; + kl_quant) run_export=${kl_quant_export} ;; + *) echo "Undefined infer_mode!"; exit 1; + esac + set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") + set_filename=$(func_set_params "${filename_key}" "${model_name}") + export_log_path="${LOG_PATH}/export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_save_export_dir} " + echo $export_cmd + eval "${export_cmd} > ${export_log_path} 2>&1" + status_export=$? 
+ cat ${export_log_path} + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + fi + + #run inference + set_export_model_dir=$(func_set_params "${model_dir_key}" "${save_export_value}/${model_name}") + set_infer_image_file=$(func_set_params "${infer_image_key}" "${infer_image_value}") + is_quant=${infer_quant_flag[Count]} + func_serving_inference "${python}" "${LOG_PATH}" "${web_service_py}" "${http_client_py}" "${set_export_model_dir}" ${set_infer_image_file} + Count=$(($Count + 1)) +done +eval "unset CUDA_VISIBLE_DEVICES" diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..44cf9ea35b86df6dd0d39394244bfecd859a93dd --- /dev/null +++ b/test_tipc/test_train_inference_python.sh @@ -0,0 +1,370 @@ +#!/bin/bash +source test_tipc/utils_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' +# 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer'] +MODE=$2 + +# parse params +dataline=$(cat ${FILENAME}) +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +echo "ppdet python_infer: ${model_name}" +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") + +trainer_list=$(func_parser_value "${lines[14]}") +norm_key=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +# eval params +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +# export params +save_export_key=$(func_parser_key "${lines[27]}") +save_export_value=$(func_parser_value "${lines[27]}") +export_weight_key=$(func_parser_key "${lines[28]}") +export_weight_value=$(func_parser_value "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_onnx_key=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") 
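+# every ${lines[N]} entry of the TIPC config is a "key:value" pair; func_parser_key and
+# func_parser_value (test_tipc/utils_func.sh) split it on the first ':', e.g. an illustrative
+# line "norm_export:tools/export_model.py -c configs/yolov3.yml -o" gives the key "norm_export"
+# and the value "tools/export_model.py -c configs/yolov3.yml -o"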
+kl_quant_export=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_mode_list=$(func_parser_value "${lines[37]}") +infer_is_quant_list=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + +line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1` +to_static_key=$(func_parser_key "${lines[line_num]}") +to_static_trainer=$(func_parser_value "${lines[line_num]}") + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + _gpu=$7 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_cpu_gpus_${gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_mode_paddle_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for precision in ${precision_list[*]}; do + if [[ ${precision} != "paddle" ]]; then + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} = "trt_int8" ]]; then + continue + fi + if [[ ${_flag_quant} = "True" ]] && [[ ${precision} != "trt_int8" ]]; then + continue + fi + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_gpus_${gpu}_mode_${precision}_batchsize_${batch_size}.log" 
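+                    # the log name records gpu ids, precision mode and batch size, so the
+                    # sweep over precision_list and batch_size_list never overwrites a log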
+ set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + # set CUDA_VISIBLE_DEVICES + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + eval $env + + Count=0 + gpu=0 + IFS="|" + infer_quant_flag=(${infer_is_quant_list}) + for infer_mode in ${infer_mode_list[*]}; do + if [ ${infer_mode} = "null" ]; then + continue + fi + if [ ${MODE} = "klquant_whole_infer" ] && [ ${infer_mode} != "kl_quant" ]; then + continue + fi + if [ ${MODE} = "whole_infer" ] && [ ${infer_mode} = "kl_quant" ]; then + continue + fi + # run export + case ${infer_mode} in + norm) run_export=${norm_export} ;; + pact) run_export=${pact_export} ;; + fpgm) run_export=${fpgm_export} ;; + distill) run_export=${distill_export} ;; + kl_quant) run_export=${kl_quant_export} ;; + *) echo "Undefined infer_mode!"; exit 1; + esac + set_export_weight=$(func_set_params "${export_weight_key}" "${export_weight_value}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_export_value}") + set_filename=$(func_set_params "filename" "${model_name}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_save_export_dir} " + echo $export_cmd + eval $export_cmd + status_check $? 
"${export_cmd}" "${status_log}" "${model_name}" + + #run inference + save_export_model_dir="${save_export_value}/${model_name}" + is_quant=${infer_quant_flag[Count]} + func_inference "${python}" "${inference_py}" "${save_export_model_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} "{gpu}" + Count=$((${Count} + 1)) + done +else + IFS="|" + Count=0 + for gpu in ${gpu_list[*]}; do + use_gpu=${train_use_gpu_value} + Count=$((${Count} + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + use_gpu=False + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + eval ${env} + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + for trainer in ${trainer_list[*]}; do + flag_quant=False + set_to_static="" + if [ ${trainer} = "${norm_key}" ]; then + run_train=${norm_trainer} + run_export=${norm_export} + elif [ ${trainer} = "${pact_key}" ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = "${trainer_key1}" ]; then + run_train=${trainer_value1} + run_export=${export_value1} + elif [ ${trainer} = "${trainer_key2}" ]; then + run_train=${trainer_value2} + run_export=${export_value2} + elif [ ${trainer} = "${to_static_key}" ]; then + run_train=${norm_trainer} + run_export=${norm_export} + set_to_static=${to_static_trainer} + else + continue + fi + + if [ ${run_train} = "null" ]; then + continue + fi + + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_filename=$(func_set_params "filename" "${model_name}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${use_gpu}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + if [ ${autocast} = "amp" ] || [ ${autocast} = "fp16" ]; then + set_autocast="--amp" + set_amp_level="amp_level=O2" + else + set_autocast=" " + set_amp_level=" " + fi + if [ ${MODE} = "benchmark_train" ]; then + set_shuffle="TrainReader.shuffle=False" + set_enable_ce="--enable_ce=True" + else + set_shuffle=" " + set_enable_ce=" " + fi + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + nodes="1" + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} LearningRate.base_lr=0.0001 log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_shuffle} ${set_amp_level} ${set_enable_ce} ${set_autocast} ${set_to_static} ${set_train_params1}" + elif [ ${#ips} -le 15 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_shuffle} ${set_amp_level} ${set_enable_ce} ${set_autocast} ${set_to_static} ${set_train_params1}" + else # train with multi-machine + IFS="," + ips_array=(${ips}) + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" 
+ IFS="|" + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_shuffle} ${set_amp_level} ${set_enable_ce} ${set_autocast} ${set_to_static}${set_train_params1}" + fi + # run train + train_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}.log" + if [ ${MODE} = "benchmark_train" ]; then + eval "timeout 5m ${cmd} > ${train_log_path} 2>&1" + else + eval "${cmd} > ${train_log_path} 2>&1" + fi + last_status=$? + cat ${train_log_path} + status_check $last_status "${cmd}" "${status_log}" "${model_name}" "${train_log_path}" + + set_eval_trained_weight=$(func_set_params "${export_weight_key}" "${save_log}/${model_name}/${train_model_name}") + # run eval + if [ ${eval_py} != "null" ]; then + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log" + eval_cmd="${python} ${eval_py} ${set_eval_trained_weight} ${set_use_gpu} ${set_eval_params1}" + eval "${eval_cmd} > ${eval_log_path} 2>&1" + last_status=$? + cat ${eval_log_path} + status_check $last_status "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}" + fi + # run export model + if [ ${run_export} != "null" ]; then + save_export_model_dir="${save_log}/${model_name}" + set_export_weight=$(func_set_params "${export_weight_key}" "${save_log}/${model_name}/${train_model_name}") + set_save_export_dir=$(func_set_params "${save_export_key}" "${save_log}") + if [ ${export_onnx_key} = "export_onnx" ]; then + # run export onnx model for rcnn + export_log_path_onnx=${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_onnx_export.log + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} export_onnx=True ${set_save_export_dir} >${export_log_path_onnx} 2>&1" + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path_onnx}" + # copy model for inference benchmark + eval "cp ${save_export_model_dir}/* ${save_log}/" + fi + # run export model + export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_filename} ${set_save_export_dir} " + eval "${export_cmd} > ${export_log_path} 2>&1" + last_status=$? 
+ cat ${export_log_path} + status_check $last_status "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + + #run inference + if [ ${export_onnx_key} != "export_onnx" ]; then + # copy model for inference benchmark + eval "cp ${save_export_model_dir}/* ${save_log}/" + fi + eval $env + func_inference "${python}" "${inference_py}" "${save_export_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" "{gpu}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then diff --git a/test_tipc/test_train_inference_python_npu.sh b/test_tipc/test_train_inference_python_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..5b51ac7ac368e96076490ee3399e9974d3279042 --- /dev/null +++ b/test_tipc/test_train_inference_python_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +source test_tipc/utils_func.sh +function readlinkf() { + perl -MCwd -e 'print Cwd::abs_path shift' "$1"; +} +function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} +} +function func_parser_dir() { + strs=$1 + IFS="/" + array=(${strs}) + len=${#array[*]} + dir="" + count=1 + for arr in ${array[*]}; do + if [ ${len} = "${count}" ]; then + continue; + else + dir="${dir}/${arr}" + count=$((${count} + 1)) + fi + done + echo "${dir}" +} +BASEDIR=$(dirname "$0") +REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) +FILENAME=$1 + # change gpu to npu in tipc txt configs + sed -i "s/use_gpu:True/use_npu:True/g" $FILENAME + sed -i "s/--device:gpu|cpu/--device:npu|cpu/g" $FILENAME + sed -i "s/trainer:pact_train/trainer:norm_train/g" $FILENAME + sed -i "s/trainer:fpgm_train/trainer:norm_train/g" $FILENAME + sed -i "s/--slim_config _template_pact/ /g" $FILENAME + sed -i "s/--slim_config _template_fpgm/ /g" $FILENAME + sed -i "s/--slim_config _template_kl_quant/ /g" $FILENAME + sed -i 's/\"gpu\"/\"npu\"/g' test_tipc/test_train_inference_python.sh + + # parser params +dataline=`cat $FILENAME` +IFS=$'\n' +lines=(${dataline}) +# replace training config file +grep -n '.yml' $FILENAME | cut -d ":" -f 1 \ +| while read line_num ; do + train_cmd=$(func_parser_value "${lines[line_num-1]}") + trainer_config=$(func_parser_config ${train_cmd}) + echo ${trainer_config} + sed -i 's/use_gpu/use_npu/g' "$REPO_ROOT_PATH/$trainer_config" + # fine use_gpu in those included yaml + sub_datalinee=`cat $REPO_ROOT_PATH/$trainer_config` + IFS=$'\n' + sub_lines=(${sub_datalinee}) + grep -n '.yml' "$REPO_ROOT_PATH/$trainer_config" | cut -d ":" -f 1 \ + | while read sub_line_num; do + sub_config=${sub_lines[sub_line_num-1]} + dst=${#sub_config}-5 + sub_path=$(func_parser_dir "${trainer_config}") + sub_config_path="${REPO_ROOT_PATH}${sub_path}/${sub_config:3:${dst}}" + echo ${sub_config_path} + sed -i 's/use_gpu/use_npu/g' "$sub_config_path" + done +done +# pass parameters to test_train_inference_python.sh +cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2" +echo $cmd +eval $cmd diff --git a/test_tipc/test_train_inference_python_xpu.sh b/test_tipc/test_train_inference_python_xpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..b020377f1e7712fde129e7362b8807f94a4c2e35 --- /dev/null +++ b/test_tipc/test_train_inference_python_xpu.sh @@ -0,0 +1,79 @@ +#!/bin/bash + source test_tipc/utils_func.sh + + function readlinkf() { + perl -MCwd -e 'print Cwd::abs_path shift' "$1"; + } + + 
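+ # func_parser_config extracts the config path from a parsed trainer command (the third
+ # whitespace-separated token, i.e. the -c argument, assuming the usual "<script> -c <yml> -o ..."
+ # layout) and func_parser_dir returns its directory; both are used below to rewrite use_gpu
+ # to use_xpu inside the referenced YAML files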
function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} + } + + function func_parser_dir() { + strs=$1 + IFS="/" + array=(${strs}) + len=${#array[*]} + dir="" + count=1 + for arr in ${array[*]}; do + if [ ${len} = "${count}" ]; then + continue; + else + dir="${dir}/${arr}" + count=$((${count} + 1)) + fi + done + echo "${dir}" + } + + BASEDIR=$(dirname "$0") + REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) + + FILENAME=$1 + + # change gpu to xpu in tipc txt configs + sed -i "s/use_gpu:True/use_xpu:True/g" $FILENAME + sed -i "s/--device:gpu|cpu/--device:xpu|cpu/g" $FILENAME + sed -i "s/trainer:pact_train/trainer:norm_train/g" $FILENAME + sed -i "s/trainer:fpgm_train/trainer:norm_train/g" $FILENAME + sed -i "s/--slim_config _template_pact/ /g" $FILENAME + sed -i "s/--slim_config _template_fpgm/ /g" $FILENAME + sed -i "s/--slim_config _template_kl_quant/ /g" $FILENAME + sed -i 's/\"gpu\"/\"xpu\"/g' test_tipc/test_train_inference_python.sh + + # parser params + dataline=`cat $FILENAME` + IFS=$'\n' + lines=(${dataline}) + + # replace training config file + grep -n '.yml' $FILENAME | cut -d ":" -f 1 \ + | while read line_num ; do + train_cmd=$(func_parser_value "${lines[line_num-1]}") + trainer_config=$(func_parser_config ${train_cmd}) + echo ${trainer_config} + sed -i 's/use_gpu/use_xpu/g' "$REPO_ROOT_PATH/$trainer_config" + # fine use_gpu in those included yaml + sub_datalinee=`cat $REPO_ROOT_PATH/$trainer_config` + IFS=$'\n' + sub_lines=(${sub_datalinee}) + grep -n '.yml' "$REPO_ROOT_PATH/$trainer_config" | cut -d ":" -f 1 \ + | while read sub_line_num; do + sub_config=${sub_lines[sub_line_num-1]} + dst=${#sub_config}-5 + sub_path=$(func_parser_dir "${trainer_config}") + sub_config_path="${REPO_ROOT_PATH}${sub_path}/${sub_config:3:${dst}}" + echo ${sub_config_path} + sed -i 's/use_gpu/use_xpu/g' "$sub_config_path" + done + done + + # pass parameters to test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2" + echo $cmd + eval $cmd \ No newline at end of file diff --git a/test_tipc/utils_func.sh b/test_tipc/utils_func.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f52f34ccb404b86794ec826df78a2756a746b5f --- /dev/null +++ b/test_tipc/utils_func.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + echo ${strs%%:*} +} + +function func_parser_value(){ + strs=$1 + echo ${strs#*:} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + model_name=$4 + log_path=$5 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + fi +} diff --git a/tools/anchor_cluster.py b/tools/anchor_cluster.py new file mode 
100644 index 0000000000000000000000000000000000000000..e892d403090e6569e16d9548c00841368b427793 --- /dev/null +++ b/tools/anchor_cluster.py @@ -0,0 +1,249 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.anchor_cluster') + +from scipy.cluster.vq import kmeans +import numpy as np +from tqdm import tqdm + +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.core.workspace import load_config, merge_config + + +class BaseAnchorCluster(object): + def __init__(self, n, cache_path, cache, verbose=True): + """ + Base Anchor Cluster + + Args: + n (int): number of clusters + cache_path (str): cache directory path + cache (bool): whether using cache + verbose (bool): whether print results + """ + super(BaseAnchorCluster, self).__init__() + self.n = n + self.cache_path = cache_path + self.cache = cache + self.verbose = verbose + + def print_result(self, centers): + raise NotImplementedError('%s.print_result is not available' % + self.__class__.__name__) + + def get_whs(self): + whs_cache_path = os.path.join(self.cache_path, 'whs.npy') + shapes_cache_path = os.path.join(self.cache_path, 'shapes.npy') + if self.cache and os.path.exists(whs_cache_path) and os.path.exists( + shapes_cache_path): + self.whs = np.load(whs_cache_path) + self.shapes = np.load(shapes_cache_path) + return self.whs, self.shapes + whs = np.zeros((0, 2)) + shapes = np.zeros((0, 2)) + self.dataset.parse_dataset() + roidbs = self.dataset.roidbs + for rec in tqdm(roidbs): + h, w = rec['h'], rec['w'] + bbox = rec['gt_bbox'] + wh = bbox[:, 2:4] - bbox[:, 0:2] + 1 + wh = wh / np.array([[w, h]]) + shape = np.ones_like(wh) * np.array([[w, h]]) + whs = np.vstack((whs, wh)) + shapes = np.vstack((shapes, shape)) + + if self.cache: + os.makedirs(self.cache_path, exist_ok=True) + np.save(whs_cache_path, whs) + np.save(shapes_cache_path, shapes) + + self.whs = whs + self.shapes = shapes + return self.whs, self.shapes + + def calc_anchors(self): + raise NotImplementedError('%s.calc_anchors is not available' % + self.__class__.__name__) + + def __call__(self): + self.get_whs() + centers = self.calc_anchors() + if self.verbose: + self.print_result(centers) + return centers + + +class YOLOv2AnchorCluster(BaseAnchorCluster): + def __init__(self, + n, + dataset, + size, + cache_path, + cache, + iters=1000, + verbose=True): + super(YOLOv2AnchorCluster, self).__init__( + n, cache_path, cache, verbose=verbose) + """ + YOLOv2 Anchor Cluster + + The code is based on https://github.com/AlexeyAB/darknet/blob/master/scripts/gen_anchors.py + + Args: + n (int): number 
of clusters + dataset (DataSet): DataSet instance, VOC or COCO + size (list): [w, h] + cache_path (str): cache directory path + cache (bool): whether using cache + iters (int): kmeans algorithm iters + verbose (bool): whether print results + """ + self.dataset = dataset + self.size = size + self.iters = iters + + def print_result(self, centers): + logger.info('%d anchor cluster result: [w, h]' % self.n) + for w, h in centers: + logger.info('[%d, %d]' % (round(w), round(h))) + + def metric(self, whs, centers): + wh1 = whs[:, None] + wh2 = centers[None] + inter = np.minimum(wh1, wh2).prod(2) + return inter / (wh1.prod(2) + wh2.prod(2) - inter) + + def kmeans_expectation(self, whs, centers, assignments): + dist = self.metric(whs, centers) + new_assignments = dist.argmax(1) + converged = (new_assignments == assignments).all() + return converged, new_assignments + + def kmeans_maximizations(self, whs, centers, assignments): + new_centers = np.zeros_like(centers) + for i in range(centers.shape[0]): + mask = (assignments == i) + if mask.sum(): + new_centers[i, :] = whs[mask].mean(0) + return new_centers + + def calc_anchors(self): + self.whs = self.whs * np.array([self.size]) + # random select k centers + whs, n, iters = self.whs, self.n, self.iters + logger.info('Running kmeans for %d anchors on %d points...' % + (n, len(whs))) + idx = np.random.choice(whs.shape[0], size=n, replace=False) + centers = whs[idx] + assignments = np.zeros(whs.shape[0:1]) * -1 + # kmeans + if n == 1: + return self.kmeans_maximizations(whs, centers, assignments) + + pbar = tqdm(range(iters), desc='Cluster anchors with k-means algorithm') + for _ in pbar: + # E step + converged, assignments = self.kmeans_expectation(whs, centers, + assignments) + if converged: + logger.info('kmeans algorithm has converged') + break + # M step + centers = self.kmeans_maximizations(whs, centers, assignments) + ious = self.metric(whs, centers) + pbar.desc = 'avg_iou: %.4f' % (ious.max(1).mean()) + + centers = sorted(centers, key=lambda x: x[0] * x[1]) + return centers + + +def main(): + parser = ArgsParser() + parser.add_argument( + '--n', '-n', default=9, type=int, help='num of clusters') + parser.add_argument( + '--iters', + '-i', + default=1000, + type=int, + help='num of iterations for kmeans') + parser.add_argument( + '--verbose', '-v', default=True, type=bool, help='whether print result') + parser.add_argument( + '--size', + '-s', + default=None, + type=str, + help='image size: w,h, using comma as delimiter') + parser.add_argument( + '--method', + '-m', + default='v2', + type=str, + help='cluster method, v2 is only supported now') + parser.add_argument( + '--cache_path', default='cache', type=str, help='cache path') + parser.add_argument( + '--cache', action='store_true', help='whether use cache') + FLAGS = parser.parse_args() + + cfg = load_config(FLAGS.config) + merge_config(FLAGS.opt) + check_config(cfg) + # check if set use_gpu=True in paddlepaddle cpu version + if 'use_gpu' not in cfg: + cfg.use_gpu = False + check_gpu(cfg.use_gpu) + # check if paddlepaddle version is satisfied + check_version('develop') + + # get dataset + dataset = cfg['TrainDataset'] + if FLAGS.size: + if ',' in FLAGS.size: + size = list(map(int, FLAGS.size.split(','))) + assert len(size) == 2, "the format of size is incorrect" + else: + size = int(FLAGS.size) + size = [size, size] + elif 'inputs_def' in cfg['TestReader'] and 'image_shape' in cfg[ + 'TestReader']['inputs_def']: + size = cfg['TestReader']['inputs_def']['image_shape'][1:] + else: + raise 
ValueError('size is not specified') + + if FLAGS.method == 'v2': + cluster = YOLOv2AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path, + FLAGS.cache, FLAGS.iters, FLAGS.verbose) + else: + raise ValueError('cluster method: %s is not supported' % FLAGS.method) + + anchors = cluster() + + +if __name__ == "__main__": + main() diff --git a/tools/box_distribution.py b/tools/box_distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..f7979ecc125f4f35f7256b6219707d04218210af --- /dev/null +++ b/tools/box_distribution.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import matplotlib.pyplot as plt +import json +import numpy as np +import argparse +from pycocotools.coco import COCO +from tqdm import tqdm + + +def median(data): + data.sort() + mid = len(data) // 2 + median = (data[mid] + data[~mid]) / 2 + return median + + +def draw_distribution(width, height, out_path): + w_bins = int((max(width) - min(width)) // 10) + h_bins = int((max(height) - min(height)) // 10) + plt.figure() + plt.subplot(221) + plt.hist(width, bins=w_bins, color='green') + plt.xlabel('Width rate *1000') + plt.ylabel('number') + plt.title('Distribution of Width') + plt.subplot(222) + plt.hist(height, bins=h_bins, color='blue') + plt.xlabel('Height rate *1000') + plt.title('Distribution of Height') + plt.savefig(out_path) + print(f'Distribution saved as {out_path}') + plt.show() + + +def get_ratio_infos(jsonfile, out_img, eval_size, small_stride): + coco = COCO(annotation_file=jsonfile) + allannjson = json.load(open(jsonfile, 'r')) + be_im_id = allannjson['annotations'][0]['image_id'] + be_im_w = [] + be_im_h = [] + ratio_w = [] + ratio_h = [] + im_wid,im_hei=[],[] + for ann in tqdm(allannjson['annotations']): + if ann['iscrowd']: + continue + x0, y0, w, h = ann['bbox'][:] + if be_im_id == ann['image_id']: + be_im_w.append(w) + be_im_h.append(h) + else: + im_w = coco.imgs[be_im_id]['width'] + im_h = coco.imgs[be_im_id]['height'] + im_wid.append(im_w) + im_hei.append(im_h) + im_m_w = np.mean(be_im_w) + im_m_h = np.mean(be_im_h) + dis_w = im_m_w / im_w + dis_h = im_m_h / im_h + ratio_w.append(dis_w) + ratio_h.append(dis_h) + be_im_id = ann['image_id'] + be_im_w = [w] + be_im_h = [h] + + + im_w = coco.imgs[be_im_id]['width'] + im_h = coco.imgs[be_im_id]['height'] + im_wid.append(im_w) + im_hei.append(im_h) + all_im_m_w = np.mean(im_wid) + all_im_m_h = np.mean(im_hei) + + + im_m_w = np.mean(be_im_w) + im_m_h = np.mean(be_im_h) + dis_w = im_m_w / im_w + dis_h = im_m_h / im_h + ratio_w.append(dis_w) + ratio_h.append(dis_h) + mid_w = median(ratio_w) + mid_h = median(ratio_h) + + reg_ratio = [] + ratio_all = ratio_h + ratio_w + for r in ratio_all: + if r < 0.2: + reg_ratio.append(r) + elif r < 0.4: + reg_ratio.append(r/2) + else: + reg_ratio.append(r/4) + reg_ratio = sorted(reg_ratio) + max_ratio = reg_ratio[int(0.95*len(reg_ratio))] + reg_max = round(max_ratio*eval_size/small_stride) + + ratio_w = [i * 1000 for i in 
ratio_w] + ratio_h = [i * 1000 for i in ratio_h] + print(f'Suggested reg_range[1] is {reg_max+1}') + print(f'Mean of all img_w is {all_im_m_w}') + print(f'Mean of all img_h is {all_im_m_h}') + print(f'Median of ratio_w is {mid_w}') + print(f'Median of ratio_h is {mid_h}') + print('all_img with box: ', len(ratio_h)) + print('all_ann: ', len(allannjson['annotations'])) + draw_distribution(ratio_w, ratio_h, out_img) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--json_path', type=str, default=None, help="Dataset json path.") + parser.add_argument( + '--eval_size', type=int, default=640, help="eval size.") + parser.add_argument( + '--small_stride', type=int, default=8, help="smallest stride.") + parser.add_argument( + '--out_img', + type=str, + default='box_distribution.jpg', + help="Name of the output distribution image.") + args = parser.parse_args() + + get_ratio_infos(args.json_path, args.out_img, args.eval_size, args.small_stride) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/eval.py b/tools/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..40cbbecd8b579c7b967169e632be77c9293715a3 --- /dev/null +++ b/tools/eval.py @@ -0,0 +1,203 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
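+# tools/eval.py: evaluation entry point. It builds a Trainer in 'eval' mode, loads cfg.weights and runs (optionally sliced) evaluation; with --json_eval it re-evaluates already generated bbox/mask json results from --output_eval instead.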
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import create, load_config, merge_config +from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config +from ppdet.utils.cli import ArgsParser, merge_args +from ppdet.engine import Trainer, init_parallel_env +from ppdet.metrics.coco_utils import json_eval_results +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('eval') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + + parser.add_argument( + '--json_eval', + action='store_true', + default=False, + help='Whether to re eval with already exists bbox.json or mask.json') + + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + + # TODO: bias should be unified + parser.add_argument( + "--bias", + action="store_true", + help="whether add bias or not while getting w and h") + + parser.add_argument( + "--classwise", + action="store_true", + help="whether per-category AP and draw P-R Curve or not.") + + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + + parser.add_argument( + "--amp", + action='store_true', + default=False, + help="Enable auto mixed precision eval.") + + # for smalldet slice_infer + parser.add_argument( + "--slice_infer", + action='store_true', + help="Whether to slice the image and merge the inference results for small object detection." + ) + parser.add_argument( + '--slice_size', + nargs='+', + type=int, + default=[640, 640], + help="Height of the sliced image.") + parser.add_argument( + "--overlap_ratio", + nargs='+', + type=float, + default=[0.25, 0.25], + help="Overlap height ratio of the sliced image.") + parser.add_argument( + "--combine_method", + type=str, + default='nms', + help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." + ) + parser.add_argument( + "--match_threshold", + type=float, + default=0.6, + help="Combine method matching threshold.") + parser.add_argument( + "--match_metric", + type=str, + default='ios', + help="Combine method matching metric, choose in ['iou', 'ios'].") + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + if FLAGS.json_eval: + logger.info( + "In json_eval mode, PaddleDetection will evaluate json files in " + "output_eval directly. 
And proposal.json, bbox.json and mask.json " + "will be detected by default.") + json_eval_results( + cfg.metric, + json_directory=FLAGS.output_eval, + dataset=create('EvalDataset')()) + return + + # init parallel environment if nranks > 1 + init_parallel_env() + + # build trainer + trainer = Trainer(cfg, mode='eval') + + # load weights + trainer.load_weights(cfg.weights) + + # training + if FLAGS.slice_infer: + trainer.evaluate_slice( + slice_size=FLAGS.slice_size, + overlap_ratio=FLAGS.overlap_ratio, + combine_method=FLAGS.combine_method, + match_threshold=FLAGS.match_threshold, + match_metric=FLAGS.match_metric) + else: + trainer.evaluate() + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_args(cfg, FLAGS) + merge_config(FLAGS.opt) + + # disable npu in config by default + if 'use_npu' not in cfg: + cfg.use_npu = False + + # disable xpu in config by default + if 'use_xpu' not in cfg: + cfg.use_xpu = False + + if 'use_gpu' not in cfg: + cfg.use_gpu = False + + # disable mlu in config by default + if 'use_mlu' not in cfg: + cfg.use_mlu = False + + if cfg.use_gpu: + place = paddle.set_device('gpu') + elif cfg.use_npu: + place = paddle.set_device('npu') + elif cfg.use_xpu: + place = paddle.set_device('xpu') + elif cfg.use_mlu: + place = paddle.set_device('mlu') + else: + place = paddle.set_device('cpu') + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='eval') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_npu(cfg.use_npu) + check_xpu(cfg.use_xpu) + check_mlu(cfg.use_mlu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/export_model.py b/tools/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cf7141189906372dfb1d75da2b768e69275e6a14 --- /dev/null +++ b/tools/export_model.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
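+# tools/export_model.py: builds a Trainer in 'test' mode, loads cfg.weights and exports an inference model to --output_dir; with --export_serving_model it additionally converts the exported model for Paddle Serving.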
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('export_model') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_dir", + type=str, + default="output_inference", + help="Directory for storing the output model files.") + parser.add_argument( + "--export_serving_model", + type=bool, + default=False, + help="Whether to export serving model or not.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # build detector + trainer = Trainer(cfg, mode='test') + + # load weights + trainer.load_weights(cfg.weights) + + # export model + trainer.export(FLAGS.output_dir) + + if FLAGS.export_serving_model: + from paddle_serving_client.io import inference_model_to_serving + model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0] + + inference_model_to_serving( + dirname="{}/{}".format(FLAGS.output_dir, model_name), + serving_server="{}/{}/serving_server".format(FLAGS.output_dir, + model_name), + serving_client="{}/{}/serving_client".format(FLAGS.output_dir, + model_name), + model_filename="model.pdmodel", + params_filename="model.pdiparams") + + +def main(): + paddle.set_device("cpu") + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_config(FLAGS.opt) + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') + + # FIXME: Temporarily solve the priority problem of FLAGS.opt + merge_config(FLAGS.opt) + check_config(cfg) + if 'use_gpu' not in cfg: + cfg.use_gpu = False + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/infer.py b/tools/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..65fb3b7253cd214ad757f3e62a1380b8218a92b0 --- /dev/null +++ b/tools/infer.py @@ -0,0 +1,236 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
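+# tools/infer.py: inference entry point. It loads weights, collects images from --infer_img or --infer_dir and runs Trainer.predict (or slice_predict for small-object slicing), writing visualizations and optional results to --output_dir.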
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') +import glob +import ast + +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.engine import Trainer +from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config +from ppdet.utils.cli import ArgsParser, merge_args +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--infer_dir", + type=str, + default=None, + help="Directory for images to perform inference on.") + parser.add_argument( + "--infer_img", + type=str, + default=None, + help="Image path, has higher priority over --infer_dir") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory for storing the output visualization files.") + parser.add_argument( + "--draw_threshold", + type=float, + default=0.5, + help="Threshold to reserve the result for visualization.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="Whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/image", + help='VisualDL logging directory for image.') + parser.add_argument( + "--save_results", + type=bool, + default=False, + help="Whether to save inference results to output_dir.") + parser.add_argument( + "--slice_infer", + action='store_true', + help="Whether to slice the image and merge the inference results for small object detection." + ) + parser.add_argument( + '--slice_size', + nargs='+', + type=int, + default=[640, 640], + help="Height of the sliced image.") + parser.add_argument( + "--overlap_ratio", + nargs='+', + type=float, + default=[0.25, 0.25], + help="Overlap height ratio of the sliced image.") + parser.add_argument( + "--combine_method", + type=str, + default='nms', + help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." 
+ ) + parser.add_argument( + "--match_threshold", + type=float, + default=0.6, + help="Combine method matching threshold.") + parser.add_argument( + "--match_metric", + type=str, + default='ios', + help="Combine method matching metric, choose in ['iou', 'ios'].") + parser.add_argument( + "--visualize", + type=ast.literal_eval, + default=True, + help="Whether to save visualize results to output_dir.") + args = parser.parse_args() + return args + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--infer_img or --infer_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + logger.info("Found {} inference images in total.".format(len(images))) + + return images + + +def run(FLAGS, cfg): + # build trainer + trainer = Trainer(cfg, mode='test') + + # load weights + trainer.load_weights(cfg.weights) + + # get inference images + images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) + + # inference + if FLAGS.slice_infer: + trainer.slice_predict( + images, + slice_size=FLAGS.slice_size, + overlap_ratio=FLAGS.overlap_ratio, + combine_method=FLAGS.combine_method, + match_threshold=FLAGS.match_threshold, + match_metric=FLAGS.match_metric, + draw_threshold=FLAGS.draw_threshold, + output_dir=FLAGS.output_dir, + save_results=FLAGS.save_results, + visualize=FLAGS.visualize) + else: + trainer.predict( + images, + draw_threshold=FLAGS.draw_threshold, + output_dir=FLAGS.output_dir, + save_results=FLAGS.save_results, + visualize=FLAGS.visualize) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_args(cfg, FLAGS) + merge_config(FLAGS.opt) + + # disable npu in config by default + if 'use_npu' not in cfg: + cfg.use_npu = False + + # disable xpu in config by default + if 'use_xpu' not in cfg: + cfg.use_xpu = False + + if 'use_gpu' not in cfg: + cfg.use_gpu = False + + # disable mlu in config by default + if 'use_mlu' not in cfg: + cfg.use_mlu = False + + if cfg.use_gpu: + place = paddle.set_device('gpu') + elif cfg.use_npu: + place = paddle.set_device('npu') + elif cfg.use_xpu: + place = paddle.set_device('xpu') + elif cfg.use_mlu: + place = paddle.set_device('mlu') + else: + place = paddle.set_device('cpu') + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_npu(cfg.use_npu) + check_xpu(cfg.use_xpu) + check_mlu(cfg.use_mlu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/post_quant.py b/tools/post_quant.py new file mode 100644 index 0000000000000000000000000000000000000000..77d5333ec9d77a96c863e52c4667e54ab499d3c0 --- /dev/null +++ b/tools/post_quant.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('post_quant') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_dir", + type=str, + default="output_inference", + help="Directory for storing the output model files.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # build detector + trainer = Trainer(cfg, mode='eval') + + # load weights + trainer.load_weights(cfg.weights) + + # post quant model + trainer.post_quant(FLAGS.output_dir) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + # TODO: to be refined in the future + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn': + FLAGS.opt['norm_type'] = 'bn' + merge_config(FLAGS.opt) + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') + + # FIXME: Temporarily solve the priority problem of FLAGS.opt + merge_config(FLAGS.opt) + check_config(cfg) + if 'use_gpu' not in cfg: + cfg.use_gpu = False + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/slice_image.py b/tools/slice_image.py new file mode 100644 index 0000000000000000000000000000000000000000..f739d74244b0e4672a5b2ed3430f89b936f0bef5 --- /dev/null +++ b/tools/slice_image.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
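+# tools/slice_image.py: thin wrapper around sahi's slice_coco utility that cuts a COCO-format dataset into overlapping sub-images controlled by --slice_size and --overlap_ratio.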
+ +import argparse +from tqdm import tqdm + + +def slice_data(image_dir, dataset_json_path, output_dir, slice_size, + overlap_ratio): + try: + from sahi.scripts.slice_coco import slice + except Exception as e: + raise RuntimeError( + 'Unable to use sahi to slice images, please install sahi, for example: `pip install sahi`, see https://github.com/obss/sahi' + ) + tqdm.write( + f" slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}") + slice( + image_dir=image_dir, + dataset_json_path=dataset_json_path, + output_dir=output_dir, + slice_size=slice_size, + overlap_ratio=overlap_ratio, ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--image_dir', type=str, default=None, help="The image folder path.") + parser.add_argument( + '--json_path', type=str, default=None, help="Dataset json path.") + parser.add_argument( + '--output_dir', type=str, default=None, help="Output dir.") + parser.add_argument( + '--slice_size', type=int, default=500, help="slice_size") + parser.add_argument( + '--overlap_ratio', type=float, default=0.25, help="overlap_ratio") + args = parser.parse_args() + + slice_data(args.image_dir, args.json_path, args.output_dir, args.slice_size, + args.overlap_ratio) + + +if __name__ == "__main__": + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..9664111c46944b3833db8007f85ad086d00205a8 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,198 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config +from ppdet.engine import Trainer, init_parallel_env, set_random_seed, init_fleet_env +from ppdet.engine.trainer_ssod import Trainer_DenseTeacher +from ppdet.slim import build_slim_model + +from ppdet.utils.cli import ArgsParser, merge_args +import ppdet.utils.check as check +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--eval", + action='store_true', + default=False, + help="Whether to perform evaluation in train") + parser.add_argument( + "-r", "--resume", default=None, help="weights path for resume") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--enable_ce", + type=bool, + default=False, + help="If set True, enable continuous evaluation job." 
+ "This flag is only used for internal test.") + parser.add_argument( + "--amp", + action='store_true', + default=False, + help="Enable auto mixed precision training.") + parser.add_argument( + "--fleet", action='store_true', default=False, help="Use fleet or not") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/scalar", + help='VisualDL logging directory for scalar.') + parser.add_argument( + "--use_wandb", + type=bool, + default=False, + help="whether to record the data to wandb.") + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + parser.add_argument( + '--profiler_options', + type=str, + default=None, + help="The option of profiler, which should be in " + "format \"key1=value1;key2=value2;key3=value3\"." + "please see ppdet/utils/profiler.py for detail.") + parser.add_argument( + '--save_proposals', + action='store_true', + default=False, + help='Whether to save the train proposals') + parser.add_argument( + '--proposals_path', + type=str, + default="sniper/proposals.json", + help='Train proposals directory') + parser.add_argument( + "--to_static", + action='store_true', + default=False, + help="Enable dy2st to train.") + + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # init fleet environment + if cfg.fleet: + init_fleet_env(cfg.get('find_unused_parameters', False)) + else: + # init parallel environment if nranks > 1 + init_parallel_env() + + if FLAGS.enable_ce: + set_random_seed(0) + + # build trainer + ssod_method = cfg.get('ssod_method', None) + if ssod_method is not None: + if ssod_method == 'DenseTeacher': + trainer = Trainer_DenseTeacher(cfg, mode='train') + else: + raise ValueError( + "Semi-Supervised Object Detection only support DenseTeacher now." 
+ ) + else: + trainer = Trainer(cfg, mode='train') + + # load weights + if FLAGS.resume is not None: + trainer.resume_weights(FLAGS.resume) + elif 'pretrain_weights' in cfg and cfg.pretrain_weights: + trainer.load_weights(cfg.pretrain_weights) + + # training + trainer.train(FLAGS.eval) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_args(cfg, FLAGS) + merge_config(FLAGS.opt) + + # disable npu in config by default + if 'use_npu' not in cfg: + cfg.use_npu = False + + # disable xpu in config by default + if 'use_xpu' not in cfg: + cfg.use_xpu = False + + if 'use_gpu' not in cfg: + cfg.use_gpu = False + + # disable mlu in config by default + if 'use_mlu' not in cfg: + cfg.use_mlu = False + + if cfg.use_gpu: + place = paddle.set_device('gpu') + elif cfg.use_npu: + place = paddle.set_device('npu') + elif cfg.use_xpu: + place = paddle.set_device('xpu') + elif cfg.use_mlu: + place = paddle.set_device('mlu') + else: + place = paddle.set_device('cpu') + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config) + + # FIXME: Temporarily solve the priority problem of FLAGS.opt + merge_config(FLAGS.opt) + check.check_config(cfg) + check.check_gpu(cfg.use_gpu) + check.check_npu(cfg.use_npu) + check.check_xpu(cfg.use_xpu) + check.check_mlu(cfg.use_mlu) + check.check_version() + + run(FLAGS, cfg) + + +if __name__ == "__main__": + main() diff --git a/tools/x2coco.py b/tools/x2coco.py new file mode 100644 index 0000000000000000000000000000000000000000..78e8619b42edfa8343770b1bbf12991d8d4d326a --- /dev/null +++ b/tools/x2coco.py @@ -0,0 +1,542 @@ +#!/usr/bin/env python +# coding: utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
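+# tools/x2coco.py: converts voc, labelme, cityscape or widerface annotations into COCO-format json; for labelme/cityscape inputs it also splits the images into train/val/test subsets according to the given proportions.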
+ +import argparse +import glob +import json +import os +import os.path as osp +import shutil +import xml.etree.ElementTree as ET + +import numpy as np +import PIL.ImageDraw +from tqdm import tqdm +import cv2 + +label_to_num = {} +categories_list = [] +labels_list = [] + + +class MyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(MyEncoder, self).default(obj) + + +def images_labelme(data, num): + image = {} + image['height'] = data['imageHeight'] + image['width'] = data['imageWidth'] + image['id'] = num + 1 + if '\\' in data['imagePath']: + image['file_name'] = data['imagePath'].split('\\')[-1] + else: + image['file_name'] = data['imagePath'].split('/')[-1] + return image + + +def images_cityscape(data, num, img_file): + image = {} + image['height'] = data['imgHeight'] + image['width'] = data['imgWidth'] + image['id'] = num + 1 + image['file_name'] = img_file + return image + + +def categories(label, labels_list): + category = {} + category['supercategory'] = 'component' + category['id'] = len(labels_list) + 1 + category['name'] = label + return category + + +def annotations_rectangle(points, label, image_num, object_num, label_to_num): + annotation = {} + seg_points = np.asarray(points).copy() + seg_points[1, :] = np.asarray(points)[2, :] + seg_points[2, :] = np.asarray(points)[1, :] + annotation['segmentation'] = [list(seg_points.flatten())] + annotation['iscrowd'] = 0 + annotation['image_id'] = image_num + 1 + annotation['bbox'] = list( + map(float, [ + points[0][0], points[0][1], points[1][0] - points[0][0], points[1][ + 1] - points[0][1] + ])) + annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3] + annotation['category_id'] = label_to_num[label] + annotation['id'] = object_num + 1 + return annotation + + +def annotations_polygon(height, width, points, label, image_num, object_num, + label_to_num): + annotation = {} + annotation['segmentation'] = [list(np.asarray(points).flatten())] + annotation['iscrowd'] = 0 + annotation['image_id'] = image_num + 1 + annotation['bbox'] = list(map(float, get_bbox(height, width, points))) + annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3] + annotation['category_id'] = label_to_num[label] + annotation['id'] = object_num + 1 + return annotation + + +def get_bbox(height, width, points): + polygons = points + mask = np.zeros([height, width], dtype=np.uint8) + mask = PIL.Image.fromarray(mask) + xy = list(map(tuple, polygons)) + PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1) + mask = np.array(mask, dtype=bool) + index = np.argwhere(mask == 1) + rows = index[:, 0] + clos = index[:, 1] + left_top_r = np.min(rows) + left_top_c = np.min(clos) + right_bottom_r = np.max(rows) + right_bottom_c = np.max(clos) + return [ + left_top_c, left_top_r, right_bottom_c - left_top_c, + right_bottom_r - left_top_r + ] + + +def deal_json(ds_type, img_path, json_path): + data_coco = {} + images_list = [] + annotations_list = [] + image_num = -1 + object_num = -1 + for img_file in os.listdir(img_path): + img_label = os.path.splitext(img_file)[0] + if img_file.split('.')[ + -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']: + continue + label_file = osp.join(json_path, img_label + '.json') + print('Generating dataset from:', label_file) + image_num = image_num + 1 + with open(label_file) as f: + data = json.load(f) + if ds_type == 
'labelme': + images_list.append(images_labelme(data, image_num)) + elif ds_type == 'cityscape': + images_list.append(images_cityscape(data, image_num, img_file)) + if ds_type == 'labelme': + for shapes in data['shapes']: + object_num = object_num + 1 + label = shapes['label'] + if label not in labels_list: + categories_list.append(categories(label, labels_list)) + labels_list.append(label) + label_to_num[label] = len(labels_list) + p_type = shapes['shape_type'] + if p_type == 'polygon': + points = shapes['points'] + annotations_list.append( + annotations_polygon(data['imageHeight'], data[ + 'imageWidth'], points, label, image_num, + object_num, label_to_num)) + + if p_type == 'rectangle': + (x1, y1), (x2, y2) = shapes['points'] + x1, x2 = sorted([x1, x2]) + y1, y2 = sorted([y1, y2]) + points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]] + annotations_list.append( + annotations_rectangle(points, label, image_num, + object_num, label_to_num)) + elif ds_type == 'cityscape': + for shapes in data['objects']: + object_num = object_num + 1 + label = shapes['label'] + if label not in labels_list: + categories_list.append(categories(label, labels_list)) + labels_list.append(label) + label_to_num[label] = len(labels_list) + points = shapes['polygon'] + annotations_list.append( + annotations_polygon(data['imgHeight'], data[ + 'imgWidth'], points, label, image_num, object_num, + label_to_num)) + data_coco['images'] = images_list + data_coco['categories'] = categories_list + data_coco['annotations'] = annotations_list + return data_coco + + +def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path): + with open(labels_path, 'r') as f: + labels_str = f.read().split() + labels_ids = list(range(1, len(labels_str) + 1)) + + with open(ann_ids_path, 'r') as f: + ann_ids = [lin.strip().split(' ')[-1] for lin in f.readlines()] + + ann_paths = [] + for aid in ann_ids: + if aid.endswith('xml'): + ann_path = os.path.join(ann_dir_path, aid) + else: + ann_path = os.path.join(ann_dir_path, aid + '.xml') + ann_paths.append(ann_path) + + return dict(zip(labels_str, labels_ids)), ann_paths + + +def voc_get_image_info(annotation_root, im_id): + filename = annotation_root.findtext('filename') + assert filename is not None + img_name = os.path.basename(filename) + + size = annotation_root.find('size') + width = float(size.findtext('width')) + height = float(size.findtext('height')) + + image_info = { + 'file_name': filename, + 'height': height, + 'width': width, + 'id': im_id + } + return image_info + + +def voc_get_coco_annotation(obj, label2id): + label = obj.findtext('name') + assert label in label2id, "label is not in label2id." + category_id = label2id[label] + bndbox = obj.find('bndbox') + xmin = float(bndbox.findtext('xmin')) + ymin = float(bndbox.findtext('ymin')) + xmax = float(bndbox.findtext('xmax')) + ymax = float(bndbox.findtext('ymax')) + assert xmax > xmin and ymax > ymin, "Box size error." 
+ o_width = xmax - xmin + o_height = ymax - ymin + anno = { + 'area': o_width * o_height, + 'iscrowd': 0, + 'bbox': [xmin, ymin, o_width, o_height], + 'category_id': category_id, + 'ignore': 0, + } + return anno + + +def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file): + output_json_dict = { + "images": [], + "type": "instances", + "annotations": [], + "categories": [] + } + bnd_id = 1 # bounding box start id + im_id = 0 + print('Start converting !') + for a_path in tqdm(annotation_paths): + # Read annotation xml + ann_tree = ET.parse(a_path) + ann_root = ann_tree.getroot() + + img_info = voc_get_image_info(ann_root, im_id) + output_json_dict['images'].append(img_info) + + for obj in ann_root.findall('object'): + ann = voc_get_coco_annotation(obj=obj, label2id=label2id) + ann.update({'image_id': im_id, 'id': bnd_id}) + output_json_dict['annotations'].append(ann) + bnd_id = bnd_id + 1 + im_id += 1 + + for label, label_id in label2id.items(): + category_info = {'supercategory': 'none', 'id': label_id, 'name': label} + output_json_dict['categories'].append(category_info) + output_file = os.path.join(output_dir, output_file) + with open(output_file, 'w') as f: + output_json = json.dumps(output_json_dict) + f.write(output_json) + + +def widerface_to_cocojson(root_path): + train_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_train_bbx_gt.txt") + val_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_val_bbx_gt.txt") + train_img_dir = os.path.join(root_path, "WIDER_train", "images") + val_img_dir = os.path.join(root_path, "WIDER_val", "images") + assert train_gt_txt + assert val_gt_txt + assert train_img_dir + assert val_img_dir + save_path = os.path.join(root_path, "widerface_train.json") + widerface_convert(train_gt_txt, train_img_dir, save_path) + print("Wider Face train dataset converted successfully, the json path: {}".format(save_path)) + save_path = os.path.join(root_path, "widerface_val.json") + widerface_convert(val_gt_txt, val_img_dir, save_path) + print("Wider Face val dataset converted successfully, the json path: {}".format(save_path)) + + +def widerface_convert(gt_txt, img_dir, save_path): + output_json_dict = { + "images": [], + "type": "instances", + "annotations": [], + "categories": [{'supercategory': 'none', 'id': 0, 'name': "human_face"}] + } + bnd_id = 1 # bounding box start id + im_id = 0 + print('Start converting !') + with open(gt_txt) as fd: + lines = fd.readlines() + + i = 0 + while i < len(lines): + image_name = lines[i].strip() + bbox_num = int(lines[i + 1].strip()) + i += 2 + img_info = get_widerface_image_info(img_dir, image_name, im_id) + if img_info: + output_json_dict["images"].append(img_info) + for j in range(i, i + bbox_num): + anno = get_widerface_ann_info(lines[j]) + anno.update({'image_id': im_id, 'id': bnd_id}) + output_json_dict['annotations'].append(anno) + bnd_id += 1 + else: + print("The image does not exist: {}".format(os.path.join(img_dir, image_name))) + bbox_num = 1 if bbox_num == 0 else bbox_num + i += bbox_num + im_id += 1 + with open(save_path, 'w') as f: + output_json = json.dumps(output_json_dict) + f.write(output_json) + + +def get_widerface_image_info(img_root, img_relative_path, img_id): + image_info = {} + save_path = os.path.join(img_root, img_relative_path) + if os.path.exists(save_path): + img = cv2.imread(save_path) + image_info["file_name"] = os.path.join(os.path.basename( + os.path.dirname(img_root)), os.path.basename(img_root), + img_relative_path) + image_info["height"] = img.shape[0] + 
image_info["width"] = img.shape[1] + image_info["id"] = img_id + return image_info + + +def get_widerface_ann_info(info): + info = [int(x) for x in info.strip().split()] + anno = { + 'area': info[2] * info[3], + 'iscrowd': 0, + 'bbox': [info[0], info[1], info[2], info[3]], + 'category_id': 0, + 'ignore': 0, + 'blur': info[4], + 'expression': info[5], + 'illumination': info[6], + 'invalid': info[7], + 'occlusion': info[8], + 'pose': info[9] + } + return anno + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--dataset_type', + help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`') + parser.add_argument('--json_input_dir', help='input annotated directory') + parser.add_argument('--image_input_dir', help='image directory') + parser.add_argument( + '--output_dir', help='output dataset directory', default='./') + parser.add_argument( + '--train_proportion', + help='the proportion of train dataset', + type=float, + default=1.0) + parser.add_argument( + '--val_proportion', + help='the proportion of validation dataset', + type=float, + default=0.0) + parser.add_argument( + '--test_proportion', + help='the proportion of test dataset', + type=float, + default=0.0) + parser.add_argument( + '--voc_anno_dir', + help='In Voc format dataset, path to annotation files directory.', + type=str, + default=None) + parser.add_argument( + '--voc_anno_list', + help='In Voc format dataset, path to annotation files ids list.', + type=str, + default=None) + parser.add_argument( + '--voc_label_list', + help='In Voc format dataset, path to label list. The content of each line is a category.', + type=str, + default=None) + parser.add_argument( + '--voc_out_name', + type=str, + default='voc.json', + help='In Voc format dataset, path to output json file') + parser.add_argument( + '--widerface_root_dir', + help='The root_path for wider face dataset, which contains `wider_face_split`, `WIDER_train` and `WIDER_val`.And the json file will save in this path', + type=str, + default=None) + args = parser.parse_args() + try: + assert args.dataset_type in ['voc', 'labelme', 'cityscape', 'widerface'] + except AssertionError as e: + print( + 'Now only support the voc, cityscape dataset and labelme dataset!!') + os._exit(0) + + if args.dataset_type == 'voc': + assert args.voc_anno_dir and args.voc_anno_list and args.voc_label_list + label2id, ann_paths = voc_get_label_anno( + args.voc_anno_dir, args.voc_anno_list, args.voc_label_list) + voc_xmls_to_cocojson( + annotation_paths=ann_paths, + label2id=label2id, + output_dir=args.output_dir, + output_file=args.voc_out_name) + elif args.dataset_type == "widerface": + assert args.widerface_root_dir + widerface_to_cocojson(args.widerface_root_dir) + else: + try: + assert os.path.exists(args.json_input_dir) + except AssertionError as e: + print('The json folder does not exist!') + os._exit(0) + try: + assert os.path.exists(args.image_input_dir) + except AssertionError as e: + print('The image folder does not exist!') + os._exit(0) + try: + assert abs(args.train_proportion + args.val_proportion \ + + args.test_proportion - 1.0) < 1e-5 + except AssertionError as e: + print( + 'The sum of pqoportion of training, validation and test datase must be 1!' + ) + os._exit(0) + + # Allocate the dataset. 
+ total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json'))) + if args.train_proportion != 0: + train_num = int(total_num * args.train_proportion) + out_dir = args.output_dir + '/train' + if not os.path.exists(out_dir): + os.makedirs(out_dir) + else: + train_num = 0 + if args.val_proportion == 0.0: + val_num = 0 + test_num = total_num - train_num + out_dir = args.output_dir + '/test' + if args.test_proportion != 0.0 and not os.path.exists(out_dir): + os.makedirs(out_dir) + else: + val_num = int(total_num * args.val_proportion) + test_num = total_num - train_num - val_num + val_out_dir = args.output_dir + '/val' + if not os.path.exists(val_out_dir): + os.makedirs(val_out_dir) + test_out_dir = args.output_dir + '/test' + if args.test_proportion != 0.0 and not os.path.exists(test_out_dir): + os.makedirs(test_out_dir) + count = 1 + for img_name in os.listdir(args.image_input_dir): + if count <= train_num: + if osp.exists(args.output_dir + '/train/'): + shutil.copyfile( + osp.join(args.image_input_dir, img_name), + osp.join(args.output_dir + '/train/', img_name)) + else: + if count <= train_num + val_num: + if osp.exists(args.output_dir + '/val/'): + shutil.copyfile( + osp.join(args.image_input_dir, img_name), + osp.join(args.output_dir + '/val/', img_name)) + else: + if osp.exists(args.output_dir + '/test/'): + shutil.copyfile( + osp.join(args.image_input_dir, img_name), + osp.join(args.output_dir + '/test/', img_name)) + count = count + 1 + + # Deal with the json files. + if not os.path.exists(args.output_dir + '/annotations'): + os.makedirs(args.output_dir + '/annotations') + if args.train_proportion != 0: + train_data_coco = deal_json(args.dataset_type, + args.output_dir + '/train', + args.json_input_dir) + train_json_path = osp.join(args.output_dir + '/annotations', + 'instance_train.json') + json.dump( + train_data_coco, + open(train_json_path, 'w'), + indent=4, + cls=MyEncoder) + if args.val_proportion != 0: + val_data_coco = deal_json(args.dataset_type, + args.output_dir + '/val', + args.json_input_dir) + val_json_path = osp.join(args.output_dir + '/annotations', + 'instance_val.json') + json.dump( + val_data_coco, + open(val_json_path, 'w'), + indent=4, + cls=MyEncoder) + if args.test_proportion != 0: + test_data_coco = deal_json(args.dataset_type, + args.output_dir + '/test', + args.json_input_dir) + test_json_path = osp.join(args.output_dir + '/annotations', + 'instance_test.json') + json.dump( + test_data_coco, + open(test_json_path, 'w'), + indent=4, + cls=MyEncoder) + + +if __name__ == '__main__': + main()
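Note: the commands below are a usage sketch for the dataset tools added in this patch, not part of the diff itself. The flags mirror the argparse definitions above; the dataset directories and the YOLOv3 config path are placeholder examples, and the -c/--config option is assumed to be provided by ppdet's ArgsParser.
# Convert a labelme-format dataset to COCO json with an 80/10/10 train/val/test split (paths are placeholders)
python tools/x2coco.py --dataset_type labelme --json_input_dir ./labelme_annos --image_input_dir ./labelme_imgs --output_dir ./coco_out --train_proportion 0.8 --val_proportion 0.1 --test_proportion 0.1
# Cluster 9 anchors on the configured TrainDataset for a 608x608 input (config path is an example)
python tools/anchor_cluster.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -n 9 -s 608 -m v2 -i 1000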