Commit abaad570 authored by limm

add demo module

parent a64900f3
import mmdeploy.Segmentor;
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
/** @description: This is a class for the ImageSegmentation Java demo. */
public class ImageSegmentation {
/** The main function for ImageSegmentation Java demo.
* @param deviceName: the device name of the demo.
* @param modelPath: the image segmentation model path.
* @param imagePath: the image path.
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 3) {
System.out.println("usage:\njava ImageSegmentation deviceName modelPath imagePath");
return;
}
String deviceName = args[0];
String modelPath = args[1];
String imagePath = args[2];
// create segmentor
Segmentor segmentor = null;
try {
segmentor = new Segmentor(modelPath, deviceName, 0);
// load image
Mat img = Utils.loadImage(imagePath);
// apply segmentor
Segmentor.Result[] result = segmentor.apply(img);
// print results
for (Segmentor.Result value : result) {
System.out.printf("mask height=%d, width=%d\n", value.height, value.width);
}
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release segmentor
if (segmentor != null) {
segmentor.release();
}
}
}
}
import mmdeploy.Detector;
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import javax.imageio.ImageIO;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.awt.Graphics;
import java.io.File;
import java.io.IOException;
/** @description: This is a class for the ObjectDetection Java demo. */
public class ObjectDetection {
/** The main function for ObjectDetection Java demo.
* @param deviceName: the device name of the demo.
* @param modelPath: the object detection model path.
* @param imagePath: the image path.
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 3) {
System.out.println("usage:\njava ObjectDetection deviceName modelPath imagePath");
return;
}
String deviceName = args[0];
String modelPath = args[1];
String imagePath = args[2];
// create detector
Detector detector = null;
try {
detector = new Detector(modelPath, deviceName, 0);
// load image
BufferedImage srcImg = ImageIO.read(new File(imagePath));
Mat img = Utils.bufferedImage2Mat(srcImg);
// apply detector
Detector.Result[] result = detector.apply(img);
// print results
Graphics ghandle = srcImg.createGraphics();
for (int i = 0; i < result.length; i++) {
Detector.Result value = result[i];
System.out.printf("box %d, left=%.2f, top=%.2f, right=%.2f, bottom=%.2f, label=%d, score=%.4f\n",
i, value.bbox.left, value.bbox.top, value.bbox.right, value.bbox.bottom, value.label_id, value.score);
if ((value.bbox.right - value.bbox.left) < 1 || (value.bbox.bottom - value.bbox.top) < 1) {
continue;
}
// skip detections below the score threshold
if (value.score < 0.3) {
continue;
}
if (value.mask != null) {
System.out.printf("mask %d, height=%d, width=%d\n", i, value.mask.shape[0], value.mask.shape[1]);
}
ghandle.setColor(new Color(0, 255, 0));
ghandle.drawRect((int)value.bbox.left, (int)value.bbox.top, (int)value.bbox.right - (int)value.bbox.left + 1, (int)value.bbox.bottom - (int)value.bbox.top + 1);
}
ghandle.dispose();
ImageIO.write(srcImg, "png", new File("output_detection.png"));
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release detector
if (detector != null) {
detector.release();
}
}
}
}
import mmdeploy.TextDetector;
import mmdeploy.TextRecognizer;
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
/** @description: This is a class for the Ocr Java demo. */
public class Ocr {
/** The main function for Ocr Java demo.
* @param deviceName: the device name of the demo.
* @param detModelPath: the text detection model path.
* @param recModelPath: the text recognition model path.
* @param imagePath: the image path.
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 4) {
System.out.println("usage:\njava Ocr deviceName detModelPath recModelPath imagePath");
return;
}
String deviceName = args[0];
String detModelPath = args[1];
String recModelPath = args[2];
String imagePath = args[3];
// create text detector and recognizer
TextDetector text_detector = null;
TextRecognizer text_recognizer = null;
try {
text_detector = new TextDetector(detModelPath, deviceName, 0);
text_recognizer = new TextRecognizer(recModelPath, deviceName, 0);
// load image
Mat img = Utils.loadImage(imagePath);
// apply text detector
TextDetector.Result[] detResult = text_detector.apply(img);
int [] detResultCount = {detResult.length};
TextRecognizer.Result[] recResult = text_recognizer.applyBbox(img, detResult, detResultCount);
if (recResult == null) {
System.out.println("Apply TextRecognizer failed.");
System.exit(1);
}
// print results
for (int i = 0; i < detResultCount[0]; ++i) {
System.out.printf("box[%d]: %s\n", i, new String(recResult[i].text));
for (int j = 0; j < 4; ++j) {
System.out.printf("x: %.2f, y: %.2f, ", detResult[i].bbox[j].x, detResult[i].bbox[j].y);
}
System.out.printf("\n");
}
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release text detector and recognizer
if (text_recognizer != null) {
text_recognizer.release();
}
if (text_detector != null) {
text_detector.release();
}
}
}
}
import mmdeploy.PoseDetector;
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
/** @description: This is a class for the PoseDetection Java demo. */
public class PoseDetection {
/** The main function for PoseDetection Java demo.
* @param deviceName: the device name of the demo.
* @param modelPath: the pose detection model path.
* @param imagePath: the image path.
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 3) {
System.out.println("usage:\njava PoseDetection deviceName modelPath imagePath");
return;
}
String deviceName = args[0];
String modelPath = args[1];
String imagePath = args[2];
// create pose estimator
PoseDetector poseEstimator = null;
try {
poseEstimator = new PoseDetector(modelPath, deviceName, 0);
// load image
Mat img = Utils.loadImage(imagePath);
// apply pose estimator
PoseDetector.Result[] result = poseEstimator.apply(img);
// print results
for (PoseDetector.Result value : result) {
for (int i = 0; i < value.point.length; i++) {
System.out.printf("point %d, x: %d, y: %d\n", i, (int)value.point[i].x, (int)value.point[i].y);
}
}
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release pose estimator
if (poseEstimator != null) {
poseEstimator.release();
}
}
}
}
import mmdeploy.PixelFormat;
import mmdeploy.PointF;
import mmdeploy.DataType;
import mmdeploy.Mat;
import mmdeploy.Model;
import mmdeploy.Device;
import mmdeploy.Context;
import mmdeploy.Profiler;
import mmdeploy.PoseTracker.*;
import org.opencv.videoio.*;
import org.opencv.core.*;
import org.opencv.imgproc.*;
import org.opencv.imgcodecs.*;
import org.opencv.highgui.*;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
import java.lang.Math;
/** @description: This is a class for the PoseTracker Java demo. */
public class PoseTracker {
static {
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
}
/** This function visualizes the PoseTracker results.
* @param frame: a video frame
* @param results: results of PoseTracker
* @param size: the target length of the frame's longer edge after resizing
* @param frameID: the frame index in the video
* @param withBbox: whether to draw the person bbox
* @return: true if no quit key was pressed, false otherwise
*/
public static boolean Visualize(org.opencv.core.Mat frame, mmdeploy.PoseTracker.Result[] results, int size,
int frameID, boolean withBbox) {
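        // The skeleton pairs below follow the 17-keypoint COCO convention;
        // each entry is an edge between two keypoint indices, and linkColor /
        // pointColor index into the palette.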
int skeleton[][] = {{15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12},
{5, 6}, {5, 7}, {6, 8}, {7, 9}, {8, 10}, {1, 2}, {0, 1},
{0, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}};
Scalar palette[] = {new Scalar(255, 128, 0), new Scalar(255, 153, 51), new Scalar(255, 178, 102),
new Scalar(230, 230, 0), new Scalar(255, 153, 255), new Scalar(153, 204, 255),
new Scalar(255, 102, 255), new Scalar(255, 51, 255), new Scalar(102, 178, 255),
new Scalar(51, 153, 255), new Scalar(255, 153, 153), new Scalar(255, 102, 102),
new Scalar(255, 51, 51), new Scalar(153, 255, 153), new Scalar(102, 255, 102),
new Scalar(51, 255, 51), new Scalar(0, 255, 0), new Scalar(0, 0, 255),
new Scalar(255, 0, 0), new Scalar(255, 255, 255)};
int linkColor[] = {
0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16
};
int pointColor[] = {16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0};
float scale = (float)size / (float)Math.max(frame.cols(), frame.rows());
if (scale != 1) {
Imgproc.resize(frame, frame, new Size(), scale, scale);
}
else {
frame = frame.clone();
}
for (int i = 0; i < results.length; i++) {
mmdeploy.PoseTracker.Result pt = results[i];
for (int j = 0; j < pt.keypoints.length; j++) {
PointF p = pt.keypoints[j];
p.x *= scale;
p.y *= scale;
pt.keypoints[j] = p;
}
float scoreThr = 0.5f;
int used[] = new int[pt.keypoints.length * 2];
for (int j = 0; j < skeleton.length; j++) {
int u = skeleton[j][0];
int v = skeleton[j][1];
if (pt.scores[u] > scoreThr && pt.scores[v] > scoreThr) {
used[u] = used[v] = 1;
Point pointU = new Point(pt.keypoints[u].x, pt.keypoints[u].y);
Point pointV = new Point(pt.keypoints[v].x, pt.keypoints[v].y);
Imgproc.line(frame, pointU, pointV, palette[linkColor[j]], 1);
}
}
for (int j = 0; j < pt.keypoints.length; j++) {
if (used[j] == 1) {
Point p = new Point(pt.keypoints[j].x, pt.keypoints[j].y);
Imgproc.circle(frame, p, 1, palette[pointColor[j]], 2);
}
}
if (withBbox) {
float bbox[] = {pt.bbox.left, pt.bbox.top, pt.bbox.right, pt.bbox.bottom};
for (int j = 0; j < 4; j++) {
bbox[j] *= scale;
}
Imgproc.rectangle(frame, new Point(bbox[0], bbox[1]),
new Point(bbox[2], bbox[3]), new Scalar(0, 255, 0));
}
}
HighGui.imshow("Pose Tracker", frame);
// Press any key to quit.
return HighGui.waitKey(5) == -1;
}
/** The main function for PoseTracker Java demo.
* @param deviceName: the device name of PoseTracker
* @param detModelPath: the person detection model path
* @param poseModelPath: the pose estimation model path
* @param videoPath: the video path
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 4) {
System.out.println("usage:\n-Dcommand needs deviceName detModel poseModel videoPath");
return;
}
String deviceName = args[0];
String detModelPath = args[1];
String poseModelPath = args[2];
String videoPath = args[3];
// create pose tracker
mmdeploy.PoseTracker poseTracker = null;
Model detModel = new Model(detModelPath);
Model poseModel = new Model(poseModelPath);
Device device = new Device(deviceName, 0);
if (detModel.handle() == -1 || poseModel.handle() == -1 || device.handle() == -1) {
System.out.println("failed to create model or device");
System.exit(1);
}
Context context = new Context();
context.add(device);
try {
poseTracker = new mmdeploy.PoseTracker(detModel, poseModel, context);
mmdeploy.PoseTracker.Params params = poseTracker.initParams();
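            // Assumed semantics of these tracker params: re-run the person
            // detector every detInterval frames and track at most
            // poseMaxNumBboxes people per frame.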
params.detInterval = 5;
params.poseMaxNumBboxes = 6;
long stateHandle = poseTracker.createState(params);
VideoCapture cap = new VideoCapture(videoPath);
if (!cap.isOpened()) {
System.out.printf("failed to open video: %s", videoPath);
System.exit(1);
}
int frameID = 0;
org.opencv.core.Mat frame = new org.opencv.core.Mat();
while (true) {
cap.read(frame);
System.out.printf("processing frame %d\n", frameID);
if (frame.empty()) {
break;
}
Mat mat = Utils.cvMatToMat(frame);
// process
mmdeploy.PoseTracker.Result[] result = poseTracker.apply(stateHandle, mat, -1);
// visualize
if (!Visualize(frame, result, 1280, frameID++, true)) {
break;
}
}
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release pose tracker
if (poseTracker != null) {
poseTracker.release();
}
System.exit(0);
}
}
}
import mmdeploy.RotatedDetector;
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import javax.imageio.ImageIO;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.awt.Graphics;
import java.io.File;
import java.io.IOException;
import java.lang.Math;
/** @description: This is a class for the RotatedDetection Java demo. */
public class RotatedDetection {
/** The main function for RotatedDetection Java demo.
* @param deviceName: the device name of the demo.
* @param modelPath: the rotated detection model path.
* @param imagePath: the image path.
*/
public static void main(String[] args) {
// Parse arguments
if (args.length != 3) {
System.out.println("usage:\njava RotatedDetection deviceName modelPath imagePath");
return;
}
String deviceName = args[0];
String modelPath = args[1];
String imagePath = args[2];
// create rotated detector
RotatedDetector rotatedDetector = null;
try {
rotatedDetector = new RotatedDetector(modelPath, deviceName, 0);
// load image
BufferedImage srcImg = ImageIO.read(new File(imagePath));
Mat img = Utils.bufferedImage2Mat(srcImg);
// apply rotated detector
RotatedDetector.Result[] result = rotatedDetector.apply(img);
// print results
Graphics ghandle = srcImg.createGraphics();
for (int i = 0; i < result.length; i++) {
RotatedDetector.Result value = result[i];
float cx, cy, w, h, angle;
cx = value.rbbox[0];
cy = value.rbbox[1];
w = value.rbbox[2];
h = value.rbbox[3];
angle = value.rbbox[4];
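                // (wx, wy) and (hx, hy) are half-extent vectors along the rotated
                // box's width and height axes; combining them with the center
                // (cx, cy) yields the four corner points printed below.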
float wx = w / 2 * (float)Math.cos(angle);
float wy = w / 2 * (float)Math.sin(angle);
float hx = -h / 2 * (float)Math.sin(angle);
float hy = h / 2 * (float)Math.cos(angle);
System.out.printf("box %d, score %.2f, point1: (%.2f, %.2f), point2: (%.2f, %.2f), point3: (%.2f, %.2f), point4: (%.2f, %.2f)\n",
i, value.score, cx - wx - hx, cy - wy - hy, cx + wx - hx, cy + wy - hy, cx + wx + hx, cy + wy + hy, cx - wx + hx, cy - wy + hy);
// skip rotated detections below the score threshold
if (value.score < 0.1) {
continue;
}
ghandle.setColor(new Color(0, 255, 0));
int[] polygonX = new int[] {(int)(cx - wx - hx), (int)(cx + wx - hx), (int)(cx + wx + hx), (int)(cx - wx + hx)};
int[] polygonY = new int[] {(int)(cy - wy - hy), (int)(cy + wy - hy), (int)(cy + wy + hy), (int)(cy - wy + hy)};
ghandle.drawPolygon(polygonX, polygonY, 4);
}
ghandle.dispose();
ImageIO.write(srcImg, "png", new File("output_rotated_detection.png"));
} catch (Exception e) {
System.out.println("exception: " + e.getMessage());
} finally {
// release rotated detector
if (rotatedDetector != null) {
rotatedDetector.release();
}
}
}
}
import mmdeploy.PixelFormat;
import mmdeploy.DataType;
import mmdeploy.Mat;
import org.opencv.core.*;
import org.opencv.imgcodecs.*;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
import java.lang.*;
/** @description: This is a utility class for the Java demos. */
public class Utils {
/** This function loads an image from the given path.
* @param path: the image path.
* @return: the image in Mat format.
* @exception IOException: thrown when the image fails to load.
*/
public static Mat loadImage(String path) throws IOException {
BufferedImage img = ImageIO.read(new File(path));
return bufferedImage2Mat(img);
}
/** This function converts a BufferedImage to an mmdeploy Mat.
* @param img: the BufferedImage.
* @return: the image in Mat format.
*/
public static Mat bufferedImage2Mat(BufferedImage img) {
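        // Note: ImageIO typically decodes color images as TYPE_3BYTE_BGR, so
        // the raw buffer below is in BGR byte order, matching PixelFormat.BGR.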
byte[] data = ((DataBufferByte) img.getData().getDataBuffer()).getData();
return new Mat(img.getHeight(), img.getWidth(), img.getColorModel().getNumComponents(),
PixelFormat.BGR, DataType.INT8, data);
}
/** This function converts an OpenCV Mat to an mmdeploy Mat.
* @param cvMat: the image in OpenCV Mat format.
* @return: the image in mmdeploy Mat format.
*/
public static Mat cvMatToMat(org.opencv.core.Mat cvMat)
{
        // elemSize() already includes the channel count, so rows * cols *
        // elemSize() is the full buffer size in bytes
        byte[] dataPointer = new byte[cvMat.rows() * cvMat.cols() * (int)cvMat.elemSize()];
cvMat.get(0, 0, dataPointer);
return new Mat(cvMat.rows(), cvMat.cols(), cvMat.channels(),
PixelFormat.BGR, DataType.INT8, dataPointer);
}
}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
import numpy as np
from mmdeploy_runtime import Detector, PoseDetector
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use SDK Python API')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'det_model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument(
'pose_model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('image_path', help='path of input image')
args = parser.parse_args()
return args
def visualize(frame, keypoints, filename, thr=0.5, resize=1280):
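    # skeleton/palette follow the 17-keypoint COCO convention: each skeleton
    # entry is an edge between two keypoint indices, and link_color /
    # point_color index into palette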
skeleton = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 12), (5, 11),
(6, 12), (5, 6), (5, 7), (6, 8), (7, 9), (8, 10), (1, 2),
(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6)]
palette = [(255, 128, 0), (255, 153, 51), (255, 178, 102), (230, 230, 0),
(255, 153, 255), (153, 204, 255), (255, 102, 255),
(255, 51, 255), (102, 178, 255),
(51, 153, 255), (255, 153, 153), (255, 102, 102), (255, 51, 51),
(153, 255, 153), (102, 255, 102), (51, 255, 51), (0, 255, 0),
(0, 0, 255), (255, 0, 0), (255, 255, 255)]
link_color = [
0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16
]
point_color = [16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]
scale = resize / max(frame.shape[0], frame.shape[1])
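    # keypoints has shape (num_bboxes, num_keypoints, 3): x, y, score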
scores = keypoints[..., 2]
keypoints = (keypoints[..., :2] * scale).astype(int)
img = cv2.resize(frame, (0, 0), fx=scale, fy=scale)
for kpts, score in zip(keypoints, scores):
show = [0] * len(kpts)
for (u, v), color in zip(skeleton, link_color):
if score[u] > thr and score[v] > thr:
cv2.line(img, kpts[u], tuple(kpts[v]), palette[color], 1,
cv2.LINE_AA)
show[u] = show[v] = 1
        for kpt, visible, color in zip(kpts, show, point_color):
            if visible:
                cv2.circle(img, kpt, 1, palette[color], 2, cv2.LINE_AA)
cv2.imwrite(filename, img)
def main():
args = parse_args()
# load image
img = cv2.imread(args.image_path)
# create object detector
detector = Detector(
model_path=args.det_model_path, device_name=args.device_name)
# create pose detector
pose_detector = PoseDetector(
model_path=args.pose_model_path, device_name=args.device_name)
# apply detector
bboxes, labels, _ = detector(img)
    # keep only person detections (label 0) with score > 0.6
keep = np.logical_and(labels == 0, bboxes[..., 4] > 0.6)
bboxes = bboxes[keep, :4]
# apply pose detector
poses = pose_detector(img, bboxes)
visualize(img, poses, 'det_pose_output.jpg', 0.5, 1280)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
from mmdeploy_runtime import Classifier
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
classifier = Classifier(
model_path=args.model_path, device_name=args.device_name, device_id=0)
result = classifier(img)
for label_id, score in result:
print(label_id, score)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
from mmdeploy_runtime import Restorer
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path', help='path of SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
restorer = Restorer(
model_path=args.model_path, device_name=args.device_name, device_id=0)
result = restorer(img)
# convert to BGR
result = result[..., ::-1]
cv2.imwrite('output_restorer.bmp', result)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
import numpy as np
from mmdeploy_runtime import Segmentor
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
args = parser.parse_args()
return args
def get_palette(num_classes=256):
state = np.random.get_state()
# random color
np.random.seed(42)
palette = np.random.randint(0, 256, size=(num_classes, 3))
np.random.set_state(state)
return [tuple(c) for c in palette]
def main():
args = parse_args()
img = cv2.imread(args.image_path)
segmentor = Segmentor(
model_path=args.model_path, device_name=args.device_name, device_id=0)
seg = segmentor(img)
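    # some models emit a float score map of shape (num_classes, H, W);
    # reduce it to an integer (H, W) label map before coloring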
if seg.dtype == np.float32:
seg = np.argmax(seg, axis=0)
palette = get_palette()
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]
img = img * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)
cv2.imwrite('output_segmentation.png', img)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import math
import cv2
from mmdeploy_runtime import Detector
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
detector = Detector(
model_path=args.model_path, device_name=args.device_name, device_id=0)
bboxes, labels, masks = detector(img)
    for index, (bbox, label_id) in enumerate(zip(bboxes, labels)):
[left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4]
if score < 0.3:
continue
cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0))
if masks[index].size:
mask = masks[index]
blue, green, red = cv2.split(img)
if mask.shape == img.shape[:2]: # rtmdet-inst
mask_img = blue
else: # maskrcnn
x0 = int(max(math.floor(bbox[0]) - 1, 0))
y0 = int(max(math.floor(bbox[1]) - 1, 0))
mask_img = blue[y0:y0 + mask.shape[0], x0:x0 + mask.shape[1]]
cv2.bitwise_or(mask, mask_img, mask_img)
img = cv2.merge([blue, green, red])
cv2.imwrite('output_detection.png', img)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
from mmdeploy_runtime import TextDetector, TextRecognizer
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument('image_path', help='path of an image')
parser.add_argument(
'--textdet',
default='',
        help='path of mmdeploy text-detector SDK model dumped by '
        'model converter',
)
parser.add_argument(
'--textrecog',
default='',
        help='path of mmdeploy text-recognizer SDK model dumped by '
        'model converter',
)
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
if args.textdet:
detector = TextDetector(
model_path=args.textdet, device_name=args.device_name, device_id=0)
bboxes = detector(img)
print(f'bboxes.shape={bboxes.shape}')
print(f'bboxes={bboxes}')
if len(bboxes) > 0:
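            # each detection stores 4 corner points as (x, y) pairs in
            # columns 0..7; +0.5 rounds them to the nearest pixel for drawing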
pts = ((bboxes[:, 0:8] + 0.5).reshape(len(bboxes), -1,
2).astype(int))
cv2.polylines(img, pts, True, (0, 255, 0), 2)
cv2.imwrite('output_ocr.png', img)
if len(bboxes) > 0 and args.textrecog:
recognizer = TextRecognizer(
model_path=args.textrecog,
device_name=args.device_name,
device_id=0,
)
texts = recognizer(img, bboxes.flatten().tolist())
print(texts)
elif args.textrecog:
recognizer = TextRecognizer(
model_path=args.textrecog,
device_name=args.device_name,
device_id=0,
)
texts = recognizer(img)
print(texts)
else:
        print('nothing to do: neither a text detection nor a text '
              'recognition SDK model was provided')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import cv2
from mmdeploy_runtime import Context, Device, Model, Pipeline
def parse_args():
parser = argparse.ArgumentParser(
description='Demo of MMDeploy SDK pipeline API')
parser.add_argument('device', help='name of device, cuda or cpu')
parser.add_argument('det_model_path', help='path of detection model')
parser.add_argument('cls_model_path', help='path of classification model')
parser.add_argument('image_path', help='path to test image')
args = parser.parse_args()
return args
def main():
args = parse_args()
det_model = Model(args.det_model_path)
    cls_model = Model(args.cls_model_path)
config = dict(
type='Pipeline',
input='img',
tasks=[
dict(
type='Inference',
input='img',
output='dets',
params=dict(model=det_model)),
dict(
type='Pipeline',
# flatten dets ([[a]] -> [a]) and broadcast img
input=['boxes=*dets', 'imgs=+img'],
tasks=[
dict(
type='Task',
module='CropBox',
input=['imgs', 'boxes'],
output='patches'),
dict(
type='Inference',
input='patches',
output='labels',
                        params=dict(model=cls_model))
],
# unflatten labels ([a] -> [[a]])
output='*labels')
],
output=['dets', 'labels'])
device = Device(args.device)
pipeline = Pipeline(config, Context(device))
img = cv2.imread(args.image_path)
output = pipeline(dict(ori_img=img))
print(json.dumps(output, indent=4))
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
import numpy as np
from mmdeploy_runtime import PoseDetector
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
parser.add_argument(
'--bbox',
default=None,
nargs='+',
type=int,
help='bounding box of an object in format (x, y, w, h)')
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
detector = PoseDetector(
model_path=args.model_path, device_name=args.device_name, device_id=0)
if args.bbox is None:
result = detector(img)
else:
        # convert (x, y, w, h) -> (left, top, right, bottom)
print(args.bbox)
bbox = np.array(args.bbox, dtype=int)
bbox[2:] += bbox[:2]
result = detector(img, bbox)
print(result)
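    # result has shape (num_bboxes, num_keypoints, 3): x, y, score per point;
    # this demo assumes a single box when flattening to (num_keypoints, 2)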
_, point_num, _ = result.shape
points = result[:, :, :2].reshape(point_num, 2)
for [x, y] in points.astype(int):
cv2.circle(img, (x, y), 1, (0, 255, 0), 2)
cv2.imwrite('output_pose.png', img)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import cv2
import numpy as np
from mmdeploy_runtime import PoseTracker
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use SDK Python API')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'det_model',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument(
'pose_model',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('video', help='video path or camera index')
parser.add_argument('--output_dir', help='output directory', default=None)
parser.add_argument(
'--skeleton',
default='coco',
choices=['coco', 'coco_wholebody'],
help='skeleton for keypoints')
args = parser.parse_args()
if args.video.isnumeric():
args.video = int(args.video)
return args
VISUALIZATION_CFG = dict(
coco=dict(
skeleton=[(15, 13), (13, 11), (16, 14), (14, 12), (11, 12), (5, 11),
(6, 12), (5, 6), (5, 7), (6, 8), (7, 9), (8, 10), (1, 2),
(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6)],
palette=[(255, 128, 0), (255, 153, 51), (255, 178, 102), (230, 230, 0),
(255, 153, 255), (153, 204, 255), (255, 102, 255),
(255, 51, 255), (102, 178, 255), (51, 153, 255),
(255, 153, 153), (255, 102, 102), (255, 51, 51),
(153, 255, 153), (102, 255, 102), (51, 255, 51), (0, 255, 0),
(0, 0, 255), (255, 0, 0), (255, 255, 255)],
link_color=[
0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16
],
point_color=[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0],
sigmas=[
0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
]),
coco_wholebody=dict(
skeleton=[(15, 13), (13, 11), (16, 14), (14, 12), (11, 12), (5, 11),
(6, 12), (5, 6), (5, 7), (6, 8), (7, 9), (8, 10), (1, 2),
(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (15, 17),
(15, 18), (15, 19), (16, 20), (16, 21), (16, 22), (91, 92),
(92, 93), (93, 94), (94, 95), (91, 96), (96, 97), (97, 98),
(98, 99), (91, 100), (100, 101), (101, 102), (102, 103),
(91, 104), (104, 105), (105, 106), (106, 107), (91, 108),
(108, 109), (109, 110), (110, 111), (112, 113), (113, 114),
(114, 115), (115, 116), (112, 117), (117, 118), (118, 119),
(119, 120), (112, 121), (121, 122), (122, 123), (123, 124),
(112, 125), (125, 126), (126, 127), (127, 128), (112, 129),
(129, 130), (130, 131), (131, 132)],
palette=[(51, 153, 255), (0, 255, 0), (255, 128, 0), (255, 255, 255),
(255, 153, 255), (102, 178, 255), (255, 51, 51)],
link_color=[
1, 1, 2, 2, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 1, 1, 1,
1, 2, 2, 2, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 1, 1, 1, 1
],
point_color=[
0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 2, 2, 2, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 1, 1,
1, 1, 3, 2, 2, 2, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 1, 1, 1, 1
],
sigmas=[
0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068,
0.066, 0.066, 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043,
0.040, 0.035, 0.031, 0.025, 0.020, 0.023, 0.029, 0.032, 0.037,
0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011, 0.011, 0.012,
0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007, 0.007,
0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011,
0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008,
0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009,
0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01,
0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024,
0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032,
0.02, 0.019, 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047,
0.026, 0.025, 0.024, 0.035, 0.018, 0.024, 0.022, 0.026, 0.017,
0.021, 0.021, 0.032, 0.02, 0.019, 0.022, 0.031
]))
def visualize(frame,
results,
output_dir,
frame_id,
thr=0.5,
resize=1280,
skeleton_type='coco'):
skeleton = VISUALIZATION_CFG[skeleton_type]['skeleton']
palette = VISUALIZATION_CFG[skeleton_type]['palette']
link_color = VISUALIZATION_CFG[skeleton_type]['link_color']
point_color = VISUALIZATION_CFG[skeleton_type]['point_color']
scale = resize / max(frame.shape[0], frame.shape[1])
keypoints, bboxes, _ = results
scores = keypoints[..., 2]
keypoints = (keypoints[..., :2] * scale).astype(int)
bboxes *= scale
img = cv2.resize(frame, (0, 0), fx=scale, fy=scale)
for kpts, score, bbox in zip(keypoints, scores, bboxes):
show = [1] * len(kpts)
for (u, v), color in zip(skeleton, link_color):
if score[u] > thr and score[v] > thr:
cv2.line(img, kpts[u], tuple(kpts[v]), palette[color], 1,
cv2.LINE_AA)
else:
show[u] = show[v] = 0
        for kpt, visible, color in zip(kpts, show, point_color):
            if visible:
                cv2.circle(img, kpt, 1, palette[color], 2, cv2.LINE_AA)
if output_dir:
cv2.imwrite(f'{output_dir}/{str(frame_id).zfill(6)}.jpg', img)
else:
cv2.imshow('pose_tracker', img)
        return cv2.waitKey(1) != ord('q')
return True
def main():
args = parse_args()
np.set_printoptions(precision=4, suppress=True)
video = cv2.VideoCapture(args.video)
tracker = PoseTracker(
det_model=args.det_model,
pose_model=args.pose_model,
device_name=args.device_name)
# optionally use OKS for keypoints similarity comparison
sigmas = VISUALIZATION_CFG[args.skeleton]['sigmas']
state = tracker.create_state(
det_interval=1, det_min_bbox_size=100, keypoint_sigmas=sigmas)
if args.output_dir:
os.makedirs(args.output_dir, exist_ok=True)
frame_id = 0
while True:
success, frame = video.read()
if not success:
break
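        # detect=-1 is assumed to let the tracker decide, based on
        # det_interval, when to re-run the person detector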
results = tracker(state, frame, detect=-1)
if not visualize(
frame,
results,
args.output_dir,
frame_id,
skeleton_type=args.skeleton):
break
frame_id += 1
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from math import cos, sin
import cv2
import numpy as np
from mmdeploy_runtime import RotatedDetector
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path', help='path of SDK model dumped by model converter')
parser.add_argument('image_path', help='path of an image')
args = parser.parse_args()
return args
def main():
args = parse_args()
img = cv2.imread(args.image_path)
detector = RotatedDetector(args.model_path, args.device_name, 0)
rbboxes, labels = detector(img)
    for rbbox, label_id in zip(rbboxes, labels):
[cx, cy, w, h, angle], score = rbbox[0:5], rbbox[-1]
if score < 0.1:
continue
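        # (wx, wy) and (hx, hy) are half-extent vectors along the rotated
        # box's width and height axes; combined with the center (cx, cy)
        # they give the four corners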
[wx, wy, hx, hy] = \
0.5 * np.array([w, w, -h, h]) * \
np.array([cos(angle), sin(angle), sin(angle), cos(angle)])
points = np.array([[[int(cx - wx - hx),
int(cy - wy - hy)],
[int(cx + wx - hx),
int(cy + wy - hy)],
[int(cx + wx + hx),
int(cy + wy + hy)],
[int(cx - wx + hx),
int(cy - wy + hy)]]])
cv2.drawContours(img, points, -1, (0, 255, 0), 2)
cv2.imwrite('output_detection.png', img)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
from mmdeploy_runtime import VideoRecognizer
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
    parser.add_argument('video_path', help='path of a video')
    parser.add_argument(
        '--clip_len',
        type=int,
        help='Frames of each sampled output clip',
        default=1)
    parser.add_argument(
        '--frame_interval',
        type=int,
        help='Temporal interval of adjacent sampled frames.',
        default=1)
    parser.add_argument(
        '--num_clips',
        type=int,
        help='Number of clips to be sampled',
        default=25)
args = parser.parse_args()
return args
def SampleFrames(cap, clip_len, frame_interval, num_clips):
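    # Uniformly place num_clips sampling windows over the video, each
    # covering clip_len frames spaced frame_interval apart (in the spirit of
    # the SampleFrames transform used by action-recognition pipelines), then
    # decode each required frame exactly once into a buffer keyed by index.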
if not cap.isOpened():
print('failed to load video')
exit(-1)
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
ori_clip_len = clip_len * frame_interval
avg_interval = (num_frames - ori_clip_len + 1) / float(num_clips)
frame_inds = []
for i in range(num_clips):
clip_offset = int(i * avg_interval + avg_interval / 2.0)
for j in range(clip_len):
ind = (j * frame_interval + clip_offset) % num_frames
if num_frames <= ori_clip_len - 1:
ind = j % num_frames
frame_inds.append(ind)
unique_inds = sorted(list(set(frame_inds)))
buffer = {}
ind = 0
for i, tid in enumerate(unique_inds):
while ind < tid:
_, mat = cap.read()
ind += 1
_, mat = cap.read()
buffer[tid] = mat
ind += 1
clips = []
for tid in frame_inds:
clips.append(buffer[tid])
info = (clip_len, num_clips)
return clips, info
def main():
args = parse_args()
cap = cv2.VideoCapture(args.video_path)
recognizer = VideoRecognizer(
model_path=args.model_path, device_name=args.device_name, device_id=0)
clips, info = SampleFrames(cap, args.clip_len, args.frame_interval,
args.num_clips)
result = recognizer(clips, info)
for label_id, score in result:
print(label_id, score)
if __name__ == '__main__':
main()