// Compile-time constant with value 256.
// NOTE(review): presumably the thread-block size for kernel launches elsewhere
// in the project -- confirm against the users of this macro.
#define BLOCKSIZE 256

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h> 
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <sys/time.h>

#pragma once
// Minimal stopwatch built on gettimeofday().
//
// Usage: call start(), run the work to be measured, call stop(), then read `dt`.
// `dt` holds no meaningful value until stop() has been called.
struct my_timer
{
    timeval ts, te; // instant captured by start(), instant captured by stop()
    float dt;       // elapsed time from ts to te, in milliseconds (ms); written by stop()

    // Record the current time of day into `ts`.
    void start()
    {
        gettimeofday(&ts, nullptr);
    }

    // Record the current time of day into `te` and recompute `dt`.
    void stop()
    {
        gettimeofday(&te, nullptr);

        const long int whole_sec = te.tv_sec - ts.tv_sec;
        // May be negative when the microsecond field wrapped between the two
        // samples; the seconds term compensates, so the sum is still correct.
        const long int frac_usec = te.tv_usec - ts.tv_usec;

        // seconds -> ms (x1000) plus microseconds -> ms (/1000), evaluated in
        // double precision before narrowing to the float member.
        const double elapsed_ms = whole_sec * 1.0e3 + frac_usec / 1.0e3;
        dt = static_cast<float>(elapsed_ms);
    }
};