// https://github.com/vivkin/gason - pulled January 10, 2016
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
enum JsonTag {
    JSON_NUMBER = 0,
    JSON_STRING,
    JSON_ARRAY,
    JSON_OBJECT,
    JSON_TRUE,
    JSON_FALSE,
    JSON_NULL = 0xF
};
struct JsonNode;
#define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL
#define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL
#define JSON_VALUE_TAG_MASK 0xF
#define JSON_VALUE_TAG_SHIFT 47
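// Values are NaN-boxed: a real double is stored as-is, while strings, arrays,
// objects, and literals are encoded as a quiet NaN (JSON_VALUE_NAN_MASK) with
// a 4-bit tag in bits 47..50 and a 47-bit pointer/payload in the low bits.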
union JsonValue {
    uint64_t ival;
    double fval;

    JsonValue(double x)
        : fval(x) {
    }
    JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) {
        assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK);
        ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload;
    }
    bool isDouble() const {
        return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK;
    }
    JsonTag getTag() const {
        return isDouble() ? JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK);
    }
    uint64_t getPayload() const {
        assert(!isDouble());
        return ival & JSON_VALUE_PAYLOAD_MASK;
    }
    double toNumber() const {
        assert(getTag() == JSON_NUMBER);
        return fval;
    }
    char *toString() const {
        assert(getTag() == JSON_STRING);
        return (char *)getPayload();
    }
    JsonNode *toNode() const {
        assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT);
        return (JsonNode *)getPayload();
    }
};
struct JsonNode {
    JsonValue value;
    JsonNode *next;
    char *key;
};
struct JsonIterator {
    JsonNode *p;

    void operator++() {
        p = p->next;
    }
    bool operator!=(const JsonIterator &x) const {
        return p != x.p;
    }
    JsonNode *operator*() const {
        return p;
    }
    JsonNode *operator->() const {
        return p;
    }
};
inline JsonIterator begin(JsonValue o) {
    return JsonIterator{o.toNode()};
}
inline JsonIterator end(JsonValue) {
    return JsonIterator{nullptr};
}
#define JSON_ERRNO_MAP(XX) \
    XX(OK, "ok") \
    XX(BAD_NUMBER, "bad number") \
    XX(BAD_STRING, "bad string") \
    XX(BAD_IDENTIFIER, "bad identifier") \
    XX(STACK_OVERFLOW, "stack overflow") \
    XX(STACK_UNDERFLOW, "stack underflow") \
    XX(MISMATCH_BRACKET, "mismatch bracket") \
    XX(UNEXPECTED_CHARACTER, "unexpected character") \
    XX(UNQUOTED_KEY, "unquoted key") \
    XX(BREAKING_BAD, "breaking bad") \
    XX(ALLOCATION_FAILURE, "allocation failure")
enum JsonErrno {
#define XX(no, str) JSON_##no,
    JSON_ERRNO_MAP(XX)
#undef XX
};
const char *jsonStrError(int err);
class JsonAllocator {
    struct Zone {
        Zone *next;
        size_t used;
    } *head = nullptr;

public:
    JsonAllocator() = default;
    JsonAllocator(const JsonAllocator &) = delete;
    JsonAllocator &operator=(const JsonAllocator &) = delete;
    JsonAllocator(JsonAllocator &&x) : head(x.head) {
        x.head = nullptr;
    }
    JsonAllocator &operator=(JsonAllocator &&x) {
        head = x.head;
        x.head = nullptr;
        return *this;
    }
    ~JsonAllocator() {
        deallocate();
    }
    void *allocate(size_t size);
    void deallocate();
};
int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator);
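// A minimal usage sketch (an assumption for illustration: gason's gason.cpp,
// which defines jsonParse and jsonStrError, is compiled in). Note that
// jsonParse tokenizes in place, so the source buffer must be writable and
// stay alive for as long as the parsed values are used.
//
//     #include <cstdio>
//     void dumpKeys(char *source) {
//         char *endptr;
//         JsonValue value;
//         JsonAllocator allocator;
//         int status = jsonParse(source, &endptr, &value, allocator);
//         if (status != JSON_OK) {
//             fprintf(stderr, "%s at offset %ld\n", jsonStrError(status), (long)(endptr - source));
//             return;
//         }
//         if (value.getTag() == JSON_OBJECT)
//             for (auto node : value)   // begin()/end() above enable range-for
//                 printf("key: %s\n", node->key);
//     }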
/**************************************************************************
* Microsoft COCO Toolbox. version 2.0
* Data, paper, and tutorials available at: http://mscoco.org/
* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
* Licensed under the Simplified BSD License [see coco/license.txt]
**************************************************************************/
#include "maskApi.h"
#include <math.h>
#include <stdlib.h>
uint umin( uint a, uint b ) { return (a<b) ? a : b; }
uint umax( uint a, uint b ) { return (a>b) ? a : b; }
void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) {
R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m);
siz j; if(cnts) for(j=0; j<m; j++) R->cnts[j]=cnts[j];
}
void rleFree( RLE *R ) {
free(R->cnts); R->cnts=0;
}
void rlesInit( RLE **R, siz n ) {
siz i; *R = (RLE*) malloc(sizeof(RLE)*n);
for(i=0; i<n; i++) rleInit((*R)+i,0,0,0,0);
}
void rlesFree( RLE **R, siz n ) {
siz i; for(i=0; i<n; i++) rleFree((*R)+i); free(*R); *R=0;
}
void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) {
siz i, j, k, a=w*h; uint c, *cnts; byte p;
cnts = malloc(sizeof(uint)*(a+1));
for(i=0; i<n; i++) {
const byte *T=M+a*i; k=0; p=0; c=0;
for(j=0; j<a; j++) { if(T[j]!=p) { cnts[k++]=c; c=0; p=T[j]; } c++; }
cnts[k++]=c; rleInit(R+i,h,w,k,cnts);
}
free(cnts);
}
void rleEncodePaste( RLE *R, const byte *M, siz h, siz w, siz n, siz oy, siz ox, siz oh, siz ow ) {
siz i, j, k, a=w*h, lp=ox, rp=ow-(ox+w), tp=oy, bp=oh-(oy+h); uint c, *cnts; byte p;
cnts = malloc(sizeof(uint)*(ow*oh+1));
for(i=0; i<n; i++) {
const byte *T=M+a*i; siz jj=0; k=0; p=0; c=lp*oh+tp;
for(j=0; j<a; j++) {
if(bp+tp > 0) {
if(j-jj == h) {
// completed one column
if((k&1) == 0) {
// add to zero run
c += bp+tp;
} else {
// complete one run
cnts[k++]=c;
// start zero run
c=tp+bp;
p=0;
}
jj = j;
}
}
if(T[j]!=p) {
cnts[k++]=c;
c=0;
p=T[j];
}
c++;
}
if (rp > 0 || bp > 0) {
if((k&1) == 0) {
// add to zero run
c += bp + rp*oh;
} else {
// complete one run
cnts[k++] = c;
c = bp + rp*oh;
p = 0;
}
}
cnts[k++]=c;
rleInit(R+i,oh,ow,k,cnts);
}
free(cnts);
}
void rleDecode( const RLE *R, byte *M, siz n ) {
siz i, j, k; for( i=0; i<n; i++ ) {
byte v=0; for( j=0; j<R[i].m; j++ ) {
for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
}
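/* rleMerge below walks the two current run lists in lockstep: each step
 * consumes umin(ca,cb) pixels, toggles the per-list value bits va/vb as runs
 * are exhausted, and emits a merged count whenever the combined value
 * (va&&vb for intersect, va||vb for union) changes. */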
void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) {
uint *cnts, c, ca, cb, cc, ct; int v, va, vb, vp;
siz i, a, b, h=R[0].h, w=R[0].w, m=R[0].m; RLE A, B;
if(n==0) { rleInit(M,0,0,0,0); return; }
if(n==1) { rleInit(M,h,w,m,R[0].cnts); return; }
cnts = malloc(sizeof(uint)*(h*w+1));
for( a=0; a<m; a++ ) cnts[a]=R[0].cnts[a];
for( i=1; i<n; i++ ) {
B=R[i]; if(B.h!=h||B.w!=w) { h=w=m=0; break; }
rleInit(&A,h,w,m,cnts); ca=A.cnts[0]; cb=B.cnts[0];
v=va=vb=0; m=0; a=b=1; cc=0; ct=1;
while( ct>0 ) {
c=umin(ca,cb); cc+=c; ct=0;
ca-=c; if(!ca && a<A.m) { ca=A.cnts[a++]; va=!va; } ct+=ca;
cb-=c; if(!cb && b<B.m) { cb=B.cnts[b++]; vb=!vb; } ct+=cb;
vp=v; if(intersect) v=va&&vb; else v=va||vb;
if( v!=vp||ct==0 ) { cnts[m++]=cc; cc=0; }
}
rleFree(&A);
}
rleInit(M,h,w,m,cnts); free(cnts);
}
void rleArea( const RLE *R, siz n, uint *a ) {
siz i, j; for( i=0; i<n; i++ ) {
a[i]=0; for( j=1; j<R[i].m; j+=2 ) a[i]+=R[i].cnts[j]; }
}
void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) {
siz g, d; BB db, gb; int crowd;
db=malloc(sizeof(double)*m*4); rleToBbox(dt,db,m);
gb=malloc(sizeof(double)*n*4); rleToBbox(gt,gb,n);
bbIou(db,gb,m,n,iscrowd,o); free(db); free(gb);
for( g=0; g<n; g++ ) for( d=0; d<m; d++ ) if(o[g*m+d]>0) {
crowd=iscrowd!=NULL && iscrowd[g];
if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; }
siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb;
ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0;
cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1;
while( ct>0 ) {
c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0;
ca-=c; if(!ca && a<ka) { ca=dt[d].cnts[a++]; va=!va; } ct+=ca;
cb-=c; if(!cb && b<kb) { cb=gt[g].cnts[b++]; vb=!vb; } ct+=cb;
}
if(i==0) u=1; else if(crowd) rleArea(dt+d,1,&u);
o[g*m+d] = (double)i/(double)u;
}
}
void rleNms( RLE *dt, siz n, uint *keep, double thr ) {
siz i, j; double u;
for( i=0; i<n; i++ ) keep[i]=1;
for( i=0; i<n; i++ ) if(keep[i]) {
for( j=i+1; j<n; j++ ) if(keep[j]) {
rleIou(dt+i,dt+j,1,1,0,&u);
if(u>thr) keep[j]=0;
}
}
}
void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) {
double h, w, i, u, ga, da; siz g, d; int crowd;
for( g=0; g<n; g++ ) {
BB G=gt+g*4; ga=G[2]*G[3]; crowd=iscrowd!=NULL && iscrowd[g];
for( d=0; d<m; d++ ) {
BB D=dt+d*4; da=D[2]*D[3]; o[g*m+d]=0;
w=fmin(D[2]+D[0],G[2]+G[0])-fmax(D[0],G[0]); if(w<=0) continue;
h=fmin(D[3]+D[1],G[3]+G[1])-fmax(D[1],G[1]); if(h<=0) continue;
i=w*h; u = crowd ? da : da+ga-i; o[g*m+d]=i/u;
}
}
}
void bbNms( BB dt, siz n, uint *keep, double thr ) {
siz i, j; double u;
for( i=0; i<n; i++ ) keep[i]=1;
for( i=0; i<n; i++ ) if(keep[i]) {
for( j=i+1; j<n; j++ ) if(keep[j]) {
bbIou(dt+i*4,dt+j*4,1,1,0,&u);
if(u>thr) keep[j]=0;
}
}
}
void rleToBbox( const RLE *R, BB bb, siz n ) {
siz i; for( i=0; i<n; i++ ) {
uint h, w, x, y, xs, ys, xe, ye, xp, cc, t; siz j, m;
h=(uint)R[i].h; w=(uint)R[i].w; m=R[i].m;
m=((siz)(m/2))*2; xs=w; ys=h; xe=ye=0; cc=0;
if(m==0) { bb[4*i+0]=bb[4*i+1]=bb[4*i+2]=bb[4*i+3]=0; continue; }
for( j=0; j<m; j++ ) {
cc+=R[i].cnts[j]; t=cc-j%2; y=t%h; x=(t-y)/h;
if(j%2==0) xp=x; else if(xp<x) { ys=0; ye=h-1; }
xs=umin(xs,x); xe=umax(xe,x); ys=umin(ys,y); ye=umax(ye,y);
}
bb[4*i+0]=xs; bb[4*i+2]=xe-xs+1;
bb[4*i+1]=ys; bb[4*i+3]=ye-ys+1;
}
}
void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) {
siz i; for( i=0; i<n; i++ ) {
double xs=bb[4*i+0], xe=xs+bb[4*i+2];
double ys=bb[4*i+1], ye=ys+bb[4*i+3];
double xy[8] = {xs,ys,xs,ye,xe,ye,xe,ys};
rleFrPoly( R+i, xy, 4, h, w );
}
}
int uintCompare(const void *a, const void *b) {
uint c=*((uint*)a), d=*((uint*)b); return c>d?1:c<d?-1:0;
}
void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) {
/* upsample and get discrete points densely along entire boundary */
siz j, m=0; double scale=5; int *x, *y, *u, *v; uint *a, *b;
x=malloc(sizeof(int)*(k+1)); y=malloc(sizeof(int)*(k+1));
for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
for(j=0; j<k; j++) y[j]=(int)(scale*xy[j*2+1]+.5); y[k]=y[0];
for(j=0; j<k; j++) m+=umax(abs(x[j]-x[j+1]),abs(y[j]-y[j+1]))+1;
u=malloc(sizeof(int)*m); v=malloc(sizeof(int)*m); m=0;
for( j=0; j<k; j++ ) {
int xs=x[j], xe=x[j+1], ys=y[j], ye=y[j+1], dx, dy, t, d;
int flip; double s; dx=abs(xe-xs); dy=abs(ys-ye);
flip = (dx>=dy && xs>xe) || (dx<dy && ys>ye);
if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; }
s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy;
if(dx>=dy) for( d=0; d<=dx; d++ ) {
t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++;
} else for( d=0; d<=dy; d++ ) {
t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++;
}
}
/* get points along y-boundary and downsample */
free(x); free(y); k=m; m=0; double xd, yd;
x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k);
for( j=1; j<k; j++ ) if(u[j]!=u[j-1]) {
xd=(double)(u[j]<u[j-1]?u[j]:u[j]-1); xd=(xd+.5)/scale-.5;
if( floor(xd)!=xd || xd<0 || xd>w-1 ) continue;
yd=(double)(v[j]<v[j-1]?v[j]:v[j-1]); yd=(yd+.5)/scale-.5;
if(yd<0) yd=0; else if(yd>h) yd=h; yd=ceil(yd);
x[m]=(int) xd; y[m]=(int) yd; m++;
}
/* compute rle encoding given y-boundary points */
k=m; a=malloc(sizeof(uint)*(k+1));
for( j=0; j<k; j++ ) a[j]=(uint)(x[j]*(int)(h)+y[j]);
a[k++]=(uint)(h*w); free(u); free(v); free(x); free(y);
qsort(a,k,sizeof(uint),uintCompare); uint p=0;
for( j=0; j<k; j++ ) { uint t=a[j]; a[j]-=p; p=t; }
b=malloc(sizeof(uint)*k); j=m=0; b[m++]=a[j++];
while(j<k) if(a[j]>0) b[m++]=a[j++]; else {
j++; if(j<k) b[m-1]+=a[j++]; }
rleInit(R,h,w,m,b); free(a); free(b);
}
char* rleToString( const RLE *R ) {
/* Similar to LEB128 but using 6 bits/char and ascii chars 48-111. */
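/* Counts from the fourth onward are delta-coded against the count two back
 * (the previous run of the same pixel value). Each value is emitted 5 data
 * bits per character with 0x20 as the continuation flag; bit 0x10 of the
 * last chunk carries the sign, and adding 48 maps chunks to ASCII 48-111. */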
siz i, m=R->m, p=0; long x; int more;
char *s=malloc(sizeof(char)*m*6);
for( i=0; i<m; i++ ) {
x=(long) R->cnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1;
while( more ) {
char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0;
if(more) c |= 0x20; c+=48; s[p++]=c;
}
}
s[p]=0; return s;
}
void rleFrString( RLE *R, char *s, siz h, siz w ) {
siz m=0, p=0, k; long x; int more; uint *cnts;
while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0;
while( s[p] ) {
x=0; k=0; more=1;
while( more ) {
char c=s[p]-48; x |= (c & 0x1f) << 5*k;
more = c & 0x20; p++; k++;
if(!more && (c & 0x10)) x |= -1 << 5*k;
}
if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x;
}
rleInit(R,h,w,m,cnts); free(cnts);
}
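/* A minimal round-trip sketch (illustrative only; the function name is
 * hypothetical): encode a tiny column-major mask, serialize it, parse it
 * back, and decode it again. */
void rleRoundTripExample(void) {
  byte M[6] = {0,0,1,1,0,1};   /* h=3, w=2, stored column-major */
  byte out[6]; RLE R, S;
  rleEncode(&R, M, 3, 2, 1);   /* runs: 2 zeros, 2 ones, 1 zero, 1 one */
  char *s = rleToString(&R);   /* compressed ASCII form */
  rleFrString(&S, s, 3, 2);    /* parse the string back into an RLE */
  rleDecode(&S, out, 1);       /* out now equals M */
  free(s); rleFree(&R); rleFree(&S);
}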
/**************************************************************************
* Microsoft COCO Toolbox. version 2.0
* Data, paper, and tutorials available at: http://mscoco.org/
* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
* Licensed under the Simplified BSD License [see coco/license.txt]
**************************************************************************/
#pragma once
typedef unsigned int uint;
typedef unsigned long siz;
typedef unsigned char byte;
typedef double* BB;
typedef struct { siz h, w, m; uint *cnts; } RLE;
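/* cnts alternates between runs of 0s and 1s over the mask read in
 * column-major order, always starting with a (possibly empty) run of 0s:
 * e.g. a 3x2 mask whose columns read 0,0,1 and 1,0,1 has cnts = {2,2,1,1}. */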
/* Initialize/destroy RLE. */
void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
void rleFree( RLE *R );
/* Initialize/destroy RLE array. */
void rlesInit( RLE **R, siz n );
void rlesFree( RLE **R, siz n );
/* Encode binary masks using RLE. */
void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n );
void rleEncodePaste( RLE *R, const byte *M, siz h, siz w, siz n, siz oy, siz ox, siz oh, siz ow );
/* Decode binary masks encoded via RLE. */
void rleDecode( const RLE *R, byte *mask, siz n );
/* Compute union or intersection of encoded masks. */
void rleMerge( const RLE *R, RLE *M, siz n, int intersect );
/* Compute area of encoded masks. */
void rleArea( const RLE *R, siz n, uint *a );
/* Compute intersection over union between masks. */
void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o );
/* Compute non-maximum suppression between bounding masks */
void rleNms( RLE *dt, siz n, uint *keep, double thr );
/* Compute intersection over union between bounding boxes. */
void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o );
/* Compute non-maximum suppression between bounding boxes */
void bbNms( BB dt, siz n, uint *keep, double thr );
/* Get bounding boxes surrounding encoded masks. */
void rleToBbox( const RLE *R, BB bb, siz n );
/* Convert bounding boxes to encoded masks. */
void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n );
/* Convert polygon to encoded mask. */
void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w );
/* Get compressed string representation of encoded mask. */
char* rleToString( const RLE *R );
/* Convert from compressed string representation of encoded mask. */
void rleFrString( RLE *R, char *s, siz h, siz w );
Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
#!/bin/bash
## DL params
#export BATCHSIZE=${BATCHSIZE:-32}
export BATCHSIZE=${BATCHSIZE:-16}
export NUMEPOCHS=${NUMEPOCHS:-1}
export LR=${LR:-0.000085}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-0}
#export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
#export EXTRA_PARAMS=${EXTRA_PARAMS:-'--frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --skip-metric-loss --sync-after-graph-replay --async-coco'}
#export EXTRA_PARAMS="--frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco"
#export EXTRA_PARAMS="--apex-adam --fp16-allreduce --skip-metric-loss --async-coco"
export EXTRA_PARAMS=""
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=160
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
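## Usage note: these config_*.sh files are meant to be sourced by a launcher
## before training; run_and_time.sh further below, for example, begins with
## `source config_DGXA100_001x08x032.sh`.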
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-4}
export NUMEPOCHS=${NUMEPOCHS:-6}
export LR=${LR:-0.0001}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=8
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=20
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-1}
export NUMEPOCHS=${NUMEPOCHS:-10}
export LR=${LR:-0.0001}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --dali-eval-cache --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=160
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=15
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-1}
export NUMEPOCHS=${NUMEPOCHS:-10}
export LR=${LR:-0.000135}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --dali-eval-cache --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=256
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=15
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86")
project(box_iou_plugin LANGUAGES CUDA CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(dali.sysconfig.get_lib_dir())"
OUTPUT_VARIABLE DALI_LIB_DIR)
string(STRIP ${DALI_LIB_DIR} DALI_LIB_DIR)
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(\" \".join(dali.sysconfig.get_compile_flags()))"
OUTPUT_VARIABLE DALI_COMPILE_FLAGS)
string(STRIP ${DALI_COMPILE_FLAGS} DALI_COMPILE_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DALI_COMPILE_FLAGS} ")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${DALI_COMPILE_FLAGS} ")
link_directories("${DALI_LIB_DIR}")
add_library(_box_iou SHARED box_iou.cc box_iou.cu)
target_link_libraries(_box_iou dali)
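# Build/usage sketch (commands illustrative): configuring and building this
# project, e.g. `cmake . && make`, yields lib_box_iou.so, which the Python
# iterator further below loads at runtime via
# nvidia.dali.plugin_manager.load_library('/usr/local/lib/lib_box_iou.so').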
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "box_iou.h"
DALI_SCHEMA(box_iou)
    .DocStr("Compute box IoU")
    .NumInput(2)
    .NumOutput(1);
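// The "box_iou" name registered here is what the Python side resolves once
// the plugin is loaded: the DALI pipeline below calls it as fn.box_iou.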
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include "box_iou.h"
namespace other_ns {
__global__ void box_iou_cuda_kernel(float *box_iou, float4 *box1, float4 *box2, long num_images,
                                    long M, long N, int idxJump) {
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  size_t b1_idx, b2_idx, b1_row_offset, b2_row_offset, im_id, im_offset;
  float xmin1, xmin2, xmax1, xmax2, ymin1, ymin2, ymax1, ymax2;
  float x_tl, y_tl, x_br, y_br, w, h, inter, area1, area2, iou;
  // Grid-stride loop: i enumerates (image, box1, box2) index triples.
  for (long i = idx; i < num_images * M * N; i += idxJump){
    im_id = i / (M * N);
    im_offset = i % (M * N);
    b1_idx = im_offset / N;
    b2_idx = i % N;
    b1_row_offset = im_id * M + b1_idx;
    b2_row_offset = im_id * N + b2_idx;
    xmin1 = box1[b1_row_offset].x;
    ymin1 = box1[b1_row_offset].y;
    xmax1 = box1[b1_row_offset].z;
    ymax1 = box1[b1_row_offset].w;
    xmin2 = box2[b2_row_offset].x;
    ymin2 = box2[b2_row_offset].y;
    xmax2 = box2[b2_row_offset].z;
    ymax2 = box2[b2_row_offset].w;
    if (xmin1 == -1.0 && ymin1 == -1.0 && xmax1 == -1.0 && ymax1 == -1.0) {
      // do not consider padded targets
      box_iou[im_id * M * N + b1_idx * N + b2_idx] = -1;
    } else {
      x_tl = fmaxf(xmin1, xmin2);
      y_tl = fmaxf(ymin1, ymin2);
      x_br = fminf(xmax1, xmax2);
      y_br = fminf(ymax1, ymax2);
      w = (x_br - x_tl) < 0 ? 0.0f : (x_br - x_tl);
      h = (y_br - y_tl) < 0 ? 0.0f : (y_br - y_tl);
      inter = w * h;
      area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
      area2 = (xmax2 - xmin2) * (ymax2 - ymin2);
      iou = inter / (area1 + area2 - inter);
      box_iou[im_id * M * N + b1_idx * N + b2_idx] = iou;
    }
  }
}
template<>
void box_iou<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace &ws) {
  const auto &box1 = ws.Input<::dali::GPUBackend>(0);
  const auto &box2 = ws.Input<::dali::GPUBackend>(1);
  const auto &shape1 = box1.shape();
  const auto &shape2 = box2.shape();
  auto &output = ws.Output<::dali::GPUBackend>(0);
  int minGridSize;
  int blockSize;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize,
                                     &blockSize,
                                     (void*) box_iou_cuda_kernel,
                                     0,  // dynamic memory
                                     0); // maximum utilized threads
  dim3 gridDim(minGridSize);
  dim3 blockDim(blockSize);
  int idxJump = minGridSize * blockSize;
  int numImages = shape1.num_samples();
  for (int sample_idx = 0; sample_idx < numImages; sample_idx++) {
    long M = shape1[sample_idx][0];
    // input 2 (the anchors) is treated as shared across samples:
    // only sample 0 is read here and in the launch below
    long N = shape2[0][0];
    box_iou_cuda_kernel<<<gridDim, blockDim, 0, ws.stream()>>>(
        (float*) output.raw_mutable_tensor(sample_idx),
        (float4*) box1.raw_tensor(sample_idx),
        (float4*) box2.raw_tensor(0),
        1, M, N,
        idxJump);
  }
}
} // namespace other_ns
DALI_REGISTER_OPERATOR(box_iou, ::other_ns::box_iou<::dali::GPUBackend>, ::dali::GPU);
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DALI_BOX_IOU_H_
#define DALI_BOX_IOU_H_
#include <vector>
#include "dali/pipeline/operator/operator.h"
namespace other_ns {
template <typename Backend>
class box_iou : public ::dali::Operator<Backend> {
 public:
  inline explicit box_iou(const ::dali::OpSpec &spec) :
    ::dali::Operator<Backend>(spec) {}

  virtual inline ~box_iou() = default;

  box_iou(const box_iou&) = delete;
  box_iou& operator=(const box_iou&) = delete;
  box_iou(box_iou&&) = delete;
  box_iou& operator=(box_iou&&) = delete;

 protected:
  bool CanInferOutputs() const override {
    return true;
  }

  bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc,
                 const ::dali::workspace_t<Backend> &ws) override {
    const auto &box1 = ws.template Input<Backend>(0);
    const auto &box2 = ws.template Input<Backend>(1);
    auto box1_shape = box1.shape();
    auto box2_shape = box2.shape();
    output_desc.resize(1);
    const int N = box1.num_samples();
    // Per sample, the output is box1's shape with the trailing extent (the
    // 4 box coordinates) replaced by box2's box count, i.e. an MxN IoU matrix.
    output_desc[0].shape = box1_shape;
    for (int i = 0; i < N; i++) {
      output_desc[0].shape.tensor_shape_span(i).back() = box2_shape[i][0];
    }
    output_desc[0].type = box2.type();
    return true;
  }

  void RunImpl(::dali::workspace_t<Backend> &ws) override;
};
} // namespace other_ns
#endif // DALI_BOX_IOU_H_
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86")
project(proposal_matcher_plugin LANGUAGES CUDA CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(dali.sysconfig.get_lib_dir())"
OUTPUT_VARIABLE DALI_LIB_DIR)
string(STRIP ${DALI_LIB_DIR} DALI_LIB_DIR)
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(\" \".join(dali.sysconfig.get_compile_flags()))"
OUTPUT_VARIABLE DALI_COMPILE_FLAGS)
string(STRIP ${DALI_COMPILE_FLAGS} DALI_COMPILE_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DALI_COMPILE_FLAGS} ")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${DALI_COMPILE_FLAGS} ")
link_directories("${DALI_LIB_DIR}")
add_library(_proposal_matcher SHARED proposal_matcher.cc proposal_matcher.cu)
target_link_libraries(_proposal_matcher dali)
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "proposal_matcher.h"
DALI_SCHEMA(proposal_matcher)
    .DocStr("Proposal matcher")
    .NumInput(1)
    .NumOutput(1);
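// As with box_iou, the "proposal_matcher" name registered here is what the
// Python pipeline below invokes as fn.proposal_matcher.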
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include "proposal_matcher.h"
namespace other_ns {
// For each prediction, scan its column of the (gt x preds) IoU matrix and
// assign the best-matching ground-truth index: IoU >= high_th keeps its gt,
// "forgiven" low-quality matches keep their best gt, IoU < low_th maps to -1
// (background), and anything in between maps to -2 (ignored).
__launch_bounds__(256) static __global__
void max_along_gt_idx(float *match, unsigned char *pred_forgiven, long *max_gt_idx, long long gt, long long preds,
                      bool include_low_quality, float low_th, float high_th) {
  long long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  int offset_match_matrix = image_id * preds * gt;
  int offset_preds = image_id * preds;
  if (tid < preds) {
    float max_iou = 0.0f;
    int max_idx = 0;
    float iou;
    for (long long i = 0; i < gt; i++) {
      iou = match[offset_match_matrix + i * preds + tid];
      if (iou > max_iou) { max_iou = iou; max_idx = i; }
    }
    if (max_iou >= high_th) max_gt_idx[offset_preds + tid] = max_idx;
    else if (pred_forgiven[offset_preds + tid] == 1 && include_low_quality) max_gt_idx[offset_preds + tid] = max_idx;
    else if (max_iou < low_th) max_gt_idx[offset_preds + tid] = -1;
    else if (max_iou < high_th) max_gt_idx[offset_preds + tid] = -2;
  }
}
// Warp-synchronous max reduction over the volatile shared-memory buffer.
__device__ void warpReduce(volatile float* sdata, int tid) {
  sdata[tid] = fmax(sdata[tid], sdata[tid + 32]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 16]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 8]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 4]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 2]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 1]);
}
// Stage 1 of the per-gt max: each block reduces one 2048-wide chunk of one
// gt row of the IoU matrix into inter_gt.
static __global__
void max_along_preds(float* match, float* inter_gt, long long gt, long long preds) {
  int gt_idx = blockIdx.x;
  int chunk_idx = blockIdx.y;
  int image_id = blockIdx.z;
  int num_chunks = (preds + 2047) / 2048;
  int gt_offset = chunk_idx * 2048;
  int start_idx = image_id * preds * gt + gt_idx * preds + gt_offset;
  int idx = threadIdx.x;
  __shared__ float shbuf[1024];
  shbuf[idx] = 0.0f;
  __syncthreads();
  if (gt_offset + idx + 1024 < preds) shbuf[idx] = fmax(match[start_idx + idx], match[start_idx + idx + 1024]);
  else if (gt_offset + idx < preds) shbuf[idx] = match[start_idx + idx];
  __syncthreads();
  if (idx < 512) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 512]);
  __syncthreads();
  if (idx < 256) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 256]);
  __syncthreads();
  if (idx < 128) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 128]);
  __syncthreads();
  if (idx < 64) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 64]);
  __syncthreads();
  if (idx < 32) warpReduce(shbuf, idx);
  if (idx == 0) inter_gt[image_id * num_chunks * gt + num_chunks * gt_idx + chunk_idx] = shbuf[idx];
}
// Stage 2: reduce the per-chunk maxima into the single best IoU per gt.
__launch_bounds__(256) static __global__
void max_along_preds_reduced(float *match, float *max_preds, long long gt, long long preds) {
  long long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  if (tid < gt) {
    float max_iou = 0.0f;
    float iou;
    for (long long i = 0; i < preds; i++) {
      iou = match[image_id * gt * preds + tid * preds + i];
      if (iou > max_iou) max_iou = iou;
    }
    max_preds[image_id * gt + tid] = max_iou;
  }
}
// Mark a prediction as "forgiven" if it attains some gt's best IoU, so its
// match can be kept when allow_low_quality_matches is set.
__launch_bounds__(256) static __global__
void forgive_preds(float *match_quality_data, float *d_best_pred_per_gt, unsigned char *d_pred_forgiven,
                   long gt, long preds) {
  long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  int offset = image_id * gt * preds;
  if (tid < preds) {
    unsigned char forgiven = 0;
    float iou;
    for (int i = 0; i < gt; i++) {
      iou = match_quality_data[offset + i * preds + tid];
      // do not consider predictions from padded targets (iou = -1)
      if ((iou == d_best_pred_per_gt[image_id * gt + i]) && (iou != -1.0)) {
        forgiven = 1;
        break;
      }
    }
    d_pred_forgiven[image_id * preds + tid] = forgiven;
  }
}
template<>
void proposal_matcher<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace &ws) {
  const auto &input = ws.Input<::dali::GPUBackend>(0);
  const auto &shape = input.shape();
  auto &output = ws.Output<::dali::GPUBackend>(0);
  bool allow_low_quality_matches = true;
  float low_th = 0.4;
  float high_th = 0.5;
  int num_images = 1;  // the kernels below are launched once per sample, not once per shape.num_samples()
  for (int sample_idx = 0; sample_idx < shape.num_samples(); sample_idx++) {
    int gt = shape[sample_idx][0];
    long long preds = shape[sample_idx][1];
    float *match_quality_data = (float*) input.raw_tensor(sample_idx);
    int num_chunks = (preds + 2047) / 2048;
    // do an intermediate reduction along all predictions for each gt
    dim3 block(1024, 1, 1);
    dim3 grid(gt, num_chunks, num_images);
    if (allow_low_quality_matches) max_along_preds<<<grid, block, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_intergt,
        gt,
        preds);
    // final reduction to find best iou per gt
    int numThreads = 256;
    int numBlocks = (gt + numThreads - 1) / numThreads;
    dim3 grid2(numBlocks, num_images, 1);
    if (allow_low_quality_matches) max_along_preds_reduced<<<grid2, numThreads, 0, ws.stream()>>>(
        d_intergt,
        d_best_pred_per_gt,
        gt,
        num_chunks);
    numBlocks = (preds + numThreads - 1) / numThreads;
    dim3 grid_preds(numBlocks, num_images, 1);
    // if low-quality matches are allowed, mark some predictions to keep their
    // best matching gt even though iou < threshold
    if (allow_low_quality_matches) forgive_preds<<<grid_preds, numThreads, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_best_pred_per_gt,
        d_pred_forgiven,
        gt,
        preds);
    // compute resulting tensor of indices
    max_along_gt_idx<<<grid_preds, numThreads, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_pred_forgiven,
        (long*) output.raw_mutable_tensor(sample_idx),
        gt,
        preds,
        allow_low_quality_matches,
        low_th,
        high_th);
  }
}
} // namespace other_ns
DALI_REGISTER_OPERATOR(proposal_matcher, ::other_ns::proposal_matcher<::dali::GPUBackend>, ::dali::GPU);
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DALI_PROPOSAL_MATCHER_H_
#define DALI_PROPOSAL_MATCHER_H_
#include <vector>
#include "dali/pipeline/operator/operator.h"
namespace other_ns {
template <typename Backend>
class proposal_matcher : public ::dali::Operator<Backend> {
 public:
  inline explicit proposal_matcher(const ::dali::OpSpec &spec) :
    ::dali::Operator<Backend>(spec) {
    // Scratch buffers are sized for fixed worst-case dimensions (1000
    // ground-truth boxes, 120087 predictions); larger inputs would overflow them.
    int gt = 1000;
    int preds = 120087;
    int num_chunks = (preds + 2047) / 2048;
    cudaMalloc(&d_best_pred_per_gt, gt * sizeof(float));
    cudaMalloc(&d_intergt, gt * num_chunks * sizeof(float));
    cudaMalloc(&d_pred_forgiven, preds * sizeof(unsigned char));
  }

  virtual inline ~proposal_matcher() {
    cudaFree(d_best_pred_per_gt);
    cudaFree(d_intergt);
    cudaFree(d_pred_forgiven);
  }

  proposal_matcher(const proposal_matcher&) = delete;
  proposal_matcher& operator=(const proposal_matcher&) = delete;
  proposal_matcher(proposal_matcher&&) = delete;
  proposal_matcher& operator=(proposal_matcher&&) = delete;

 protected:
  float *d_best_pred_per_gt, *d_intergt;
  unsigned char *d_pred_forgiven;

  bool CanInferOutputs() const override {
    return true;
  }

  bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc,
                 const ::dali::workspace_t<Backend> &ws) override {
    const auto &input = ws.template Input<Backend>(0);
    auto shape = input.shape();
    output_desc.resize(1);
    const int N = input.num_samples();
    output_desc[0].shape = shape;
    for (int i = 0; i < N; i++) {
      output_desc[0].shape.tensor_shape_span(i)[0] = 1;
      output_desc[0].shape.tensor_shape_span(i)[1] = shape[i][1];
    }
    // matched indices are returned as int64, regardless of the input's float type
    output_desc[0].type = dali::DALI_INT64;
    return true;
  }

  void RunImpl(::dali::workspace_t<Backend> &ws) override;
};
} // namespace other_ns
#endif // DALI_PROPOSAL_MATCHER_H_
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import pdb
import numpy as np
import torch
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy
import nvidia.dali.plugin_manager as plugin_manager
plugin_manager.load_library('/usr/local/lib/lib_box_iou.so')
plugin_manager.load_library('/usr/local/lib/lib_proposal_matcher.so')
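# These .so files are the _box_iou and _proposal_matcher CMake targets built
# above; loading them exposes fn.box_iou and fn.proposal_matcher to the pipeline.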
class DaliDataIterator(object):
    def __init__(self, data_path, anno_path, batch_size,
                 num_shards, shard_id, is_training,
                 image_size=(800, 800), num_threads=8, prefetch_queue_depth=2,
                 compute_matched_idxs=False, anchors=None, cpu_decode=False,
                 lazy_init=True, cache=False, seed=-1):
        self.data_path = data_path
        self.anno_path = anno_path
        self.batch_size = batch_size
        self.num_shards = num_shards
        self.shard_id = shard_id
        self.is_training = is_training
        self.compute_matched_idxs = compute_matched_idxs
        self.num_threads = num_threads
        self.seed = seed
        self.lazy_init = lazy_init
        self.image_size = image_size
        self.prefetch_queue_depth = prefetch_queue_depth
        self.cpu_decode = cpu_decode
        self.cache = cache
        self.cache_ready = False
        self.cached_vals = []
        assert not (self.is_training and self.cache), "cache can't be used with training"

        self.pipe = Pipeline(batch_size=self.batch_size,
                             num_threads=self.num_threads,
                             seed=self.seed,
                             device_id=torch.cuda.current_device())
        with self.pipe:
            inputs, bboxes, labels, image_ids = fn.readers.coco(
                name="coco",
                file_root=self.data_path,
                annotations_file=self.anno_path,
                num_shards=self.num_shards,
                shard_id=self.shard_id,
                stick_to_shard=not self.is_training,
                pad_last_batch=not self.is_training,
                lazy_init=self.lazy_init,
                ltrb=True,
                shuffle_after_epoch=self.is_training,
                avoid_class_remapping=True,
                image_ids=True,
                ratio=True,
                prefetch_queue_depth=self.prefetch_queue_depth,
                read_ahead=True,
                skip_empty=False)

            # Images
            images_shape = fn.peek_image_shape(inputs)  # HWC
            if self.cpu_decode:
                images = fn.decoders.image(inputs, device='cpu').gpu()
            else:
                images = fn.decoders.image(inputs, device='mixed')
            if self.is_training:
                flip = fn.random.coin_flip(probability=0.5)
                images = fn.flip(images, horizontal=flip, device='gpu')
            mean = np.array([[[255 * 0.485]], [[255 * 0.456]], [[255 * 0.406]]], dtype=np.float32)
            stddev = np.array([[[255 * 0.229]], [[255 * 0.224]], [[255 * 0.225]]], dtype=np.float32)
            images = fn.normalize(fn.transpose(images, perm=[2, 0, 1]),
                                  axes=[1, 2],
                                  mean=mean,
                                  stddev=stddev)
            images = fn.resize(images, resize_x=self.image_size[0], resize_y=self.image_size[1])

            # Labels
            labels_shape = fn.shapes(labels)
            labels = fn.pad(labels, axes=(0,))
            labels = labels.gpu()
            labels = fn.cast(labels, dtype=types.INT64)

            # BBoxes
            if self.is_training:
                bboxes = fn.bb_flip(bboxes, horizontal=flip, ltrb=True)
            # scale the reader's relative (ratio=True) coordinates to pixels
            lt_x = bboxes[:, 0] * self.image_size[0]
            lt_y = bboxes[:, 1] * self.image_size[1]
            rb_x = bboxes[:, 2] * self.image_size[0]
            rb_y = bboxes[:, 3] * self.image_size[1]
            bboxes = fn.stack(lt_x, lt_y, rb_x, rb_y, axis=1)
            bboxes_shape = fn.shapes(bboxes)
            bboxes = bboxes.gpu()
            if self.compute_matched_idxs:
                self.anchors = anchors[0]
                match_quality_matrix = fn.box_iou(bboxes, self.anchors, device='gpu')
                matched_idxs = fn.proposal_matcher(match_quality_matrix, device='gpu')
            bboxes = fn.pad(bboxes, axes=(0,))

            set_outputs = [images, images_shape, image_ids, bboxes, bboxes_shape, labels, labels_shape]
            if self.compute_matched_idxs:
                set_outputs.append(matched_idxs)
            self.pipe.set_outputs(*set_outputs)
        self.pipe.build()

        output_map = ['images', 'images_shape', 'images_id', 'boxes', 'boxes_shape', 'labels', 'labels_shape']
        if self.compute_matched_idxs:
            output_map.append('matched_idxs')

        # With the data set [1,2,3,4,5,6,7] and the batch size 2:
        # last_batch_policy = LastBatchPolicy.PARTIAL, last_batch_padded = True  -> last batch = [7], next iteration will return [1, 2] <= Validation
        # last_batch_policy = LastBatchPolicy.PARTIAL, last_batch_padded = False -> last batch = [7], next iteration will return [2, 3]
        # last_batch_policy = LastBatchPolicy.FILL, last_batch_padded = True     -> last batch = [7, 7], next iteration will return [1, 2]
        # last_batch_policy = LastBatchPolicy.FILL, last_batch_padded = False    -> last batch = [7, 1], next iteration will return [2, 3] <= Training
        # last_batch_policy = LastBatchPolicy.DROP, last_batch_padded = True     -> last batch = [5, 6], next iteration will return [1, 2]
        # last_batch_policy = LastBatchPolicy.DROP, last_batch_padded = False    -> last batch = [5, 6], next iteration will return [2, 3]
        last_batch_policy = LastBatchPolicy.FILL if self.is_training else LastBatchPolicy.PARTIAL
        self.dali_iter = DALIGenericIterator(pipelines=[self.pipe],
                                             reader_name="coco",
                                             output_map=output_map,
                                             auto_reset=True,
                                             last_batch_policy=last_batch_policy)

    def __len__(self):
        return len(self.dali_iter)

    def __iter__(self):
        # cache-aware iteration: replay cached batches first, then continue from DALI
        if self.cache_ready:
            return iter(self.cached_vals)
        return itertools.chain(self.cached_vals, self.__iter())

    def __iter(self):
        for obj in self.dali_iter:
            obj = obj[0]
            # images
            images = obj['images']
            # targets
            boxes = [b[0][:b[1][0]] for b in zip(obj['boxes'], obj['boxes_shape'])]
            labels = [b[0][:b[1][0]] for b in zip(obj['labels'].to(torch.int64), obj['labels_shape'])]
            image_id = obj['images_id']
            original_image_size = obj['images_shape']
            targets = dict(boxes=boxes, labels=labels, image_id=image_id,
                           original_image_size=original_image_size[:, 0:2])
            if self.compute_matched_idxs:
                matched_idxs = obj['matched_idxs'][:, 0, :]
                targets['matched_idxs'] = matched_idxs
            if self.cache:
                self.cached_vals.append((images, targets))
            yield images, targets
        if self.cache:
            self.cache_ready = True


if __name__ == '__main__':
    device = torch.device(0)
    # dali_iter = DaliDataIterator(data_path='/datasets/open-images-v6-mlperf/train/data',
    #                              anno_path='/datasets/open-images-v6-mlperf/train/labels/openimages-mlperf.json',
    #                              batch_size=8, num_threads=4, num_shards=1, shard_id=0, is_training=True)
    dali_iter = DaliDataIterator(data_path='/datasets/coco2017/train2017',
                                 anno_path='/datasets/coco2017/annotations/instances_train2017.json',
                                 batch_size=2, num_threads=1,
                                 num_shards=1, shard_id=0, is_training=True)
    for images, targets in dali_iter:
        pdb.set_trace()
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
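# One MPI rank per NUMA node: every rank sees all 8 GPUs, while numactl below
# pins each rank's CPU threads and memory allocations to its own node.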
## DL params
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--local_rank "${comm_rank}"
--world-size "${comm_size}"
)
# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
## DL params
export BATCHSIZE=8
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco --no-amp'
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--local_rank "${comm_rank}"
--world-size "${comm_size}"
)
# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# runs benchmark and reports time to convergence
# to use the script:
# run_and_time.sh
set +x
set -e
source config_DGXA100_001x08x032.sh
# Echo commands (xtrace) on local rank 0 only
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
# Set variables
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}
# run benchmark
echo "running benchmark"
#if [ ${NVTX_FLAG} -gt 0 ]; then
## FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
# NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
#else
# NSYSCMD=""
#fi
#if [ ${SYNTH_DATA} -gt 0 ]; then
#EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
#EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
#fi
declare -a CMD
#if [ -n "${SLURM_LOCALID-}" ]; then
# # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
# if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
# CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
# else
# CMD=( ${NSYSCMD} 'python' '-u' )
# fi
#else
# # Mode 2: Single-node Docker, we've been launched with `torch_run`
# # TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
# CMD=( "python" )
#fi
CMD=( "python" )
#if [ "$LOGGER" = "apiLog.sh" ];
#then
# LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
# TODO(ahmadki): track the apiLog.sh bug and remove the workaround
# there is a bug in apiLog.sh preventing it from collecting
# NCCL logs, the workaround is to log a single rank only
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
# readonly node_rank="${SLURM_NODEID:-0}"
# readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
# if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
# then
# LOGGER=$LOGGER
# else
# LOGGER=""
# fi
#fi
PARAMS=(
--lr "${LR}"
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--warmup-epochs "${WARMUP_EPOCHS}"
)
export HIP_VISIBLE_DEVICES=4,5
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
export NCCL_P2P_LEVEL=5
# run training
#${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
python -m torch.distributed.launch --nnodes 1 --nproc_per_node=2 train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"