// https://github.com/vivkin/gason - pulled January 10, 2016
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
enum JsonTag {
    JSON_NUMBER = 0,
    JSON_STRING,
    JSON_ARRAY,
    JSON_OBJECT,
    JSON_TRUE,
    JSON_FALSE,
    JSON_NULL = 0xF
};
struct JsonNode;
#define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL
#define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL
#define JSON_VALUE_TAG_MASK 0xF
#define JSON_VALUE_TAG_SHIFT 47
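// Values are NaN-boxed: a real double is stored as-is, while strings, arrays,
// objects, and literals are encoded as a quiet NaN (JSON_VALUE_NAN_MASK) with
// a 4-bit tag in bits 47..50 and a 47-bit pointer/payload in the low bits.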
union JsonValue {
    uint64_t ival;
    double fval;

    JsonValue(double x)
        : fval(x) {
    }
    JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) {
        assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK);
        ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload;
    }
    bool isDouble() const {
        return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK;
    }
    JsonTag getTag() const {
        return isDouble() ? JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK);
    }
    uint64_t getPayload() const {
        assert(!isDouble());
        return ival & JSON_VALUE_PAYLOAD_MASK;
    }
    double toNumber() const {
        assert(getTag() == JSON_NUMBER);
        return fval;
    }
    char *toString() const {
        assert(getTag() == JSON_STRING);
        return (char *)getPayload();
    }
    JsonNode *toNode() const {
        assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT);
        return (JsonNode *)getPayload();
    }
};
struct JsonNode {
    JsonValue value;
    JsonNode *next;
    char *key;
};
struct JsonIterator {
    JsonNode *p;

    void operator++() {
        p = p->next;
    }
    bool operator!=(const JsonIterator &x) const {
        return p != x.p;
    }
    JsonNode *operator*() const {
        return p;
    }
    JsonNode *operator->() const {
        return p;
    }
};
inline JsonIterator begin(JsonValue o) {
    return JsonIterator{o.toNode()};
}
inline JsonIterator end(JsonValue) {
    return JsonIterator{nullptr};
}
#define JSON_ERRNO_MAP(XX) \
    XX(OK, "ok") \
    XX(BAD_NUMBER, "bad number") \
    XX(BAD_STRING, "bad string") \
    XX(BAD_IDENTIFIER, "bad identifier") \
    XX(STACK_OVERFLOW, "stack overflow") \
    XX(STACK_UNDERFLOW, "stack underflow") \
    XX(MISMATCH_BRACKET, "mismatch bracket") \
    XX(UNEXPECTED_CHARACTER, "unexpected character") \
    XX(UNQUOTED_KEY, "unquoted key") \
    XX(BREAKING_BAD, "breaking bad") \
    XX(ALLOCATION_FAILURE, "allocation failure")
enum JsonErrno {
#define XX(no, str) JSON_##no,
    JSON_ERRNO_MAP(XX)
#undef XX
};
const char *jsonStrError(int err);
class JsonAllocator {
    struct Zone {
        Zone *next;
        size_t used;
    } *head = nullptr;

public:
    JsonAllocator() = default;
    JsonAllocator(const JsonAllocator &) = delete;
    JsonAllocator &operator=(const JsonAllocator &) = delete;
    JsonAllocator(JsonAllocator &&x) : head(x.head) {
        x.head = nullptr;
    }
    JsonAllocator &operator=(JsonAllocator &&x) {
        head = x.head;
        x.head = nullptr;
        return *this;
    }
    ~JsonAllocator() {
        deallocate();
    }
    void *allocate(size_t size);
    void deallocate();
};
int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator);
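// A minimal usage sketch (an assumption for illustration: gason's gason.cpp,
// which defines jsonParse and jsonStrError, is compiled in). Note that
// jsonParse tokenizes in place, so the source buffer must be writable and
// stay alive for as long as the parsed values are used.
//
//     #include <cstdio>
//     void dumpKeys(char *source) {
//         char *endptr;
//         JsonValue value;
//         JsonAllocator allocator;
//         int status = jsonParse(source, &endptr, &value, allocator);
//         if (status != JSON_OK) {
//             fprintf(stderr, "%s at offset %ld\n", jsonStrError(status), (long)(endptr - source));
//             return;
//         }
//         if (value.getTag() == JSON_OBJECT)
//             for (auto node : value)   // begin()/end() above enable range-for
//                 printf("key: %s\n", node->key);
//     }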
/**************************************************************************
* Microsoft COCO Toolbox. version 2.0
* Data, paper, and tutorials available at: http://mscoco.org/
* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
* Licensed under the Simplified BSD License [see coco/license.txt]
**************************************************************************/
#include "maskApi.h"
#include <math.h>
#include <stdlib.h>
uint umin( uint a, uint b ) { return (a<b) ? a : b; }
uint umax( uint a, uint b ) { return (a>b) ? a : b; }
void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) {
R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m);
siz j; if(cnts) for(j=0; j<m; j++) R->cnts[j]=cnts[j];
}
void rleFree( RLE *R ) {
free(R->cnts); R->cnts=0;
}
void rlesInit( RLE **R, siz n ) {
siz i; *R = (RLE*) malloc(sizeof(RLE)*n);
for(i=0; i<n; i++) rleInit((*R)+i,0,0,0,0);
}
void rlesFree( RLE **R, siz n ) {
siz i; for(i=0; i<n; i++) rleFree((*R)+i); free(*R); *R=0;
}
void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) {
siz i, j, k, a=w*h; uint c, *cnts; byte p;
cnts = malloc(sizeof(uint)*(a+1));
for(i=0; i<n; i++) {
const byte *T=M+a*i; k=0; p=0; c=0;
for(j=0; j<a; j++) { if(T[j]!=p) { cnts[k++]=c; c=0; p=T[j]; } c++; }
cnts[k++]=c; rleInit(R+i,h,w,k,cnts);
}
free(cnts);
}
void rleEncodePaste( RLE *R, const byte *M, siz h, siz w, siz n, siz oy, siz ox, siz oh, siz ow ) {
siz i, j, k, a=w*h, lp=ox, rp=ow-(ox+w), tp=oy, bp=oh-(oy+h); uint c, *cnts; byte p;
cnts = malloc(sizeof(uint)*(ow*oh+1));
for(i=0; i<n; i++) {
const byte *T=M+a*i; siz jj=0; k=0; p=0; c=lp*oh+tp;
for(j=0; j<a; j++) {
if(bp+tp > 0) {
if(j-jj == h) {
// completed one column
if((k&1) == 0) {
// add to zero run
c += bp+tp;
} else {
// complete one run
cnts[k++]=c;
// start zero run
c=tp+bp;
p=0;
}
jj = j;
}
}
if(T[j]!=p) {
cnts[k++]=c;
c=0;
p=T[j];
}
c++;
}
if (rp > 0 || bp > 0) {
if((k&1) == 0) {
// add to zero run
c += bp + rp*oh;
} else {
// complete one run
cnts[k++] = c;
c = bp + rp*oh;
p = 0;
}
}
cnts[k++]=c;
rleInit(R+i,oh,ow,k,cnts);
}
free(cnts);
}
void rleDecode( const RLE *R, byte *M, siz n ) {
siz i, j, k; for( i=0; i<n; i++ ) {
byte v=0; for( j=0; j<R[i].m; j++ ) {
for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
}
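/* rleMerge below walks the two current run lists in lockstep: each step
 * consumes umin(ca,cb) pixels, toggles the per-list value bits va/vb as runs
 * are exhausted, and emits a merged count whenever the combined value
 * (va&&vb for intersect, va||vb for union) changes. */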
void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) {
uint *cnts, c, ca, cb, cc, ct; int v, va, vb, vp;
siz i, a, b, h=R[0].h, w=R[0].w, m=R[0].m; RLE A, B;
if(n==0) { rleInit(M,0,0,0,0); return; }
if(n==1) { rleInit(M,h,w,m,R[0].cnts); return; }
cnts = malloc(sizeof(uint)*(h*w+1));
for( a=0; a<m; a++ ) cnts[a]=R[0].cnts[a];
for( i=1; i<n; i++ ) {
B=R[i]; if(B.h!=h||B.w!=w) { h=w=m=0; break; }
rleInit(&A,h,w,m,cnts); ca=A.cnts[0]; cb=B.cnts[0];
v=va=vb=0; m=0; a=b=1; cc=0; ct=1;
while( ct>0 ) {
c=umin(ca,cb); cc+=c; ct=0;
ca-=c; if(!ca && a<A.m) { ca=A.cnts[a++]; va=!va; } ct+=ca;
cb-=c; if(!cb && b<B.m) { cb=B.cnts[b++]; vb=!vb; } ct+=cb;
vp=v; if(intersect) v=va&&vb; else v=va||vb;
if( v!=vp||ct==0 ) { cnts[m++]=cc; cc=0; }
}
rleFree(&A);
}
rleInit(M,h,w,m,cnts); free(cnts);
}
void rleArea( const RLE *R, siz n, uint *a ) {
siz i, j; for( i=0; i<n; i++ ) {
a[i]=0; for( j=1; j<R[i].m; j+=2 ) a[i]+=R[i].cnts[j]; }
}
void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) {
siz g, d; BB db, gb; int crowd;
db=malloc(sizeof(double)*m*4); rleToBbox(dt,db,m);
gb=malloc(sizeof(double)*n*4); rleToBbox(gt,gb,n);
bbIou(db,gb,m,n,iscrowd,o); free(db); free(gb);
for( g=0; g<n; g++ ) for( d=0; d<m; d++ ) if(o[g*m+d]>0) {
crowd=iscrowd!=NULL && iscrowd[g];
if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; }
siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb;
ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0;
cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1;
while( ct>0 ) {
c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0;
ca-=c; if(!ca && a<ka) { ca=dt[d].cnts[a++]; va=!va; } ct+=ca;
cb-=c; if(!cb && b<kb) { cb=gt[g].cnts[b++]; vb=!vb; } ct+=cb;
}
if(i==0) u=1; else if(crowd) rleArea(dt+d,1,&u);
o[g*m+d] = (double)i/(double)u;
}
}
void rleNms( RLE *dt, siz n, uint *keep, double thr ) {
siz i, j; double u;
for( i=0; i<n; i++ ) keep[i]=1;
for( i=0; i<n; i++ ) if(keep[i]) {
for( j=i+1; j<n; j++ ) if(keep[j]) {
rleIou(dt+i,dt+j,1,1,0,&u);
if(u>thr) keep[j]=0;
}
}
}
void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) {
double h, w, i, u, ga, da; siz g, d; int crowd;
for( g=0; g<n; g++ ) {
BB G=gt+g*4; ga=G[2]*G[3]; crowd=iscrowd!=NULL && iscrowd[g];
for( d=0; d<m; d++ ) {
BB D=dt+d*4; da=D[2]*D[3]; o[g*m+d]=0;
w=fmin(D[2]+D[0],G[2]+G[0])-fmax(D[0],G[0]); if(w<=0) continue;
h=fmin(D[3]+D[1],G[3]+G[1])-fmax(D[1],G[1]); if(h<=0) continue;
i=w*h; u = crowd ? da : da+ga-i; o[g*m+d]=i/u;
}
}
}
void bbNms( BB dt, siz n, uint *keep, double thr ) {
siz i, j; double u;
for( i=0; i<n; i++ ) keep[i]=1;
for( i=0; i<n; i++ ) if(keep[i]) {
for( j=i+1; j<n; j++ ) if(keep[j]) {
bbIou(dt+i*4,dt+j*4,1,1,0,&u);
if(u>thr) keep[j]=0;
}
}
}
void rleToBbox( const RLE *R, BB bb, siz n ) {
siz i; for( i=0; i<n; i++ ) {
uint h, w, x, y, xs, ys, xe, ye, xp, cc, t; siz j, m;
h=(uint)R[i].h; w=(uint)R[i].w; m=R[i].m;
m=((siz)(m/2))*2; xs=w; ys=h; xe=ye=0; cc=0;
if(m==0) { bb[4*i+0]=bb[4*i+1]=bb[4*i+2]=bb[4*i+3]=0; continue; }
for( j=0; j<m; j++ ) {
cc+=R[i].cnts[j]; t=cc-j%2; y=t%h; x=(t-y)/h;
if(j%2==0) xp=x; else if(xp<x) { ys=0; ye=h-1; }
xs=umin(xs,x); xe=umax(xe,x); ys=umin(ys,y); ye=umax(ye,y);
}
bb[4*i+0]=xs; bb[4*i+2]=xe-xs+1;
bb[4*i+1]=ys; bb[4*i+3]=ye-ys+1;
}
}
void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) {
siz i; for( i=0; i<n; i++ ) {
double xs=bb[4*i+0], xe=xs+bb[4*i+2];
double ys=bb[4*i+1], ye=ys+bb[4*i+3];
double xy[8] = {xs,ys,xs,ye,xe,ye,xe,ys};
rleFrPoly( R+i, xy, 4, h, w );
}
}
int uintCompare(const void *a, const void *b) {
uint c=*((uint*)a), d=*((uint*)b); return c>d?1:c<d?-1:0;
}
void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) {
/* upsample and get discrete points densely along entire boundary */
siz j, m=0; double scale=5; int *x, *y, *u, *v; uint *a, *b;
x=malloc(sizeof(int)*(k+1)); y=malloc(sizeof(int)*(k+1));
for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
for(j=0; j<k; j++) y[j]=(int)(scale*xy[j*2+1]+.5); y[k]=y[0];
for(j=0; j<k; j++) m+=umax(abs(x[j]-x[j+1]),abs(y[j]-y[j+1]))+1;
u=malloc(sizeof(int)*m); v=malloc(sizeof(int)*m); m=0;
for( j=0; j<k; j++ ) {
int xs=x[j], xe=x[j+1], ys=y[j], ye=y[j+1], dx, dy, t, d;
int flip; double s; dx=abs(xe-xs); dy=abs(ys-ye);
flip = (dx>=dy && xs>xe) || (dx<dy && ys>ye);
if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; }
s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy;
if(dx>=dy) for( d=0; d<=dx; d++ ) {
t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++;
} else for( d=0; d<=dy; d++ ) {
t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++;
}
}
/* get points along y-boundary and downsample */
free(x); free(y); k=m; m=0; double xd, yd;
x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k);
for( j=1; j<k; j++ ) if(u[j]!=u[j-1]) {
xd=(double)(u[j]<u[j-1]?u[j]:u[j]-1); xd=(xd+.5)/scale-.5;
if( floor(xd)!=xd || xd<0 || xd>w-1 ) continue;
yd=(double)(v[j]<v[j-1]?v[j]:v[j-1]); yd=(yd+.5)/scale-.5;
if(yd<0) yd=0; else if(yd>h) yd=h; yd=ceil(yd);
x[m]=(int) xd; y[m]=(int) yd; m++;
}
/* compute rle encoding given y-boundary points */
k=m; a=malloc(sizeof(uint)*(k+1));
for( j=0; j<k; j++ ) a[j]=(uint)(x[j]*(int)(h)+y[j]);
a[k++]=(uint)(h*w); free(u); free(v); free(x); free(y);
qsort(a,k,sizeof(uint),uintCompare); uint p=0;
for( j=0; j<k; j++ ) { uint t=a[j]; a[j]-=p; p=t; }
b=malloc(sizeof(uint)*k); j=m=0; b[m++]=a[j++];
while(j<k) if(a[j]>0) b[m++]=a[j++]; else {
j++; if(j<k) b[m-1]+=a[j++]; }
rleInit(R,h,w,m,b); free(a); free(b);
}
char* rleToString( const RLE *R ) {
/* Similar to LEB128 but using 6 bits/char and ascii chars 48-111. */
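/* Counts from the fourth onward are delta-coded against the count two back
 * (the previous run of the same pixel value). Each value is emitted 5 data
 * bits per character with 0x20 as the continuation flag; bit 0x10 of the
 * last chunk carries the sign, and adding 48 maps chunks to ASCII 48-111. */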
siz i, m=R->m, p=0; long x; int more;
char *s=malloc(sizeof(char)*m*6);
for( i=0; i<m; i++ ) {
x=(long) R->cnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1;
while( more ) {
char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0;
if(more) c |= 0x20; c+=48; s[p++]=c;
}
}
s[p]=0; return s;
}
void rleFrString( RLE *R, char *s, siz h, siz w ) {
siz m=0, p=0, k; long x; int more; uint *cnts;
while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0;
while( s[p] ) {
x=0; k=0; more=1;
while( more ) {
char c=s[p]-48; x |= (c & 0x1f) << 5*k;
more = c & 0x20; p++; k++;
if(!more && (c & 0x10)) x |= -1 << 5*k;
}
if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x;
}
rleInit(R,h,w,m,cnts); free(cnts);
}
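/* A minimal round-trip sketch (illustrative only; the function name is
 * hypothetical): encode a tiny column-major mask, serialize it, parse it
 * back, and decode it again. */
void rleRoundTripExample(void) {
  byte M[6] = {0,0,1,1,0,1};   /* h=3, w=2, stored column-major */
  byte out[6]; RLE R, S;
  rleEncode(&R, M, 3, 2, 1);   /* runs: 2 zeros, 2 ones, 1 zero, 1 one */
  char *s = rleToString(&R);   /* compressed ASCII form */
  rleFrString(&S, s, 3, 2);    /* parse the string back into an RLE */
  rleDecode(&S, out, 1);       /* out now equals M */
  free(s); rleFree(&R); rleFree(&S);
}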
/**************************************************************************
* Microsoft COCO Toolbox. version 2.0
* Data, paper, and tutorials available at: http://mscoco.org/
* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
* Licensed under the Simplified BSD License [see coco/license.txt]
**************************************************************************/
#pragma once
typedef unsigned int uint;
typedef unsigned long siz;
typedef unsigned char byte;
typedef double* BB;
typedef struct { siz h, w, m; uint *cnts; } RLE;
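/* cnts alternates between runs of 0s and 1s over the mask read in
 * column-major order, always starting with a (possibly empty) run of 0s:
 * e.g. a 3x2 mask whose columns read 0,0,1 and 1,0,1 has cnts = {2,2,1,1}. */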
/* Initialize/destroy RLE. */
void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
void rleFree( RLE *R );
/* Initialize/destroy RLE array. */
void rlesInit( RLE **R, siz n );
void rlesFree( RLE **R, siz n );
/* Encode binary masks using RLE. */
void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n );
void rleEncodePaste( RLE *R, const byte *M, siz h, siz w, siz n, siz oy, siz ox, siz oh, siz ow );
/* Decode binary masks encoded via RLE. */
void rleDecode( const RLE *R, byte *mask, siz n );
/* Compute union or intersection of encoded masks. */
void rleMerge( const RLE *R, RLE *M, siz n, int intersect );
/* Compute area of encoded masks. */
void rleArea( const RLE *R, siz n, uint *a );
/* Compute intersection over union between masks. */
void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o );
/* Compute non-maximum suppression between bounding masks */
void rleNms( RLE *dt, siz n, uint *keep, double thr );
/* Compute intersection over union between bounding boxes. */
void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o );
/* Compute non-maximum suppression between bounding boxes */
void bbNms( BB dt, siz n, uint *keep, double thr );
/* Get bounding boxes surrounding encoded masks. */
void rleToBbox( const RLE *R, BB bb, siz n );
/* Convert bounding boxes to encoded masks. */
void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n );
/* Convert polygon to encoded mask. */
void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w );
/* Get compressed string representation of encoded mask. */
char* rleToString( const RLE *R );
/* Convert from compressed string representation of encoded mask. */
void rleFrString( RLE *R, char *s, siz h, siz w );
Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
#!/bin/bash
## DL params
#export BATCHSIZE=${BATCHSIZE:-32}
export BATCHSIZE=${BATCHSIZE:-16}
export NUMEPOCHS=${NUMEPOCHS:-1}
export LR=${LR:-0.000085}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-0}
#export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
#export EXTRA_PARAMS=${EXTRA_PARAMS:-'--frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --skip-metric-loss --sync-after-graph-replay --async-coco'}
#export EXTRA_PARAMS="--frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco"
#export EXTRA_PARAMS="--apex-adam --fp16-allreduce --skip-metric-loss --async-coco"
export EXTRA_PARAMS=""
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=160
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
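## Usage note: these config_*.sh files are meant to be sourced by a launcher
## before training; run_and_time.sh further below, for example, begins with
## `source config_DGXA100_001x08x032.sh`.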
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-4}
export NUMEPOCHS=${NUMEPOCHS:-6}
export LR=${LR:-0.0001}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=8
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=20
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-1}
export NUMEPOCHS=${NUMEPOCHS:-10}
export LR=${LR:-0.0001}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --dali-eval-cache --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=160
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=15
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
#!/bin/bash
## DL params
export BATCHSIZE=${BATCHSIZE:-1}
export NUMEPOCHS=${NUMEPOCHS:-10}
export LR=${LR:-0.000135}
export WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
export EXTRA_PARAMS=${EXTRA_PARAMS:-'--jit --frozen-bn-opt --frozen-bn-fp16 --apex-adam --apex-focal-loss --apex-head-fusion --disable-ddp-broadcast-buffers --fp16-allreduce --reg-head-pad --cls-head-pad --cuda-graphs --dali --dali-matched-idxs --dali-eval --dali-eval-cache --skip-metric-loss --cuda-graphs-syn --sync-after-graph-replay --async-coco'}
## System run parms
export DGXNNODES=256
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME_MINUTES=15
export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES}))
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2  # HT on = 2, HT off = 1
## System data paths
MLPERF_LOGIN_HOST="${MLPERF_LOGIN_HOST:-$(hostname | sed -E 's/-.*$//')}"
MLPERF_HOST_CONFIG=$(dirname "${BASH_SOURCE[0]}")/config_data_"${MLPERF_LOGIN_HOST}".sh
echo "${MLPERF_HOST_CONFIG}"
if [ -f "${MLPERF_HOST_CONFIG}" ]; then
source "${MLPERF_HOST_CONFIG}"
fi
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86")
project(box_iou_plugin LANGUAGES CUDA CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(dali.sysconfig.get_lib_dir())"
OUTPUT_VARIABLE DALI_LIB_DIR)
string(STRIP ${DALI_LIB_DIR} DALI_LIB_DIR)
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(\" \".join(dali.sysconfig.get_compile_flags()))"
OUTPUT_VARIABLE DALI_COMPILE_FLAGS)
string(STRIP ${DALI_COMPILE_FLAGS} DALI_COMPILE_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DALI_COMPILE_FLAGS} ")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${DALI_COMPILE_FLAGS} ")
link_directories("${DALI_LIB_DIR}")
add_library(_box_iou SHARED box_iou.cc box_iou.cu)
target_link_libraries(_box_iou dali)
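# Build/usage sketch (commands illustrative): configuring and building this
# project, e.g. `cmake . && make`, yields lib_box_iou.so, which the Python
# iterator further below loads at runtime via
# nvidia.dali.plugin_manager.load_library('/usr/local/lib/lib_box_iou.so').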
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "box_iou.h"
DALI_SCHEMA(box_iou)
    .DocStr("Compute box IoU")
    .NumInput(2)
    .NumOutput(1);
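// The "box_iou" name registered here is what the Python side resolves once
// the plugin is loaded: the DALI pipeline below calls it as fn.box_iou.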
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include "box_iou.h"
namespace other_ns {
__global__ void box_iou_cuda_kernel(float *box_iou, float4 *box1, float4 *box2, long num_images,
                                    long M, long N, int idxJump) {
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  size_t b1_idx, b2_idx, b1_row_offset, b2_row_offset, im_id, im_offset;
  float xmin1, xmin2, xmax1, xmax2, ymin1, ymin2, ymax1, ymax2;
  float x_tl, y_tl, x_br, y_br, w, h, inter, area1, area2, iou;
  // Grid-stride loop: i enumerates (image, box1, box2) index triples.
  for (long i = idx; i < num_images * M * N; i += idxJump){
    im_id = i / (M * N);
    im_offset = i % (M * N);
    b1_idx = im_offset / N;
    b2_idx = i % N;
    b1_row_offset = im_id * M + b1_idx;
    b2_row_offset = im_id * N + b2_idx;
    xmin1 = box1[b1_row_offset].x;
    ymin1 = box1[b1_row_offset].y;
    xmax1 = box1[b1_row_offset].z;
    ymax1 = box1[b1_row_offset].w;
    xmin2 = box2[b2_row_offset].x;
    ymin2 = box2[b2_row_offset].y;
    xmax2 = box2[b2_row_offset].z;
    ymax2 = box2[b2_row_offset].w;
    if (xmin1 == -1.0 && ymin1 == -1.0 && xmax1 == -1.0 && ymax1 == -1.0) {
      // do not consider padded targets
      box_iou[im_id * M * N + b1_idx * N + b2_idx] = -1;
    } else {
      x_tl = fmaxf(xmin1, xmin2);
      y_tl = fmaxf(ymin1, ymin2);
      x_br = fminf(xmax1, xmax2);
      y_br = fminf(ymax1, ymax2);
      w = (x_br - x_tl) < 0 ? 0.0f : (x_br - x_tl);
      h = (y_br - y_tl) < 0 ? 0.0f : (y_br - y_tl);
      inter = w * h;
      area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
      area2 = (xmax2 - xmin2) * (ymax2 - ymin2);
      iou = inter / (area1 + area2 - inter);
      box_iou[im_id * M * N + b1_idx * N + b2_idx] = iou;
    }
  }
}
template<>
void box_iou<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace &ws) {
  const auto &box1 = ws.Input<::dali::GPUBackend>(0);
  const auto &box2 = ws.Input<::dali::GPUBackend>(1);
  const auto &shape1 = box1.shape();
  const auto &shape2 = box2.shape();
  auto &output = ws.Output<::dali::GPUBackend>(0);
  int minGridSize;
  int blockSize;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize,
                                     &blockSize,
                                     (void*) box_iou_cuda_kernel,
                                     0,  // dynamic memory
                                     0); // maximum utilized threads
  dim3 gridDim(minGridSize);
  dim3 blockDim(blockSize);
  int idxJump = minGridSize * blockSize;
  int numImages = shape1.num_samples();
  for (int sample_idx = 0; sample_idx < numImages; sample_idx++) {
    long M = shape1[sample_idx][0];
    // input 2 (the anchors) is treated as shared across samples:
    // only sample 0 is read here and in the launch below
    long N = shape2[0][0];
    box_iou_cuda_kernel<<<gridDim, blockDim, 0, ws.stream()>>>(
        (float*) output.raw_mutable_tensor(sample_idx),
        (float4*) box1.raw_tensor(sample_idx),
        (float4*) box2.raw_tensor(0),
        1, M, N,
        idxJump);
  }
}
} // namespace other_ns
DALI_REGISTER_OPERATOR(box_iou, ::other_ns::box_iou<::dali::GPUBackend>, ::dali::GPU);
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DALI_BOX_IOU_H_
#define DALI_BOX_IOU_H_
#include <vector>
#include "dali/pipeline/operator/operator.h"
namespace other_ns {
template <typename Backend>
class box_iou : public ::dali::Operator<Backend> {
 public:
  inline explicit box_iou(const ::dali::OpSpec &spec) :
    ::dali::Operator<Backend>(spec) {}

  virtual inline ~box_iou() = default;

  box_iou(const box_iou&) = delete;
  box_iou& operator=(const box_iou&) = delete;
  box_iou(box_iou&&) = delete;
  box_iou& operator=(box_iou&&) = delete;

 protected:
  bool CanInferOutputs() const override {
    return true;
  }

  bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc,
                 const ::dali::workspace_t<Backend> &ws) override {
    const auto &box1 = ws.template Input<Backend>(0);
    const auto &box2 = ws.template Input<Backend>(1);
    auto box1_shape = box1.shape();
    auto box2_shape = box2.shape();
    output_desc.resize(1);
    const int N = box1.num_samples();
    // Per sample, the output is box1's shape with the trailing extent (the
    // 4 box coordinates) replaced by box2's box count, i.e. an MxN IoU matrix.
    output_desc[0].shape = box1_shape;
    for (int i = 0; i < N; i++) {
      output_desc[0].shape.tensor_shape_span(i).back() = box2_shape[i][0];
    }
    output_desc[0].type = box2.type();
    return true;
  }

  void RunImpl(::dali::workspace_t<Backend> &ws) override;
};
} // namespace other_ns
#endif // DALI_BOX_IOU_H_
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86")
project(proposal_matcher_plugin LANGUAGES CUDA CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(dali.sysconfig.get_lib_dir())"
OUTPUT_VARIABLE DALI_LIB_DIR)
string(STRIP ${DALI_LIB_DIR} DALI_LIB_DIR)
execute_process(
COMMAND python -c "import nvidia.dali as dali; print(\" \".join(dali.sysconfig.get_compile_flags()))"
OUTPUT_VARIABLE DALI_COMPILE_FLAGS)
string(STRIP ${DALI_COMPILE_FLAGS} DALI_COMPILE_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DALI_COMPILE_FLAGS} ")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${DALI_COMPILE_FLAGS} ")
link_directories("${DALI_LIB_DIR}")
add_library(_proposal_matcher SHARED proposal_matcher.cc proposal_matcher.cu)
target_link_libraries(_proposal_matcher dali)
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "proposal_matcher.h"
DALI_SCHEMA(proposal_matcher)
    .DocStr("Proposal matcher")
    .NumInput(1)
    .NumOutput(1);
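// As with box_iou, the "proposal_matcher" name registered here is what the
// Python pipeline below invokes as fn.proposal_matcher.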
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include "proposal_matcher.h"
namespace other_ns {
// For each prediction, scan its column of the (gt x preds) IoU matrix and
// assign the best-matching ground-truth index: IoU >= high_th keeps its gt,
// "forgiven" low-quality matches keep their best gt, IoU < low_th maps to -1
// (background), and anything in between maps to -2 (ignored).
__launch_bounds__(256) static __global__
void max_along_gt_idx(float *match, unsigned char *pred_forgiven, long *max_gt_idx, long long gt, long long preds,
                      bool include_low_quality, float low_th, float high_th) {
  long long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  int offset_match_matrix = image_id * preds * gt;
  int offset_preds = image_id * preds;
  if (tid < preds) {
    float max_iou = 0.0f;
    int max_idx = 0;
    float iou;
    for (long long i = 0; i < gt; i++) {
      iou = match[offset_match_matrix + i * preds + tid];
      if (iou > max_iou) { max_iou = iou; max_idx = i; }
    }
    if (max_iou >= high_th) max_gt_idx[offset_preds + tid] = max_idx;
    else if (pred_forgiven[offset_preds + tid] == 1 && include_low_quality) max_gt_idx[offset_preds + tid] = max_idx;
    else if (max_iou < low_th) max_gt_idx[offset_preds + tid] = -1;
    else if (max_iou < high_th) max_gt_idx[offset_preds + tid] = -2;
  }
}
// Warp-synchronous max reduction over the volatile shared-memory buffer.
__device__ void warpReduce(volatile float* sdata, int tid) {
  sdata[tid] = fmax(sdata[tid], sdata[tid + 32]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 16]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 8]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 4]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 2]);
  sdata[tid] = fmax(sdata[tid], sdata[tid + 1]);
}
// Stage 1 of the per-gt max: each block reduces one 2048-wide chunk of one
// gt row of the IoU matrix into inter_gt.
static __global__
void max_along_preds(float* match, float* inter_gt, long long gt, long long preds) {
  int gt_idx = blockIdx.x;
  int chunk_idx = blockIdx.y;
  int image_id = blockIdx.z;
  int num_chunks = (preds + 2047) / 2048;
  int gt_offset = chunk_idx * 2048;
  int start_idx = image_id * preds * gt + gt_idx * preds + gt_offset;
  int idx = threadIdx.x;
  __shared__ float shbuf[1024];
  shbuf[idx] = 0.0f;
  __syncthreads();
  if (gt_offset + idx + 1024 < preds) shbuf[idx] = fmax(match[start_idx + idx], match[start_idx + idx + 1024]);
  else if (gt_offset + idx < preds) shbuf[idx] = match[start_idx + idx];
  __syncthreads();
  if (idx < 512) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 512]);
  __syncthreads();
  if (idx < 256) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 256]);
  __syncthreads();
  if (idx < 128) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 128]);
  __syncthreads();
  if (idx < 64) shbuf[idx] = fmax(shbuf[idx], shbuf[idx + 64]);
  __syncthreads();
  if (idx < 32) warpReduce(shbuf, idx);
  if (idx == 0) inter_gt[image_id * num_chunks * gt + num_chunks * gt_idx + chunk_idx] = shbuf[idx];
}
// Stage 2: reduce the per-chunk maxima into the single best IoU per gt.
__launch_bounds__(256) static __global__
void max_along_preds_reduced(float *match, float *max_preds, long long gt, long long preds) {
  long long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  if (tid < gt) {
    float max_iou = 0.0f;
    float iou;
    for (long long i = 0; i < preds; i++) {
      iou = match[image_id * gt * preds + tid * preds + i];
      if (iou > max_iou) max_iou = iou;
    }
    max_preds[image_id * gt + tid] = max_iou;
  }
}
// Mark a prediction as "forgiven" if it attains some gt's best IoU, so its
// match can be kept when allow_low_quality_matches is set.
__launch_bounds__(256) static __global__
void forgive_preds(float *match_quality_data, float *d_best_pred_per_gt, unsigned char *d_pred_forgiven,
                   long gt, long preds) {
  long tid = blockIdx.x * blockDim.x + threadIdx.x;
  int image_id = blockIdx.y;
  int offset = image_id * gt * preds;
  if (tid < preds) {
    unsigned char forgiven = 0;
    float iou;
    for (int i = 0; i < gt; i++) {
      iou = match_quality_data[offset + i * preds + tid];
      // do not consider predictions from padded targets (iou = -1)
      if ((iou == d_best_pred_per_gt[image_id * gt + i]) && (iou != -1.0)) {
        forgiven = 1;
        break;
      }
    }
    d_pred_forgiven[image_id * preds + tid] = forgiven;
  }
}
template<>
void proposal_matcher<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace &ws) {
  const auto &input = ws.Input<::dali::GPUBackend>(0);
  const auto &shape = input.shape();
  auto &output = ws.Output<::dali::GPUBackend>(0);
  bool allow_low_quality_matches = true;
  float low_th = 0.4;
  float high_th = 0.5;
  int num_images = 1;  // the kernels below are launched once per sample, not once per shape.num_samples()
  for (int sample_idx = 0; sample_idx < shape.num_samples(); sample_idx++) {
    int gt = shape[sample_idx][0];
    long long preds = shape[sample_idx][1];
    float *match_quality_data = (float*) input.raw_tensor(sample_idx);
    int num_chunks = (preds + 2047) / 2048;
    // do an intermediate reduction along all predictions for each gt
    dim3 block(1024, 1, 1);
    dim3 grid(gt, num_chunks, num_images);
    if (allow_low_quality_matches) max_along_preds<<<grid, block, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_intergt,
        gt,
        preds);
    // final reduction to find best iou per gt
    int numThreads = 256;
    int numBlocks = (gt + numThreads - 1) / numThreads;
    dim3 grid2(numBlocks, num_images, 1);
    if (allow_low_quality_matches) max_along_preds_reduced<<<grid2, numThreads, 0, ws.stream()>>>(
        d_intergt,
        d_best_pred_per_gt,
        gt,
        num_chunks);
    numBlocks = (preds + numThreads - 1) / numThreads;
    dim3 grid_preds(numBlocks, num_images, 1);
    // if low-quality matches are allowed, mark some predictions to keep their
    // best matching gt even though iou < threshold
    if (allow_low_quality_matches) forgive_preds<<<grid_preds, numThreads, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_best_pred_per_gt,
        d_pred_forgiven,
        gt,
        preds);
    // compute resulting tensor of indices
    max_along_gt_idx<<<grid_preds, numThreads, 0, ws.stream()>>>(
        (float*) input.raw_tensor(sample_idx),
        d_pred_forgiven,
        (long*) output.raw_mutable_tensor(sample_idx),
        gt,
        preds,
        allow_low_quality_matches,
        low_th,
        high_th);
  }
}
} // namespace other_ns
DALI_REGISTER_OPERATOR(proposal_matcher, ::other_ns::proposal_matcher<::dali::GPUBackend>, ::dali::GPU);
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DALI_PROPOSAL_MATCHER_H_
#define DALI_PROPOSAL_MATCHER_H_
#include <vector>
#include "dali/pipeline/operator/operator.h"
namespace other_ns {
template <typename Backend>
class proposal_matcher : public ::dali::Operator<Backend> {
 public:
  inline explicit proposal_matcher(const ::dali::OpSpec &spec) :
    ::dali::Operator<Backend>(spec) {
    // Scratch buffers are sized for fixed worst-case dimensions (1000
    // ground-truth boxes, 120087 predictions); larger inputs would overflow them.
    int gt = 1000;
    int preds = 120087;
    int num_chunks = (preds + 2047) / 2048;
    cudaMalloc(&d_best_pred_per_gt, gt * sizeof(float));
    cudaMalloc(&d_intergt, gt * num_chunks * sizeof(float));
    cudaMalloc(&d_pred_forgiven, preds * sizeof(unsigned char));
  }

  virtual inline ~proposal_matcher() {
    cudaFree(d_best_pred_per_gt);
    cudaFree(d_intergt);
    cudaFree(d_pred_forgiven);
  }

  proposal_matcher(const proposal_matcher&) = delete;
  proposal_matcher& operator=(const proposal_matcher&) = delete;
  proposal_matcher(proposal_matcher&&) = delete;
  proposal_matcher& operator=(proposal_matcher&&) = delete;

 protected:
  float *d_best_pred_per_gt, *d_intergt;
  unsigned char *d_pred_forgiven;

  bool CanInferOutputs() const override {
    return true;
  }

  bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc,
                 const ::dali::workspace_t<Backend> &ws) override {
    const auto &input = ws.template Input<Backend>(0);
    auto shape = input.shape();
    output_desc.resize(1);
    const int N = input.num_samples();
    output_desc[0].shape = shape;
    for (int i = 0; i < N; i++) {
      output_desc[0].shape.tensor_shape_span(i)[0] = 1;
      output_desc[0].shape.tensor_shape_span(i)[1] = shape[i][1];
    }
    // matched indices are returned as int64, regardless of the input's float type
    output_desc[0].type = dali::DALI_INT64;
    return true;
  }

  void RunImpl(::dali::workspace_t<Backend> &ws) override;
};
} // namespace other_ns
#endif // DALI_PROPOSAL_MATCHER_H_
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import pdb
import numpy as np
import torch
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy
import nvidia.dali.plugin_manager as plugin_manager
plugin_manager.load_library('/usr/local/lib/lib_box_iou.so')
plugin_manager.load_library('/usr/local/lib/lib_proposal_matcher.so')
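# These .so files are the _box_iou and _proposal_matcher CMake targets built
# above; loading them exposes fn.box_iou and fn.proposal_matcher to the pipeline.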
class DaliDataIterator(object):
    def __init__(self, data_path, anno_path, batch_size,
                 num_shards, shard_id, is_training,
                 image_size=(800, 800), num_threads=8, prefetch_queue_depth=2,
                 compute_matched_idxs=False, anchors=None, cpu_decode=False,
                 lazy_init=True, cache=False, seed=-1):
        self.data_path = data_path
        self.anno_path = anno_path
        self.batch_size = batch_size
        self.num_shards = num_shards
        self.shard_id = shard_id
        self.is_training = is_training
        self.compute_matched_idxs = compute_matched_idxs
        self.num_threads = num_threads
        self.seed = seed
        self.lazy_init = lazy_init
        self.image_size = image_size
        self.prefetch_queue_depth = prefetch_queue_depth
        self.cpu_decode = cpu_decode
        self.cache = cache
        self.cache_ready = False
        self.cached_vals = []
        assert not (self.is_training and self.cache), "cache can't be used with training"

        self.pipe = Pipeline(batch_size=self.batch_size,
                             num_threads=self.num_threads,
                             seed=self.seed,
                             device_id=torch.cuda.current_device())
        with self.pipe:
            inputs, bboxes, labels, image_ids = fn.readers.coco(
                name="coco",
                file_root=self.data_path,
                annotations_file=self.anno_path,
                num_shards=self.num_shards,
                shard_id=self.shard_id,
                stick_to_shard=not self.is_training,
                pad_last_batch=not self.is_training,
                lazy_init=self.lazy_init,
                ltrb=True,
                shuffle_after_epoch=self.is_training,
                avoid_class_remapping=True,
                image_ids=True,
                ratio=True,
                prefetch_queue_depth=self.prefetch_queue_depth,
                read_ahead=True,
                skip_empty=False)

            # Images
            images_shape = fn.peek_image_shape(inputs)  # HWC
            if self.cpu_decode:
                images = fn.decoders.image(inputs, device='cpu').gpu()
            else:
                images = fn.decoders.image(inputs, device='mixed')
            if self.is_training:
                flip = fn.random.coin_flip(probability=0.5)
                images = fn.flip(images, horizontal=flip, device='gpu')
            mean = np.array([[[255 * 0.485]], [[255 * 0.456]], [[255 * 0.406]]], dtype=np.float32)
            stddev = np.array([[[255 * 0.229]], [[255 * 0.224]], [[255 * 0.225]]], dtype=np.float32)
            images = fn.normalize(fn.transpose(images, perm=[2, 0, 1]),
                                  axes=[1, 2],
                                  mean=mean,
                                  stddev=stddev)
            images = fn.resize(images, resize_x=self.image_size[0], resize_y=self.image_size[1])

            # Labels
            labels_shape = fn.shapes(labels)
            labels = fn.pad(labels, axes=(0,))
            labels = labels.gpu()
            labels = fn.cast(labels, dtype=types.INT64)

            # BBoxes
            if self.is_training:
                bboxes = fn.bb_flip(bboxes, horizontal=flip, ltrb=True)
            # scale the reader's relative (ratio=True) coordinates to pixels
            lt_x = bboxes[:, 0] * self.image_size[0]
            lt_y = bboxes[:, 1] * self.image_size[1]
            rb_x = bboxes[:, 2] * self.image_size[0]
            rb_y = bboxes[:, 3] * self.image_size[1]
            bboxes = fn.stack(lt_x, lt_y, rb_x, rb_y, axis=1)
            bboxes_shape = fn.shapes(bboxes)
            bboxes = bboxes.gpu()
            if self.compute_matched_idxs:
                self.anchors = anchors[0]
                match_quality_matrix = fn.box_iou(bboxes, self.anchors, device='gpu')
                matched_idxs = fn.proposal_matcher(match_quality_matrix, device='gpu')
            bboxes = fn.pad(bboxes, axes=(0,))

            set_outputs = [images, images_shape, image_ids, bboxes, bboxes_shape, labels, labels_shape]
            if self.compute_matched_idxs:
                set_outputs.append(matched_idxs)
            self.pipe.set_outputs(*set_outputs)
        self.pipe.build()

        output_map = ['images', 'images_shape', 'images_id', 'boxes', 'boxes_shape', 'labels', 'labels_shape']
        if self.compute_matched_idxs:
            output_map.append('matched_idxs')

        # With the data set [1,2,3,4,5,6,7] and the batch size 2:
        # last_batch_policy = LastBatchPolicy.PARTIAL, last_batch_padded = True  -> last batch = [7], next iteration will return [1, 2] <= Validation
        # last_batch_policy = LastBatchPolicy.PARTIAL, last_batch_padded = False -> last batch = [7], next iteration will return [2, 3]
        # last_batch_policy = LastBatchPolicy.FILL, last_batch_padded = True     -> last batch = [7, 7], next iteration will return [1, 2]
        # last_batch_policy = LastBatchPolicy.FILL, last_batch_padded = False    -> last batch = [7, 1], next iteration will return [2, 3] <= Training
        # last_batch_policy = LastBatchPolicy.DROP, last_batch_padded = True     -> last batch = [5, 6], next iteration will return [1, 2]
        # last_batch_policy = LastBatchPolicy.DROP, last_batch_padded = False    -> last batch = [5, 6], next iteration will return [2, 3]
        last_batch_policy = LastBatchPolicy.FILL if self.is_training else LastBatchPolicy.PARTIAL
        self.dali_iter = DALIGenericIterator(pipelines=[self.pipe],
                                             reader_name="coco",
                                             output_map=output_map,
                                             auto_reset=True,
                                             last_batch_policy=last_batch_policy)

    def __len__(self):
        return len(self.dali_iter)

    def __iter__(self):
        # cache-aware iteration: replay cached batches first, then continue from DALI
        if self.cache_ready:
            return iter(self.cached_vals)
        return itertools.chain(self.cached_vals, self.__iter())

    def __iter(self):
        for obj in self.dali_iter:
            obj = obj[0]
            # images
            images = obj['images']
            # targets
            boxes = [b[0][:b[1][0]] for b in zip(obj['boxes'], obj['boxes_shape'])]
            labels = [b[0][:b[1][0]] for b in zip(obj['labels'].to(torch.int64), obj['labels_shape'])]
            image_id = obj['images_id']
            original_image_size = obj['images_shape']
            targets = dict(boxes=boxes, labels=labels, image_id=image_id,
                           original_image_size=original_image_size[:, 0:2])
            if self.compute_matched_idxs:
                matched_idxs = obj['matched_idxs'][:, 0, :]
                targets['matched_idxs'] = matched_idxs
            if self.cache:
                self.cached_vals.append((images, targets))
            yield images, targets
        if self.cache:
            self.cache_ready = True


if __name__ == '__main__':
    device = torch.device(0)
    # dali_iter = DaliDataIterator(data_path='/datasets/open-images-v6-mlperf/train/data',
    #                              anno_path='/datasets/open-images-v6-mlperf/train/labels/openimages-mlperf.json',
    #                              batch_size=8, num_threads=4, num_shards=1, shard_id=0, is_training=True)
    dali_iter = DaliDataIterator(data_path='/datasets/coco2017/train2017',
                                 anno_path='/datasets/coco2017/annotations/instances_train2017.json',
                                 batch_size=2, num_threads=1,
                                 num_shards=1, shard_id=0, is_training=True)
    for images, targets in dali_iter:
        pdb.set_trace()
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
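# One MPI rank per NUMA node: every rank sees all 8 GPUs, while numactl below
# pins each rank's CPU threads and memory allocations to its own node.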
## DL params
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--local_rank "${comm_rank}"
--world-size "${comm_size}"
)
# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
## DL params
export BATCHSIZE=8
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco --no-amp'
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--local_rank "${comm_rank}"
--world-size "${comm_size}"
)
# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# runs benchmark and reports time to convergence
# to use the script:
# run_and_time.sh
set +x
set -e
source config_DGXA100_001x08x032.sh
# Echo commands (xtrace) on local rank 0 only
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
# Set variables
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}
# run benchmark
echo "running benchmark"
#if [ ${NVTX_FLAG} -gt 0 ]; then
## FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
# NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
#else
# NSYSCMD=""
#fi
#if [ ${SYNTH_DATA} -gt 0 ]; then
#EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
#EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
#fi
declare -a CMD
#if [ -n "${SLURM_LOCALID-}" ]; then
# # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
# if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
# CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
# else
# CMD=( ${NSYSCMD} 'python' '-u' )
# fi
#else
# # Mode 2: Single-node Docker, we've been launched with `torch_run`
# # TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
# CMD=( "python" )
#fi
CMD=( "python" )
#if [ "$LOGGER" = "apiLog.sh" ];
#then
# LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
# TODO(ahmadki): track the apiLog.sh bug and remove the workaround
# there is a bug in apiLog.sh preventing it from collecting
# NCCL logs, the workaround is to log a single rank only
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
# readonly node_rank="${SLURM_NODEID:-0}"
# readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
# if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
# then
# LOGGER=$LOGGER
# else
# LOGGER=""
# fi
#fi
PARAMS=(
--lr "${LR}"
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--warmup-epochs "${WARMUP_EPOCHS}"
)
export HIP_VISIBLE_DEVICES=4,5
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
export NCCL_P2P_LEVEL=5
# run training
#${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
python -m torch.distributed.launch --nnodes 1 --nproc_per_node=2 train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"