Commit 404ecbdc authored by zbian's avatar zbian
Browse files

Migrated project

parent 2ebaefc5
Wed Sep 1 01:07:01 CDT 2021
TACC: Starting up job 3476018
TACC: Starting parallel tasks...
warning: variables which starts with __, is a module or class declaration are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
epoch: 0, train loss: 1.9497510997616514
epoch: 0, eval loss: 1.754234939813614, correct: 3521, total: 10000, acc = 0.3520999848842621
epoch: 1, train loss: 1.6049139609142227
epoch: 2, train loss: 1.3857794501343552
epoch: 2, eval loss: 1.2831632316112518, correct: 5410, total: 10000, acc = 0.5410000085830688
epoch: 3, train loss: 1.3016913873808724
epoch: 4, train loss: 1.2616293649284207
epoch: 4, eval loss: 1.2658930838108062, correct: 5409, total: 10000, acc = 0.5408999919891357
epoch: 5, train loss: 1.2320433721250417
epoch: 6, train loss: 1.181612290898148
epoch: 6, eval loss: 1.1402096092700957, correct: 5881, total: 10000, acc = 0.5880999565124512
epoch: 7, train loss: 1.1643818397911228
epoch: 8, train loss: 1.128499301112428
epoch: 8, eval loss: 1.0965303361415863, correct: 6053, total: 10000, acc = 0.6053000092506409
epoch: 9, train loss: 1.114193707704544
epoch: 10, train loss: 1.0830892950904614
epoch: 10, eval loss: 1.0390974164009095, correct: 6258, total: 10000, acc = 0.6258000135421753
epoch: 11, train loss: 1.0508871960396668
epoch: 12, train loss: 1.0322130365031106
epoch: 12, eval loss: 0.9689173698425293, correct: 6482, total: 10000, acc = 0.6481999754905701
epoch: 13, train loss: 1.0006194637746226
epoch: 14, train loss: 0.9652800906677635
epoch: 14, eval loss: 0.9150958389043808, correct: 6713, total: 10000, acc = 0.6712999939918518
epoch: 15, train loss: 0.9430981692002744
epoch: 16, train loss: 0.9156872307767674
epoch: 16, eval loss: 0.8703682094812393, correct: 6913, total: 10000, acc = 0.6912999749183655
epoch: 17, train loss: 0.8822251515729087
epoch: 18, train loss: 0.8485424190151448
epoch: 18, eval loss: 0.8234190821647644, correct: 7120, total: 10000, acc = 0.7119999527931213
epoch: 19, train loss: 0.8285953049757042
epoch: 20, train loss: 0.8009484337300671
epoch: 20, eval loss: 0.7808267176151276, correct: 7228, total: 10000, acc = 0.7227999567985535
epoch: 21, train loss: 0.7774611741912608
epoch: 22, train loss: 0.7435575358721674
epoch: 22, eval loss: 0.7523189872503281, correct: 7367, total: 10000, acc = 0.7366999983787537
epoch: 23, train loss: 0.7315681789602552
epoch: 24, train loss: 0.70117900627
epoch: 24, eval loss: 0.6928718358278274, correct: 7580, total: 10000, acc = 0.7579999566078186
epoch: 25, train loss: 0.677533069435431
epoch: 26, train loss: 0.6627033298112908
epoch: 26, eval loss: 0.6921748876571655, correct: 7586, total: 10000, acc = 0.7585999965667725
epoch: 27, train loss: 0.6410714266251545
epoch: 28, train loss: 0.6192339707394036
epoch: 28, eval loss: 0.6416671514511109, correct: 7719, total: 10000, acc = 0.7718999981880188
epoch: 29, train loss: 0.6093639281331277
epoch: 30, train loss: 0.582532714520182
epoch: 30, eval loss: 0.6166591048240662, correct: 7809, total: 10000, acc = 0.7809000015258789
epoch: 31, train loss: 0.572193189847226
epoch: 32, train loss: 0.5541256200902316
epoch: 32, eval loss: 0.5951347410678863, correct: 7922, total: 10000, acc = 0.792199969291687
epoch: 33, train loss: 0.5345369838938421
epoch: 34, train loss: 0.5273816007740644
epoch: 34, eval loss: 0.5837202191352844, correct: 7972, total: 10000, acc = 0.7971999645233154
epoch: 35, train loss: 0.5059237045292951
epoch: 36, train loss: 0.48622317095192114
epoch: 36, eval loss: 0.5698897138237953, correct: 8024, total: 10000, acc = 0.8023999929428101
epoch: 37, train loss: 0.47362951143663756
epoch: 38, train loss: 0.46030426907296085
epoch: 38, eval loss: 0.5610475659370422, correct: 8049, total: 10000, acc = 0.8048999905586243
epoch: 39, train loss: 0.44165324921510657
epoch: 40, train loss: 0.4327346086502075
epoch: 40, eval loss: 0.5642214670777321, correct: 8095, total: 10000, acc = 0.809499979019165
epoch: 41, train loss: 0.41423581935921494
epoch: 42, train loss: 0.40917488780556893
epoch: 42, eval loss: 0.5602998435497284, correct: 8131, total: 10000, acc = 0.8130999803543091
epoch: 43, train loss: 0.39171184477757437
epoch: 44, train loss: 0.3744060835059808
epoch: 44, eval loss: 0.5633655220270157, correct: 8134, total: 10000, acc = 0.8133999705314636
epoch: 45, train loss: 0.36267226934432983
epoch: 46, train loss: 0.3420030690577565
epoch: 46, eval loss: 0.5533872425556183, correct: 8157, total: 10000, acc = 0.8156999945640564
epoch: 47, train loss: 0.3287143409252167
epoch: 48, train loss: 0.316296321396925
epoch: 48, eval loss: 0.5576229721307755, correct: 8209, total: 10000, acc = 0.8208999633789062
epoch: 49, train loss: 0.3068045072105466
epoch: 50, train loss: 0.2929732614025778
epoch: 50, eval loss: 0.5654072970151901, correct: 8227, total: 10000, acc = 0.8226999640464783
epoch: 51, train loss: 0.2795026940958841
epoch: 52, train loss: 0.26673941375041493
epoch: 52, eval loss: 0.5736668109893799, correct: 8227, total: 10000, acc = 0.8226999640464783
epoch: 53, train loss: 0.2506744866164363
epoch: 54, train loss: 0.24351145980917677
epoch: 54, eval loss: 0.5846156671643257, correct: 8204, total: 10000, acc = 0.8203999996185303
epoch: 55, train loss: 0.2253616195248098
epoch: 56, train loss: 0.2177750574690955
epoch: 56, eval loss: 0.5943332687020302, correct: 8246, total: 10000, acc = 0.8245999813079834
epoch: 57, train loss: 0.20670234989755007
epoch: 58, train loss: 0.1973607996288611
epoch: 58, eval loss: 0.6195310011506081, correct: 8245, total: 10000, acc = 0.8244999647140503
epoch: 59, train loss: 0.19024320448539694
epoch: 60, train loss: 0.17597664877468225
epoch: 60, eval loss: 0.6139472931623459, correct: 8294, total: 10000, acc = 0.8294000029563904
epoch: 61, train loss: 0.1674150490791214
epoch: 62, train loss: 0.15718420511301684
epoch: 62, eval loss: 0.6285309329628944, correct: 8261, total: 10000, acc = 0.8260999917984009
epoch: 63, train loss: 0.1480691913439303
epoch: 64, train loss: 0.1384550367234921
epoch: 64, eval loss: 0.6587671056389809, correct: 8263, total: 10000, acc = 0.8262999653816223
epoch: 65, train loss: 0.13241269834795777
epoch: 66, train loss: 0.12871786830376605
epoch: 66, eval loss: 0.6718123883008957, correct: 8303, total: 10000, acc = 0.830299973487854
epoch: 67, train loss: 0.11577517866176001
epoch: 68, train loss: 0.11130036151378739
epoch: 68, eval loss: 0.6887702852487564, correct: 8332, total: 10000, acc = 0.8331999778747559
epoch: 69, train loss: 0.09883711646710124
epoch: 70, train loss: 0.09635799735480426
epoch: 70, eval loss: 0.7159708231687546, correct: 8307, total: 10000, acc = 0.8306999802589417
epoch: 71, train loss: 0.09449125119313902
epoch: 72, train loss: 0.08857650914210446
epoch: 72, eval loss: 0.7160102307796479, correct: 8351, total: 10000, acc = 0.835099995136261
epoch: 73, train loss: 0.08085554241373831
epoch: 74, train loss: 0.07873564483407809
epoch: 74, eval loss: 0.7119918942451477, correct: 8393, total: 10000, acc = 0.8392999768257141
epoch: 75, train loss: 0.07206312137446841
epoch: 76, train loss: 0.06772394200824962
epoch: 76, eval loss: 0.7328802436590195, correct: 8351, total: 10000, acc = 0.835099995136261
epoch: 77, train loss: 0.061777200397788265
epoch: 78, train loss: 0.05721901174710722
epoch: 78, eval loss: 0.7407010316848754, correct: 8385, total: 10000, acc = 0.8384999632835388
epoch: 79, train loss: 0.056560877406475495
epoch: 80, train loss: 0.0528045150318316
epoch: 80, eval loss: 0.7767532706260681, correct: 8354, total: 10000, acc = 0.8353999853134155
epoch: 81, train loss: 0.050682742870887934
epoch: 82, train loss: 0.04895328068915678
epoch: 82, eval loss: 0.7942879348993301, correct: 8368, total: 10000, acc = 0.8367999792098999
epoch: 83, train loss: 0.04686643050185272
epoch: 84, train loss: 0.04325723648071289
epoch: 84, eval loss: 0.7906839996576309, correct: 8356, total: 10000, acc = 0.835599958896637
epoch: 85, train loss: 0.040166335769605876
epoch: 86, train loss: 0.039296497894945194
epoch: 86, eval loss: 0.8033982694149018, correct: 8376, total: 10000, acc = 0.8375999927520752
epoch: 87, train loss: 0.038185219698566565
epoch: 88, train loss: 0.03735689769441984
epoch: 88, eval loss: 0.8039661139249802, correct: 8377, total: 10000, acc = 0.8376999497413635
epoch: 89, train loss: 0.03383794939145446
epoch: 90, train loss: 0.03318257091034736
epoch: 90, eval loss: 0.8097118645906448, correct: 8389, total: 10000, acc = 0.8388999700546265
epoch: 91, train loss: 0.03290939923109753
epoch: 92, train loss: 0.030776230903456405
epoch: 92, eval loss: 0.8237936168909072, correct: 8401, total: 10000, acc = 0.8400999903678894
epoch: 93, train loss: 0.033349379108344415
epoch: 94, train loss: 0.031906195783189366
epoch: 94, eval loss: 0.8250258564949036, correct: 8401, total: 10000, acc = 0.8400999903678894
epoch: 95, train loss: 0.03031293043334569
epoch: 96, train loss: 0.029958056238460904
epoch: 96, eval loss: 0.8200247555971145, correct: 8402, total: 10000, acc = 0.8402000069618225
epoch: 97, train loss: 0.029532150564981357
epoch: 98, train loss: 0.029668816346295025
epoch: 98, eval loss: 0.821219089627266, correct: 8399, total: 10000, acc = 0.8398999571800232
epoch: 99, train loss: 0.02980129667842875
finish training
TACC: Shutdown complete. Exiting.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pathlib import Path
import pytest
import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py')
def eval(engine):
    """Run one full evaluation pass over the test set.

    NOTE(review): this function shadows the builtin ``eval``; the name is
    kept so existing callers keep working.

    :param engine: colossalai ``Engine`` wrapping model, dataloaders and schedule
    :return: tuple ``(correct_sum, total_sum, avg_loss)`` — number of correct
        predictions, number of evaluated samples, and mean per-step loss
    """
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0
    for _ in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
        # Re-assemble the full logits from the 2D tensor-parallel shards:
        # gather along the row group (dim 1), then the column group (dim 0).
        output = _gather(
            output[0],
            ParallelMode.PARALLEL_2D_ROW,
            1
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2D_COL,
            0,
        )
        output = torch.argmax(output, dim=-1)
        # .item() keeps the running count a plain Python int instead of
        # accumulating 0-dim tensors on the device.
        correct_sum += torch.sum(label[0] == output).item()
        total_sum += label[0].size(0)
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss
def train(engine):
    """Run one training epoch and return the mean per-step loss."""
    engine.train()
    num_steps = engine.schedule.num_steps
    accumulated_loss = 0
    for _ in range(num_steps):
        _, _, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
    return accumulated_loss / num_steps
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
    """Train a ViT with 2D tensor parallelism, logging eval stats every 2 epochs."""
    # init dist
    (model, train_dataloader, test_dataloader, criterion,
     optimizer, schedule, lr_scheduler) = colossalai.initialize(CONFIG_PATH)
    logger = get_global_dist_logger()
    engine = Engine(
        model=model,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        schedule=schedule,
    )
    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)
        logger.info(f'epoch {epoch} - train loss: {train_loss}')
        # evaluate only on even epochs
        if epoch % 2 != 0:
            continue
        correct_sum, total_sum, eval_loss = eval(engine)
        logger.info(
            f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
            f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2d_parallel_vision_transformer()
from pathlib import Path
import pytest
import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py')
def eval(engine):
    """Run one full evaluation pass over the test set.

    NOTE(review): this function shadows the builtin ``eval``; the name is
    kept so existing callers keep working.

    :param engine: colossalai ``Engine`` wrapping model, dataloaders and schedule
    :return: tuple ``(correct_sum, total_sum, avg_loss)`` — number of correct
        predictions, number of evaluated samples, and mean per-step loss
    """
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0
    for _ in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
        # Re-assemble the full logits from the 2.5D tensor-parallel shards:
        # gather row (dim 1), then column (dim 0), then depth (dim 0).
        output = _gather(
            output[0],
            ParallelMode.PARALLEL_2P5D_ROW,
            1
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2P5D_COL,
            0,
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2P5D_DEP,
            0,
        )
        output = torch.argmax(output, dim=-1)
        # .item() keeps the running count a plain Python int instead of
        # accumulating 0-dim tensors on the device.
        correct_sum += torch.sum(label[0] == output).item()
        total_sum += label[0].size(0)
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss
def train(engine):
    """Run one training epoch and return the mean per-step loss."""
    engine.train()
    num_steps = engine.schedule.num_steps
    accumulated_loss = 0
    for _ in range(num_steps):
        _, _, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
    return accumulated_loss / num_steps
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2p5d_parallel_vision_transformer():
    """Train a ViT with 2.5D tensor parallelism, logging eval stats every 2 epochs."""
    # init dist
    (model, train_dataloader, test_dataloader, criterion,
     optimizer, schedule, lr_scheduler) = colossalai.initialize(CONFIG_PATH)
    logger = get_global_dist_logger()
    engine = Engine(
        model=model,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        schedule=schedule,
    )
    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)
        logger.info(f'epoch {epoch} - train loss: {train_loss}')
        # evaluate only on even epochs
        if epoch % 2 != 0:
            continue
        correct_sum, total_sum, eval_loss = eval(engine)
        logger.info(
            f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
            f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2p5d_parallel_vision_transformer()
\ No newline at end of file
TACC: Starting up job 3498212
TACC: Starting parallel tasks...
warning: variables which starts with __, is a module or class declaration are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
epoch: 0, train loss: 1.9590576728995965
epoch: 1, train loss: 1.6275222167676808
epoch: 1, eval loss: 1.5277319371700286, correct: 4435, total: 10000, acc = 0.44349998235702515
epoch: 2, train loss: 1.4355541419009774
epoch: 3, train loss: 1.3253967445723864
epoch: 3, eval loss: 1.309086227416992, correct: 5283, total: 10000, acc = 0.5282999873161316
epoch: 4, train loss: 1.2578775298838714
epoch: 5, train loss: 1.2231916554120121
epoch: 5, eval loss: 1.1699816286563873, correct: 5695, total: 10000, acc = 0.5694999694824219
epoch: 6, train loss: 1.1872552669778162
epoch: 7, train loss: 1.1616783823285783
epoch: 7, eval loss: 1.069484794139862, correct: 6183, total: 10000, acc = 0.6182999610900879
epoch: 8, train loss: 1.1155579333402672
epoch: 9, train loss: 1.0878059365311448
epoch: 9, eval loss: 1.0522838592529298, correct: 6202, total: 10000, acc = 0.620199978351593
epoch: 10, train loss: 1.0780728623575093
epoch: 11, train loss: 1.0522098152004942
epoch: 11, eval loss: 1.0902862310409547, correct: 6148, total: 10000, acc = 0.614799976348877
epoch: 12, train loss: 1.0366473337825464
epoch: 13, train loss: 1.0067467458394108
epoch: 13, eval loss: 0.9696728616952897, correct: 6531, total: 10000, acc = 0.6530999541282654
epoch: 14, train loss: 0.9676224273078295
epoch: 15, train loss: 0.9494374029490412
epoch: 15, eval loss: 0.9511896312236786, correct: 6646, total: 10000, acc = 0.6645999550819397
epoch: 16, train loss: 0.9231320935852674
epoch: 17, train loss: 0.9023846679804276
epoch: 17, eval loss: 0.8728409796953202, correct: 6866, total: 10000, acc = 0.6865999698638916
epoch: 18, train loss: 0.8684309854799387
epoch: 19, train loss: 0.836099565637355
epoch: 19, eval loss: 0.8208363801240921, correct: 7091, total: 10000, acc = 0.7091000080108643
epoch: 20, train loss: 0.8285067890371595
epoch: 21, train loss: 0.7930980793067387
epoch: 21, eval loss: 0.7793890535831451, correct: 7235, total: 10000, acc = 0.7234999537467957
epoch: 22, train loss: 0.762698369366782
epoch: 23, train loss: 0.7376812471418964
epoch: 23, eval loss: 0.746866625547409, correct: 7340, total: 10000, acc = 0.7339999675750732
epoch: 24, train loss: 0.7071484223920472
epoch: 25, train loss: 0.6905171658311572
epoch: 25, eval loss: 0.6909466415643692, correct: 7526, total: 10000, acc = 0.7525999546051025
epoch: 26, train loss: 0.6608500091397033
epoch: 27, train loss: 0.65504517907999
epoch: 27, eval loss: 0.6612646311521531, correct: 7697, total: 10000, acc = 0.7696999907493591
epoch: 28, train loss: 0.6234641969203949
epoch: 29, train loss: 0.6107665622720913
epoch: 29, eval loss: 0.666494044661522, correct: 7704, total: 10000, acc = 0.7703999876976013
epoch: 30, train loss: 0.5875011883219894
epoch: 31, train loss: 0.5739485697478665
epoch: 31, eval loss: 0.6217960953712464, correct: 7828, total: 10000, acc = 0.7827999591827393
epoch: 32, train loss: 0.548510205684876
epoch: 33, train loss: 0.5237194764979032
epoch: 33, eval loss: 0.6254391580820083, correct: 7842, total: 10000, acc = 0.7841999530792236
epoch: 34, train loss: 0.5154265892140719
epoch: 35, train loss: 0.494700480176478
epoch: 35, eval loss: 0.5981663644313813, correct: 7963, total: 10000, acc = 0.7962999939918518
epoch: 36, train loss: 0.4785171020395902
epoch: 37, train loss: 0.46277919259606576
epoch: 37, eval loss: 0.6061880439519882, correct: 7958, total: 10000, acc = 0.795799970626831
epoch: 38, train loss: 0.4398626606075131
epoch: 39, train loss: 0.4206806777083144
epoch: 39, eval loss: 0.6158866941928863, correct: 7959, total: 10000, acc = 0.7958999872207642
epoch: 40, train loss: 0.40768756550185536
epoch: 41, train loss: 0.39494050035671313
epoch: 41, eval loss: 0.5725498422980309, correct: 8132, total: 10000, acc = 0.8131999969482422
epoch: 42, train loss: 0.3742571521778496
epoch: 43, train loss: 0.3583034301290707
epoch: 43, eval loss: 0.5765605017542839, correct: 8155, total: 10000, acc = 0.8154999613761902
epoch: 44, train loss: 0.3342630756752832
epoch: 45, train loss: 0.31316718063792404
epoch: 45, eval loss: 0.583588008582592, correct: 8199, total: 10000, acc = 0.8198999762535095
epoch: 46, train loss: 0.30922748148441315
epoch: 47, train loss: 0.2906164434187266
epoch: 47, eval loss: 0.5934860140085221, correct: 8143, total: 10000, acc = 0.814300000667572
epoch: 48, train loss: 0.2741488078419043
epoch: 49, train loss: 0.2597196321098172
epoch: 49, eval loss: 0.5978868633508683, correct: 8195, total: 10000, acc = 0.8194999694824219
epoch: 50, train loss: 0.2440016470393356
epoch: 51, train loss: 0.2293997729311184
epoch: 51, eval loss: 0.5915440261363983, correct: 8232, total: 10000, acc = 0.823199987411499
epoch: 52, train loss: 0.2132072006257213
epoch: 53, train loss: 0.19785404767917128
epoch: 53, eval loss: 0.6171442106366157, correct: 8258, total: 10000, acc = 0.8258000016212463
epoch: 54, train loss: 0.1838149410121295
epoch: 55, train loss: 0.17691133977199086
epoch: 55, eval loss: 0.623777586221695, correct: 8275, total: 10000, acc = 0.8274999856948853
epoch: 56, train loss: 0.16595362697024735
epoch: 57, train loss: 0.1531825682946614
epoch: 57, eval loss: 0.6466041743755341, correct: 8243, total: 10000, acc = 0.8242999911308289
epoch: 58, train loss: 0.14334788979316243
epoch: 59, train loss: 0.13799503377201605
epoch: 59, eval loss: 0.6496601745486259, correct: 8249, total: 10000, acc = 0.8248999714851379
finish training
c196-011[rtx](1013)$ bash ./test.sh 1 1 1 0.001
TACC: Starting up job 3503164
TACC: Starting parallel tasks...
warning: variables which starts with __, is a module or class declaration are omitted
process rank 0 is bound to device 0
distributed environment is initialized
USE_VANILLA model
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
epoch: 0, train loss: 1.9408839624755236
epoch: 0, eval loss: 1.7896566271781922, correct: 3488, total: 10000, acc = 0.34880000352859497
epoch time: 40.82966494560242
epoch: 1, train loss: 1.6500030257263962
epoch: 1, eval loss: 1.5464953780174255, correct: 4545, total: 10000, acc = 0.4544999897480011
epoch time: 40.01254224777222
epoch: 2, train loss: 1.422887429899099
epoch: 2, eval loss: 1.37536381483078, correct: 5074, total: 10000, acc = 0.5073999762535095
epoch time: 40.107905864715576
epoch: 3, train loss: 1.3217590207956276
epoch: 3, eval loss: 1.3036327004432677, correct: 5377, total: 10000, acc = 0.5376999974250793
epoch time: 40.12306189537048
epoch: 4, train loss: 1.262234352072891
epoch: 4, eval loss: 1.2568134129047395, correct: 5475, total: 10000, acc = 0.5475000143051147
epoch time: 40.10755228996277
epoch: 5, train loss: 1.2381379117771072
epoch: 5, eval loss: 1.1941023647785187, correct: 5676, total: 10000, acc = 0.5676000118255615
epoch time: 40.119303464889526
epoch: 6, train loss: 1.2061052650821453
epoch: 6, eval loss: 1.1313925206661224, correct: 5938, total: 10000, acc = 0.5938000082969666
epoch time: 40.07719683647156
epoch: 7, train loss: 1.1659562563409611
epoch: 7, eval loss: 1.125486546754837, correct: 5958, total: 10000, acc = 0.59579998254776
epoch time: 40.1702299118042
epoch: 8, train loss: 1.1378972846634534
epoch: 8, eval loss: 1.082760637998581, correct: 6102, total: 10000, acc = 0.6101999878883362
epoch time: 40.22099733352661
epoch: 9, train loss: 1.1073276430976635
epoch: 9, eval loss: 1.1077564001083373, correct: 6038, total: 10000, acc = 0.6037999987602234
epoch time: 40.1106858253479
epoch: 10, train loss: 1.087894769347444
epoch: 10, eval loss: 1.0400531351566316, correct: 6311, total: 10000, acc = 0.6310999989509583
epoch time: 40.20973324775696
epoch: 11, train loss: 1.0556547295074075
epoch: 11, eval loss: 1.0295817345380782, correct: 6359, total: 10000, acc = 0.6358999609947205
epoch time: 40.23791980743408
epoch: 12, train loss: 1.0299884901971232
epoch: 12, eval loss: 1.003737959265709, correct: 6380, total: 10000, acc = 0.6380000114440918
epoch time: 40.08779859542847
epoch: 13, train loss: 0.9972386627781148
epoch: 13, eval loss: 0.9707699298858643, correct: 6499, total: 10000, acc = 0.649899959564209
epoch time: 40.10878801345825
epoch: 14, train loss: 0.9784559072280417
epoch: 14, eval loss: 0.9253897607326508, correct: 6641, total: 10000, acc = 0.6640999913215637
epoch time: 40.13168978691101
epoch: 15, train loss: 0.9409253481699495
epoch: 15, eval loss: 0.9120320588350296, correct: 6759, total: 10000, acc = 0.6758999824523926
epoch time: 40.162830114364624
epoch: 16, train loss: 0.925923115136672
epoch: 16, eval loss: 0.8850776582956315, correct: 6870, total: 10000, acc = 0.6869999766349792
epoch time: 40.145774602890015
epoch: 17, train loss: 0.8923340841215484
epoch: 17, eval loss: 0.8570599347352982, correct: 6950, total: 10000, acc = 0.6949999928474426
epoch time: 40.18058943748474
epoch: 18, train loss: 0.8638542884466599
epoch: 18, eval loss: 0.838410159945488, correct: 6971, total: 10000, acc = 0.6970999836921692
epoch time: 40.110822439193726
epoch: 19, train loss: 0.8400422529298432
epoch: 19, eval loss: 0.8189669162034988, correct: 7097, total: 10000, acc = 0.7096999883651733
epoch time: 40.066970109939575
epoch: 20, train loss: 0.8072922752828015
epoch: 20, eval loss: 0.7772788077592849, correct: 7240, total: 10000, acc = 0.7239999771118164
epoch time: 40.045086145401
epoch: 21, train loss: 0.788195074821005
epoch: 21, eval loss: 0.7793144911527634, correct: 7261, total: 10000, acc = 0.726099967956543
epoch time: 40.05983781814575
epoch: 22, train loss: 0.7574447350842612
epoch: 22, eval loss: 0.7660320281982422, correct: 7272, total: 10000, acc = 0.7271999716758728
epoch time: 40.11693739891052
epoch: 23, train loss: 0.7402738150285215
epoch: 23, eval loss: 0.7264292597770691, correct: 7418, total: 10000, acc = 0.7418000102043152
epoch time: 40.18724513053894
epoch: 24, train loss: 0.7125097580102026
epoch: 24, eval loss: 0.7105035990476608, correct: 7506, total: 10000, acc = 0.7505999803543091
epoch time: 40.1254940032959
epoch: 25, train loss: 0.6900304744438249
epoch: 25, eval loss: 0.6911167114973068, correct: 7562, total: 10000, acc = 0.7561999559402466
epoch time: 40.103896617889404
epoch: 26, train loss: 0.6648721482072558
epoch: 26, eval loss: 0.6780407190322876, correct: 7624, total: 10000, acc = 0.7623999714851379
epoch time: 40.18161463737488
epoch: 27, train loss: 0.6446310062797702
epoch: 27, eval loss: 0.6820667266845704, correct: 7612, total: 10000, acc = 0.761199951171875
epoch time: 40.19018864631653
epoch: 28, train loss: 0.6262476389505425
epoch: 28, eval loss: 0.6506347745656967, correct: 7704, total: 10000, acc = 0.7703999876976013
epoch time: 40.23526978492737
epoch: 29, train loss: 0.5968854001590184
epoch: 29, eval loss: 0.6507940381765366, correct: 7727, total: 10000, acc = 0.7726999521255493
epoch time: 40.26889181137085
epoch: 30, train loss: 0.587430303194085
epoch: 30, eval loss: 0.6333519726991653, correct: 7788, total: 10000, acc = 0.7787999510765076
epoch time: 40.28285789489746
epoch: 31, train loss: 0.5701514035463333
epoch: 31, eval loss: 0.6348810195922852, correct: 7799, total: 10000, acc = 0.7798999547958374
epoch time: 40.199995040893555
epoch: 32, train loss: 0.5482188679125845
epoch: 32, eval loss: 0.6192457497119903, correct: 7833, total: 10000, acc = 0.78329998254776
epoch time: 40.270729780197144
epoch: 33, train loss: 0.534268391375639
epoch: 33, eval loss: 0.6381673783063888, correct: 7790, total: 10000, acc = 0.7789999842643738
epoch time: 40.36342120170593
epoch: 34, train loss: 0.5104483384258893
epoch: 34, eval loss: 0.6173199415206909, correct: 7867, total: 10000, acc = 0.7866999506950378
epoch time: 40.34266257286072
epoch: 35, train loss: 0.4968841674984718
epoch: 35, eval loss: 0.604002220928669, correct: 7916, total: 10000, acc = 0.7915999889373779
epoch time: 40.39444589614868
epoch: 36, train loss: 0.4773432207959039
epoch: 36, eval loss: 0.5884111285209656, correct: 7965, total: 10000, acc = 0.7964999675750732
epoch time: 40.40647268295288
epoch: 37, train loss: 0.4621481445370888
epoch: 37, eval loss: 0.5748852327466011, correct: 8047, total: 10000, acc = 0.8046999573707581
epoch time: 40.29281520843506
epoch: 38, train loss: 0.4431859048045411
epoch: 38, eval loss: 0.5874941781163215, correct: 7995, total: 10000, acc = 0.7994999885559082
epoch time: 40.40029954910278
epoch: 39, train loss: 0.4305852785402415
epoch: 39, eval loss: 0.5991648495197296, correct: 7972, total: 10000, acc = 0.7971999645233154
epoch time: 40.399904012680054
epoch: 40, train loss: 0.4092241589512144
epoch: 40, eval loss: 0.5725525215268135, correct: 8069, total: 10000, acc = 0.8068999648094177
epoch time: 40.32663059234619
epoch: 41, train loss: 0.39218547179990887
epoch: 41, eval loss: 0.5886161357164383, correct: 8068, total: 10000, acc = 0.8068000078201294
epoch time: 40.32424521446228
epoch: 42, train loss: 0.3773612398274091
epoch: 42, eval loss: 0.5762413635849952, correct: 8126, total: 10000, acc = 0.8125999569892883
epoch time: 40.44430422782898
epoch: 43, train loss: 0.3593267098981507
epoch: 43, eval loss: 0.5729024946689606, correct: 8107, total: 10000, acc = 0.810699999332428
epoch time: 40.488121032714844
epoch: 44, train loss: 0.3396431426612698
epoch: 44, eval loss: 0.5944831907749176, correct: 8072, total: 10000, acc = 0.8071999549865723
epoch time: 40.41803979873657
epoch: 45, train loss: 0.32412939716358574
epoch: 45, eval loss: 0.5849291861057282, correct: 8171, total: 10000, acc = 0.8170999884605408
epoch time: 40.428131341934204
epoch: 46, train loss: 0.3099915471916296
epoch: 46, eval loss: 0.5797522723674774, correct: 8121, total: 10000, acc = 0.8120999932289124
epoch time: 40.623990058898926
epoch: 47, train loss: 0.29422828676749246
epoch: 47, eval loss: 0.5898703813552857, correct: 8175, total: 10000, acc = 0.8174999952316284
epoch time: 40.71224045753479
epoch: 48, train loss: 0.27581544600579205
epoch: 48, eval loss: 0.5950756087899208, correct: 8170, total: 10000, acc = 0.8169999718666077
epoch time: 40.53409385681152
epoch: 49, train loss: 0.26118586242807157
epoch: 49, eval loss: 0.5998703584074974, correct: 8213, total: 10000, acc = 0.8212999701499939
epoch time: 40.564385175704956
epoch: 50, train loss: 0.2513351797753451
epoch: 50, eval loss: 0.6011391341686249, correct: 8226, total: 10000, acc = 0.8226000070571899
epoch time: 40.55033254623413
epoch: 51, train loss: 0.22965944299892505
epoch: 51, eval loss: 0.5979882061481476, correct: 8233, total: 10000, acc = 0.8233000040054321
epoch time: 40.54532980918884
epoch: 52, train loss: 0.21661002188920975
epoch: 52, eval loss: 0.6121026620268821, correct: 8220, total: 10000, acc = 0.8219999670982361
epoch time: 40.649473667144775
epoch: 53, train loss: 0.20266114950788264
epoch: 53, eval loss: 0.6016955643892288, correct: 8260, total: 10000, acc = 0.8259999752044678
epoch time: 40.752054929733276
epoch: 54, train loss: 0.19287180794136866
epoch: 54, eval loss: 0.6043265879154205, correct: 8284, total: 10000, acc = 0.8283999562263489
epoch time: 40.68043255805969
epoch: 55, train loss: 0.175087109208107
epoch: 55, eval loss: 0.6146622076630592, correct: 8316, total: 10000, acc = 0.8315999507904053
epoch time: 40.58446717262268
epoch: 56, train loss: 0.16749868762432313
epoch: 56, eval loss: 0.6235148012638092, correct: 8313, total: 10000, acc = 0.8312999606132507
epoch time: 40.62826180458069
epoch: 57, train loss: 0.15567801619062618
epoch: 57, eval loss: 0.6325852945446968, correct: 8308, total: 10000, acc = 0.8307999968528748
epoch time: 40.72224497795105
epoch: 58, train loss: 0.1484297229623308
epoch: 58, eval loss: 0.6329193383455276, correct: 8325, total: 10000, acc = 0.8324999809265137
epoch time: 40.750558614730835
epoch: 59, train loss: 0.14238623818572688
epoch: 59, eval loss: 0.6318104699254036, correct: 8329, total: 10000, acc = 0.8328999876976013
epoch time: 40.77172636985779
finish training
\ No newline at end of file
TACC: Starting up job 3498663
TACC: Starting parallel tasks...
warning: variables which starts with __, is a module or class declaration are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
epoch: 0, train loss: 2.095031557034473
epoch: 1, train loss: 1.8454539605549403
epoch: 1, eval loss: 1.7768513083457946, correct: 3564, total: 10000, acc = 0.3563999831676483
epoch: 2, train loss: 1.7044833728245325
epoch: 3, train loss: 1.5999061124665397
epoch: 3, eval loss: 1.5574450254440309, correct: 4389, total: 10000, acc = 0.4388999938964844
epoch: 4, train loss: 1.4929670217085858
epoch: 5, train loss: 1.401450170546162
epoch: 5, eval loss: 1.4644017696380616, correct: 4857, total: 10000, acc = 0.48569998145103455
epoch: 6, train loss: 1.319102376091237
epoch: 7, train loss: 1.2555806539496597
epoch: 7, eval loss: 1.2475590467453004, correct: 5486, total: 10000, acc = 0.5485999584197998
epoch: 8, train loss: 1.1992503173497258
epoch: 9, train loss: 1.1600336493278036
epoch: 9, eval loss: 1.1786625683307648, correct: 5834, total: 10000, acc = 0.5834000110626221
epoch: 10, train loss: 1.1214540807568296
epoch: 11, train loss: 1.0808329728184913
epoch: 11, eval loss: 1.096825110912323, correct: 6072, total: 10000, acc = 0.6071999669075012
epoch: 12, train loss: 1.0521019423494533
epoch: 13, train loss: 1.0262362957000732
epoch: 13, eval loss: 1.056444275379181, correct: 6268, total: 10000, acc = 0.626800000667572
epoch: 14, train loss: 0.9932536555796253
epoch: 15, train loss: 0.9653559442685575
epoch: 15, eval loss: 0.9576991081237793, correct: 6582, total: 10000, acc = 0.6581999659538269
epoch: 16, train loss: 0.9465620943478176
epoch: 17, train loss: 0.9181081974992946
epoch: 17, eval loss: 0.9245584070682525, correct: 6747, total: 10000, acc = 0.6746999621391296
epoch: 18, train loss: 0.8987109752333894
epoch: 19, train loss: 0.8840238646585115
epoch: 19, eval loss: 0.8989996433258056, correct: 6787, total: 10000, acc = 0.6786999702453613
epoch: 20, train loss: 0.8591911811001447
epoch: 21, train loss: 0.843510093129411
epoch: 21, eval loss: 0.8595858901739121, correct: 6969, total: 10000, acc = 0.6969000101089478
epoch: 22, train loss: 0.8306782276046519
epoch: 23, train loss: 0.8181647640101763
epoch: 23, eval loss: 0.8600298583507537, correct: 7005, total: 10000, acc = 0.7005000114440918
epoch: 24, train loss: 0.7964763343334198
epoch: 25, train loss: 0.7840689718723297
epoch: 25, eval loss: 0.824479615688324, correct: 7073, total: 10000, acc = 0.7073000073432922
epoch: 26, train loss: 0.7709570752114666
epoch: 27, train loss: 0.7591698108887186
epoch: 27, eval loss: 0.7967212647199631, correct: 7196, total: 10000, acc = 0.7195999622344971
epoch: 28, train loss: 0.7438001352913526
epoch: 29, train loss: 0.7341659853653032
epoch: 29, eval loss: 0.8041222035884857, correct: 7168, total: 10000, acc = 0.7167999744415283
epoch: 30, train loss: 0.7254330929444761
epoch: 31, train loss: 0.710246913895315
epoch: 31, eval loss: 0.7848481118679047, correct: 7287, total: 10000, acc = 0.7286999821662903
epoch: 32, train loss: 0.6976562008565786
epoch: 33, train loss: 0.6906438475968887
epoch: 33, eval loss: 0.7644171923398971, correct: 7370, total: 10000, acc = 0.7369999885559082
epoch: 34, train loss: 0.6795850834067987
epoch: 35, train loss: 0.6724951656497254
epoch: 35, eval loss: 0.7515032321214676, correct: 7368, total: 10000, acc = 0.736799955368042
epoch: 36, train loss: 0.6527298372619006
epoch: 37, train loss: 0.651018523440069
epoch: 37, eval loss: 0.7381327033042908, correct: 7449, total: 10000, acc = 0.7448999881744385
epoch: 38, train loss: 0.6365304406808348
epoch: 39, train loss: 0.6372388047831399
epoch: 39, eval loss: 0.7342826008796692, correct: 7453, total: 10000, acc = 0.7452999949455261
epoch: 40, train loss: 0.6199644664112403
epoch: 41, train loss: 0.6101092303894005
epoch: 41, eval loss: 0.7353240340948105, correct: 7466, total: 10000, acc = 0.7465999722480774
epoch: 42, train loss: 0.6093496211937496
epoch: 43, train loss: 0.6019633388032719
epoch: 43, eval loss: 0.7350291252136231, correct: 7479, total: 10000, acc = 0.7479000091552734
epoch: 44, train loss: 0.5928211437196148
epoch: 45, train loss: 0.5840530048827736
epoch: 45, eval loss: 0.7301350146532058, correct: 7525, total: 10000, acc = 0.7524999976158142
epoch: 46, train loss: 0.578370426078232
epoch: 47, train loss: 0.5703256440405943
epoch: 47, eval loss: 0.7226948082447052, correct: 7526, total: 10000, acc = 0.7525999546051025
epoch: 48, train loss: 0.5622531275968162
epoch: 49, train loss: 0.5543749076979501
epoch: 49, eval loss: 0.7278151929378509, correct: 7536, total: 10000, acc = 0.753600001335144
epoch: 50, train loss: 0.5494355583677486
epoch: 51, train loss: 0.5427058047177841
epoch: 51, eval loss: 0.7180711388587951, correct: 7608, total: 10000, acc = 0.7608000040054321
epoch: 52, train loss: 0.5323820530760045
epoch: 53, train loss: 0.5341374232452742
epoch: 53, eval loss: 0.7136827558279037, correct: 7618, total: 10000, acc = 0.7617999911308289
epoch: 54, train loss: 0.5295403867351766
epoch: 55, train loss: 0.5226148692320804
epoch: 55, eval loss: 0.7158426463603973, correct: 7624, total: 10000, acc = 0.7623999714851379
epoch: 56, train loss: 0.5206544593888887
epoch: 57, train loss: 0.5186455438331682
epoch: 57, eval loss: 0.7141193479299546, correct: 7611, total: 10000, acc = 0.7610999941825867
epoch: 58, train loss: 0.5130856335163116
epoch: 59, train loss: 0.5103850683995655
epoch: 59, eval loss: 0.7077989399433136, correct: 7628, total: 10000, acc = 0.7627999782562256
finish training
c196-012[rtx](1006)$ bash ./test.sh 1 1 1 0.0001
TACC: Starting up job 3503177
TACC: Starting parallel tasks...
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 0 is bound to device 0
distributed environment is initialized
USE_VANILLA model
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
epoch: 0, train loss: 2.07912605757616
epoch: 0, eval loss: 1.9337591707706452, correct: 2845, total: 10000, acc = 0.28450000286102295
epoch time: 48.79993748664856
epoch: 1, train loss: 1.8506990890113675
epoch: 1, eval loss: 1.7832269430160523, correct: 3506, total: 10000, acc = 0.350600004196167
epoch time: 39.10968255996704
epoch: 2, train loss: 1.707400695401795
epoch: 2, eval loss: 1.6983122050762176, correct: 3935, total: 10000, acc = 0.3935000002384186
epoch time: 39.205119609832764
epoch: 3, train loss: 1.5925798574272467
epoch: 3, eval loss: 1.6361137092113496, correct: 4276, total: 10000, acc = 0.4275999963283539
epoch time: 39.220152378082275
epoch: 4, train loss: 1.4817699790000916
epoch: 4, eval loss: 1.4869949519634247, correct: 4706, total: 10000, acc = 0.4705999791622162
epoch time: 39.297648191452026
epoch: 5, train loss: 1.3685331247290786
epoch: 5, eval loss: 1.4110832333564758, correct: 5043, total: 10000, acc = 0.5042999982833862
epoch time: 39.31484127044678
epoch: 6, train loss: 1.283743022655954
epoch: 6, eval loss: 1.317776972055435, correct: 5320, total: 10000, acc = 0.5320000052452087
epoch time: 39.31891870498657
epoch: 7, train loss: 1.2292176107971036
epoch: 7, eval loss: 1.2397323846817017, correct: 5619, total: 10000, acc = 0.5618999600410461
epoch time: 39.31014013290405
epoch: 8, train loss: 1.1705418606193698
epoch: 8, eval loss: 1.2041720151901245, correct: 5696, total: 10000, acc = 0.569599986076355
epoch time: 39.29190945625305
epoch: 9, train loss: 1.1253369718181843
epoch: 9, eval loss: 1.1219275832176208, correct: 6039, total: 10000, acc = 0.6038999557495117
epoch time: 39.314892053604126
epoch: 10, train loss: 1.0875617825255102
epoch: 10, eval loss: 1.1398449420928956, correct: 5921, total: 10000, acc = 0.5920999646186829
epoch time: 39.29768466949463
epoch: 11, train loss: 1.055325626110544
epoch: 11, eval loss: 1.0739773243665696, correct: 6212, total: 10000, acc = 0.6211999654769897
epoch time: 39.26834416389465
epoch: 12, train loss: 1.0238730627663282
epoch: 12, eval loss: 1.0526267528533935, correct: 6244, total: 10000, acc = 0.6243999600410461
epoch time: 39.30522894859314
epoch: 13, train loss: 0.9906492087305808
epoch: 13, eval loss: 1.0342225402593612, correct: 6295, total: 10000, acc = 0.6294999718666077
epoch time: 39.28985071182251
epoch: 14, train loss: 0.968360669758855
epoch: 14, eval loss: 0.9747557610273361, correct: 6498, total: 10000, acc = 0.6498000025749207
epoch time: 39.33563685417175
epoch: 15, train loss: 0.9413909072778663
epoch: 15, eval loss: 0.9359912216663361, correct: 6659, total: 10000, acc = 0.6658999919891357
epoch time: 39.332377672195435
epoch: 16, train loss: 0.9215109226654987
epoch: 16, eval loss: 0.9215879321098328, correct: 6693, total: 10000, acc = 0.6692999601364136
epoch time: 39.35148882865906
epoch: 17, train loss: 0.9036085179873875
epoch: 17, eval loss: 0.8947311192750931, correct: 6787, total: 10000, acc = 0.6786999702453613
epoch time: 39.31995511054993
epoch: 18, train loss: 0.8774841433885147
epoch: 18, eval loss: 0.8880111247301101, correct: 6844, total: 10000, acc = 0.6843999624252319
epoch time: 39.32100558280945
epoch: 19, train loss: 0.8607137598553483
epoch: 19, eval loss: 0.8770220369100571, correct: 6883, total: 10000, acc = 0.6882999539375305
epoch time: 39.3321533203125
epoch: 20, train loss: 0.8482279163234088
epoch: 20, eval loss: 0.8661656975746155, correct: 6926, total: 10000, acc = 0.6926000118255615
epoch time: 39.319167613983154
epoch: 21, train loss: 0.8280732814146547
epoch: 21, eval loss: 0.8369802534580231, correct: 7041, total: 10000, acc = 0.7040999531745911
epoch time: 39.32543706893921
epoch: 22, train loss: 0.8162973212952517
epoch: 22, eval loss: 0.8281545102596283, correct: 7096, total: 10000, acc = 0.7095999717712402
epoch time: 39.344929695129395
epoch: 23, train loss: 0.8043988426120914
epoch: 23, eval loss: 0.8369941651821137, correct: 7070, total: 10000, acc = 0.7069999575614929
epoch time: 39.342397928237915
epoch: 24, train loss: 0.788704516328111
epoch: 24, eval loss: 0.8305304765701294, correct: 7040, total: 10000, acc = 0.7039999961853027
epoch time: 39.349589347839355
epoch: 25, train loss: 0.7747861517935383
epoch: 25, eval loss: 0.8025588423013688, correct: 7164, total: 10000, acc = 0.7163999676704407
epoch time: 39.35692596435547
epoch: 26, train loss: 0.7557641073149077
epoch: 26, eval loss: 0.7929455429315567, correct: 7204, total: 10000, acc = 0.7203999757766724
epoch time: 39.36091661453247
epoch: 27, train loss: 0.7422851062550837
epoch: 27, eval loss: 0.7790816932916641, correct: 7249, total: 10000, acc = 0.7249000072479248
epoch time: 39.355828046798706
epoch: 28, train loss: 0.7305653861590794
epoch: 28, eval loss: 0.7937072366476059, correct: 7204, total: 10000, acc = 0.7203999757766724
epoch time: 39.3598473072052
epoch: 29, train loss: 0.719313730998915
epoch: 29, eval loss: 0.7657937437295914, correct: 7320, total: 10000, acc = 0.7319999933242798
epoch time: 39.353551626205444
epoch: 30, train loss: 0.7127084263733455
epoch: 30, eval loss: 0.7556168884038925, correct: 7341, total: 10000, acc = 0.7340999841690063
epoch time: 39.37097501754761
epoch: 31, train loss: 0.7044506967067719
epoch: 31, eval loss: 0.7438590109348298, correct: 7359, total: 10000, acc = 0.7358999848365784
epoch time: 39.37364745140076
epoch: 32, train loss: 0.6920064693810989
epoch: 32, eval loss: 0.7408553540706635, correct: 7419, total: 10000, acc = 0.7418999671936035
epoch time: 39.372353076934814
epoch: 33, train loss: 0.6790882920732304
epoch: 33, eval loss: 0.7541307628154754, correct: 7332, total: 10000, acc = 0.733199954032898
epoch time: 39.310251235961914
epoch: 34, train loss: 0.6666433202977083
epoch: 34, eval loss: 0.7413494348526001, correct: 7401, total: 10000, acc = 0.7400999665260315
epoch time: 39.394805908203125
epoch: 35, train loss: 0.6561720742254841
epoch: 35, eval loss: 0.7245241671800613, correct: 7483, total: 10000, acc = 0.7482999563217163
epoch time: 39.34455704689026
epoch: 36, train loss: 0.6433814526820669
epoch: 36, eval loss: 0.7294039458036423, correct: 7483, total: 10000, acc = 0.7482999563217163
epoch time: 39.337549924850464
epoch: 37, train loss: 0.6366085136423305
epoch: 37, eval loss: 0.7336494833230972, correct: 7462, total: 10000, acc = 0.7461999654769897
epoch time: 39.338196754455566
epoch: 38, train loss: 0.6294400272320728
epoch: 38, eval loss: 0.719609409570694, correct: 7532, total: 10000, acc = 0.7531999945640564
epoch time: 39.33430027961731
epoch: 39, train loss: 0.6179663903859197
epoch: 39, eval loss: 0.7210630685091018, correct: 7507, total: 10000, acc = 0.7506999969482422
epoch time: 39.33643341064453
epoch: 40, train loss: 0.6102935781284254
epoch: 40, eval loss: 0.6994094282388688, correct: 7569, total: 10000, acc = 0.7568999528884888
epoch time: 39.38672637939453
epoch: 41, train loss: 0.5990810029360712
epoch: 41, eval loss: 0.7133035778999328, correct: 7550, total: 10000, acc = 0.7549999952316284
epoch time: 39.374757528305054
epoch: 42, train loss: 0.5964441865074391
epoch: 42, eval loss: 0.7060712993144989, correct: 7577, total: 10000, acc = 0.7576999664306641
epoch time: 39.4019033908844
epoch: 43, train loss: 0.5878602710305428
epoch: 43, eval loss: 0.7106044471263886, correct: 7580, total: 10000, acc = 0.7579999566078186
epoch time: 39.408252477645874
epoch: 44, train loss: 0.5797601254010687
epoch: 44, eval loss: 0.7093768745660782, correct: 7568, total: 10000, acc = 0.7567999958992004
epoch time: 39.40289378166199
epoch: 45, train loss: 0.5684604742089097
epoch: 45, eval loss: 0.7075642883777619, correct: 7612, total: 10000, acc = 0.761199951171875
epoch time: 39.35792422294617
epoch: 46, train loss: 0.5617077308041709
epoch: 46, eval loss: 0.707081851363182, correct: 7576, total: 10000, acc = 0.7576000094413757
epoch time: 39.37784481048584
epoch: 47, train loss: 0.5572127462649832
epoch: 47, eval loss: 0.7069586098194123, correct: 7606, total: 10000, acc = 0.7605999708175659
epoch time: 39.33794188499451
epoch: 48, train loss: 0.5519619742218329
epoch: 48, eval loss: 0.6923990368843078, correct: 7679, total: 10000, acc = 0.7678999900817871
epoch time: 39.39500594139099
epoch: 49, train loss: 0.5454421751961416
epoch: 49, eval loss: 0.7032370567321777, correct: 7626, total: 10000, acc = 0.7626000046730042
epoch time: 39.38570594787598
epoch: 50, train loss: 0.5419908360559114
epoch: 50, eval loss: 0.6949253618717194, correct: 7669, total: 10000, acc = 0.7669000029563904
epoch time: 39.334325551986694
epoch: 51, train loss: 0.5299993215166793
epoch: 51, eval loss: 0.6966427147388459, correct: 7654, total: 10000, acc = 0.7653999924659729
epoch time: 39.337984561920166
epoch: 52, train loss: 0.5282451452649369
epoch: 52, eval loss: 0.6932955116033555, correct: 7664, total: 10000, acc = 0.7663999795913696
epoch time: 39.34237813949585
epoch: 53, train loss: 0.5234840703862054
epoch: 53, eval loss: 0.6988086104393005, correct: 7654, total: 10000, acc = 0.7653999924659729
epoch time: 39.364726066589355
epoch: 54, train loss: 0.5139317989957576
epoch: 54, eval loss: 0.6950253814458847, correct: 7643, total: 10000, acc = 0.7642999887466431
epoch time: 39.40451097488403
epoch: 55, train loss: 0.5158528734226616
epoch: 55, eval loss: 0.6978882610797882, correct: 7672, total: 10000, acc = 0.7671999931335449
epoch time: 39.38926696777344
epoch: 56, train loss: 0.5082419429506574
epoch: 56, eval loss: 0.6909049898386002, correct: 7692, total: 10000, acc = 0.7691999673843384
epoch time: 39.42493271827698
epoch: 57, train loss: 0.5027476120360044
epoch: 57, eval loss: 0.6897687911987305, correct: 7695, total: 10000, acc = 0.7694999575614929
epoch time: 39.35954570770264
epoch: 58, train loss: 0.5053188776483342
epoch: 58, eval loss: 0.6899506479501725, correct: 7667, total: 10000, acc = 0.7666999697685242
epoch time: 39.44884634017944
epoch: 59, train loss: 0.4997740634241883
epoch: 59, eval loss: 0.687486720085144, correct: 7678, total: 10000, acc = 0.767799973487854
epoch time: 39.391881465911865
finish training
TACC: Starting up job 3497142
TACC: Starting parallel tasks...
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 2 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 3 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 1 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
epoch: 0, train loss: 1.9320369898056498
epoch: 1, train loss: 1.6352128605453335
epoch: 1, eval loss: 1.5123237550258637, correct: 4542, total: 10000, acc = 0.45419999957084656
epoch: 2, train loss: 1.4457968728882926
epoch: 3, train loss: 1.3382204977833494
epoch: 3, eval loss: 1.2539702713489533, correct: 5451, total: 10000, acc = 0.5450999736785889
epoch: 4, train loss: 1.2739947474732691
epoch: 5, train loss: 1.2285400483073021
epoch: 5, eval loss: 1.1386113047599793, correct: 5908, total: 10000, acc = 0.5907999873161316
epoch: 6, train loss: 1.1903334296479517
epoch: 7, train loss: 1.1711674235305007
epoch: 7, eval loss: 1.1258068561553956, correct: 5967, total: 10000, acc = 0.5967000126838684
epoch: 8, train loss: 1.1419668745021432
epoch: 9, train loss: 1.1143895728247506
epoch: 9, eval loss: 1.040754759311676, correct: 6224, total: 10000, acc = 0.6223999857902527
epoch: 10, train loss: 1.1041023871120141
epoch: 11, train loss: 1.089750115968743
epoch: 11, eval loss: 1.0472844064235687, correct: 6265, total: 10000, acc = 0.6265000104904175
epoch: 12, train loss: 1.064698440687997
epoch: 13, train loss: 1.038266262229608
epoch: 13, eval loss: 1.0117274671792984, correct: 6415, total: 10000, acc = 0.6414999961853027
epoch: 14, train loss: 1.029945282303557
epoch: 15, train loss: 1.0171620620756734
epoch: 15, eval loss: 0.9712629705667496, correct: 6519, total: 10000, acc = 0.6518999934196472
epoch: 16, train loss: 0.9928132119227429
epoch: 17, train loss: 0.9921575498824217
epoch: 17, eval loss: 0.9429782271385193, correct: 6641, total: 10000, acc = 0.6640999913215637
epoch: 18, train loss: 0.9607366293060536
epoch: 19, train loss: 0.9427766927650997
epoch: 19, eval loss: 0.9346068739891052, correct: 6623, total: 10000, acc = 0.6622999906539917
epoch: 20, train loss: 0.9219280481338501
epoch: 21, train loss: 0.8945026689646195
epoch: 21, eval loss: 0.8710516095161438, correct: 6909, total: 10000, acc = 0.6908999681472778
epoch: 22, train loss: 0.8807675826306246
epoch: 23, train loss: 0.851514169756247
epoch: 23, eval loss: 0.8239740908145905, correct: 7052, total: 10000, acc = 0.7051999568939209
epoch: 24, train loss: 0.8388774534877466
epoch: 25, train loss: 0.8265813291072845
epoch: 25, eval loss: 0.8102335959672928, correct: 7137, total: 10000, acc = 0.713699996471405
epoch: 26, train loss: 0.8057564490911912
epoch: 27, train loss: 0.7816558753957554
epoch: 27, eval loss: 0.7648743063211441, correct: 7292, total: 10000, acc = 0.729200005531311
epoch: 28, train loss: 0.766656969883004
epoch: 29, train loss: 0.7515677390049915
epoch: 29, eval loss: 0.7517296761274338, correct: 7360, total: 10000, acc = 0.7360000014305115
epoch: 30, train loss: 0.7300611174836451
epoch: 31, train loss: 0.7038229193006244
epoch: 31, eval loss: 0.7385401755571366, correct: 7375, total: 10000, acc = 0.7374999523162842
epoch: 32, train loss: 0.6928578931458143
epoch: 33, train loss: 0.672958068093475
epoch: 33, eval loss: 0.6915913820266724, correct: 7596, total: 10000, acc = 0.7595999836921692
epoch: 34, train loss: 0.6505378533382805
epoch: 35, train loss: 0.6292881539889744
epoch: 35, eval loss: 0.7068031072616577, correct: 7567, total: 10000, acc = 0.7566999793052673
epoch: 36, train loss: 0.6092992303322773
epoch: 37, train loss: 0.5922880838720166
epoch: 37, eval loss: 0.6735526144504547, correct: 7662, total: 10000, acc = 0.7662000060081482
epoch: 38, train loss: 0.5777627850065425
epoch: 39, train loss: 0.562178050376931
epoch: 39, eval loss: 0.6323211371898652, correct: 7799, total: 10000, acc = 0.7798999547958374
epoch: 40, train loss: 0.5385949274106901
epoch: 41, train loss: 0.5233490755971597
epoch: 41, eval loss: 0.6360922038555146, correct: 7806, total: 10000, acc = 0.7805999517440796
epoch: 42, train loss: 0.50960702373057
epoch: 43, train loss: 0.48859657985823496
epoch: 43, eval loss: 0.607847985625267, correct: 7914, total: 10000, acc = 0.7913999557495117
epoch: 44, train loss: 0.47382923291654006
epoch: 45, train loss: 0.45052725380780745
epoch: 45, eval loss: 0.5986941397190094, correct: 8012, total: 10000, acc = 0.8011999726295471
epoch: 46, train loss: 0.43711013392526277
epoch: 47, train loss: 0.42507915229213483
epoch: 47, eval loss: 0.5871582478284836, correct: 8002, total: 10000, acc = 0.8001999855041504
epoch: 48, train loss: 0.40591827947266246
epoch: 49, train loss: 0.3911267008100237
epoch: 49, eval loss: 0.5832945287227631, correct: 8047, total: 10000, acc = 0.8046999573707581
epoch: 50, train loss: 0.3770884950550235
epoch: 51, train loss: 0.3587312725733738
epoch: 51, eval loss: 0.5942261666059494, correct: 8073, total: 10000, acc = 0.8072999715805054
epoch: 52, train loss: 0.34132662324272856
epoch: 53, train loss: 0.3267737687850485
epoch: 53, eval loss: 0.5920912757515907, correct: 8118, total: 10000, acc = 0.8118000030517578
epoch: 54, train loss: 0.3116904997399875
epoch: 55, train loss: 0.30321489380938665
epoch: 55, eval loss: 0.5957943320274353, correct: 8082, total: 10000, acc = 0.8082000017166138
epoch: 56, train loss: 0.2874147834218278
epoch: 57, train loss: 0.27991348140093747
epoch: 57, eval loss: 0.5895262002944947, correct: 8165, total: 10000, acc = 0.8165000081062317
epoch: 58, train loss: 0.274563160173747
epoch: 59, train loss: 0.2600744918596988
epoch: 59, eval loss: 0.5934095367789268, correct: 8150, total: 10000, acc = 0.8149999976158142
finish training
TACC: Starting up job 3498509
TACC: Starting parallel tasks...
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 2 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 3 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 1 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
epoch: 0, train loss: 2.107759721425115
epoch: 1, train loss: 1.8388929500871776
epoch: 1, eval loss: 1.7622965753078461, correct: 3535, total: 10000, acc = 0.35349997878074646
epoch: 2, train loss: 1.7141443588295762
epoch: 3, train loss: 1.6003259931291853
epoch: 3, eval loss: 1.608506625890732, correct: 4263, total: 10000, acc = 0.4262999892234802
epoch: 4, train loss: 1.5016733225511045
epoch: 5, train loss: 1.4050611877927974
epoch: 5, eval loss: 1.386299443244934, correct: 4984, total: 10000, acc = 0.4983999729156494
epoch: 6, train loss: 1.3264902623332278
epoch: 7, train loss: 1.2681689250225923
epoch: 7, eval loss: 1.3251740992069245, correct: 5295, total: 10000, acc = 0.5295000076293945
epoch: 8, train loss: 1.2236176984650748
epoch: 9, train loss: 1.172800781775494
epoch: 9, eval loss: 1.1429427027702332, correct: 5966, total: 10000, acc = 0.5965999960899353
epoch: 10, train loss: 1.1335287532027887
epoch: 11, train loss: 1.0974334563527788
epoch: 11, eval loss: 1.1024536848068238, correct: 6107, total: 10000, acc = 0.6107000112533569
epoch: 12, train loss: 1.0638826300903244
epoch: 13, train loss: 1.0406859383291127
epoch: 13, eval loss: 1.0324654281139374, correct: 6282, total: 10000, acc = 0.6281999945640564
epoch: 14, train loss: 1.0157714376644211
epoch: 15, train loss: 0.990898135365272
epoch: 15, eval loss: 0.9790050059556961, correct: 6539, total: 10000, acc = 0.6538999676704407
epoch: 16, train loss: 0.963820260398242
epoch: 17, train loss: 0.9404383374720203
epoch: 17, eval loss: 0.9367435872554779, correct: 6691, total: 10000, acc = 0.6690999865531921
epoch: 18, train loss: 0.9299906589546982
epoch: 19, train loss: 0.9038882474510037
epoch: 19, eval loss: 0.9210823565721512, correct: 6709, total: 10000, acc = 0.6708999872207642
epoch: 20, train loss: 0.8825302799137271
epoch: 21, train loss: 0.8686576388320144
epoch: 21, eval loss: 0.8791542768478393, correct: 6913, total: 10000, acc = 0.6912999749183655
epoch: 22, train loss: 0.8509396040926174
epoch: 23, train loss: 0.8375457452268017
epoch: 23, eval loss: 0.8651147484779358, correct: 6948, total: 10000, acc = 0.6947999596595764
epoch: 24, train loss: 0.8163802222329744
epoch: 25, train loss: 0.8068491317787949
epoch: 25, eval loss: 0.8353333532810211, correct: 7089, total: 10000, acc = 0.708899974822998
epoch: 26, train loss: 0.7894753631280393
epoch: 27, train loss: 0.7779296344640304
epoch: 27, eval loss: 0.8161472469568253, correct: 7143, total: 10000, acc = 0.7142999768257141
epoch: 28, train loss: 0.763744876092794
epoch: 29, train loss: 0.7521962505214068
epoch: 29, eval loss: 0.7903082758188248, correct: 7219, total: 10000, acc = 0.7218999862670898
epoch: 30, train loss: 0.7443178624522929
epoch: 31, train loss: 0.7280340212948468
epoch: 31, eval loss: 0.7877005040645599, correct: 7233, total: 10000, acc = 0.7232999801635742
epoch: 32, train loss: 0.7196985489251663
epoch: 33, train loss: 0.7108793039711154
epoch: 33, eval loss: 0.7838329076766968, correct: 7292, total: 10000, acc = 0.729200005531311
epoch: 34, train loss: 0.6965019471791326
epoch: 35, train loss: 0.6875918537986522
epoch: 35, eval loss: 0.7513678789138794, correct: 7392, total: 10000, acc = 0.7391999959945679
epoch: 36, train loss: 0.6793362346230721
epoch: 37, train loss: 0.6741023343436572
epoch: 37, eval loss: 0.7752945452928544, correct: 7316, total: 10000, acc = 0.7315999865531921
epoch: 38, train loss: 0.6629589072295597
epoch: 39, train loss: 0.6507086388918818
epoch: 39, eval loss: 0.7758691757917404, correct: 7322, total: 10000, acc = 0.7321999669075012
epoch: 40, train loss: 0.6381483582817778
epoch: 41, train loss: 0.6374095179596726
epoch: 41, eval loss: 0.7589699536561966, correct: 7386, total: 10000, acc = 0.738599956035614
epoch: 42, train loss: 0.6251792050137812
epoch: 43, train loss: 0.6148473596086308
epoch: 43, eval loss: 0.7495014071464539, correct: 7478, total: 10000, acc = 0.7477999925613403
epoch: 44, train loss: 0.6119371378908351
epoch: 45, train loss: 0.6012086509441843
epoch: 45, eval loss: 0.725347763299942, correct: 7515, total: 10000, acc = 0.7515000104904175
epoch: 46, train loss: 0.597867566103838
epoch: 47, train loss: 0.5913592832429069
epoch: 47, eval loss: 0.7254288077354432, correct: 7529, total: 10000, acc = 0.7529000043869019
epoch: 48, train loss: 0.5801522807807339
epoch: 49, train loss: 0.575563525666996
epoch: 49, eval loss: 0.7291093468666077, correct: 7533, total: 10000, acc = 0.7532999515533447
epoch: 50, train loss: 0.573031121674849
epoch: 51, train loss: 0.5667383588698446
epoch: 51, eval loss: 0.7240727603435516, correct: 7570, total: 10000, acc = 0.7569999694824219
epoch: 52, train loss: 0.5578772419569443
epoch: 53, train loss: 0.5526659309255834
epoch: 53, eval loss: 0.7226850330829621, correct: 7576, total: 10000, acc = 0.7576000094413757
epoch: 54, train loss: 0.5473246245968099
epoch: 55, train loss: 0.5443006860358375
epoch: 55, eval loss: 0.720612645149231, correct: 7596, total: 10000, acc = 0.7595999836921692
epoch: 56, train loss: 0.5361242987671677
epoch: 57, train loss: 0.5323515981435776
epoch: 57, eval loss: 0.7203025311231613, correct: 7580, total: 10000, acc = 0.7579999566078186
epoch: 58, train loss: 0.5297852404871766
epoch: 59, train loss: 0.5288004583241989
epoch: 59, eval loss: 0.7189624041318894, correct: 7605, total: 10000, acc = 0.7604999542236328
finish training
TACC: Starting up job 3496458
TACC: Starting parallel tasks...
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 3 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 2 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 7 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 6 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
optimizer is created
start training
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 4 is bound to device 0
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 5 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
warning: variables which start with __, or are a module or class declaration, are omitted
process rank 1 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
epoch: 0, train loss: 1.936693473738067
epoch: 1, train loss: 1.627108974116189
epoch: 1, eval loss: 1.5279120564460755, correct: 4576, total: 10000, acc = 0.4575999975204468
epoch: 2, train loss: 1.438910031805233
epoch: 3, train loss: 1.3184991053172521
epoch: 3, eval loss: 1.3557079970836639, correct: 5129, total: 10000, acc = 0.5128999948501587
epoch: 4, train loss: 1.271946340191121
epoch: 5, train loss: 1.2340542175331894
epoch: 5, eval loss: 1.207822185754776, correct: 5703, total: 10000, acc = 0.5702999830245972
epoch: 6, train loss: 1.187913371592152
epoch: 7, train loss: 1.154962458172623
epoch: 7, eval loss: 1.0685692846775054, correct: 6100, total: 10000, acc = 0.6100000143051147
epoch: 8, train loss: 1.1158924905621275
epoch: 9, train loss: 1.0909727805731249
epoch: 9, eval loss: 1.0345157146453858, correct: 6328, total: 10000, acc = 0.6327999830245972
epoch: 10, train loss: 1.0725988399009316
epoch: 11, train loss: 1.0453423085261364
epoch: 11, eval loss: 0.9778846323490142, correct: 6543, total: 10000, acc = 0.6542999744415283
epoch: 12, train loss: 1.0397504823548454
epoch: 13, train loss: 1.011059400986652
epoch: 13, eval loss: 0.9668682873249054, correct: 6446, total: 10000, acc = 0.644599974155426
epoch: 14, train loss: 0.9938353963044225
epoch: 15, train loss: 0.9691349967401854
epoch: 15, eval loss: 0.9465512812137604, correct: 6657, total: 10000, acc = 0.6656999588012695
epoch: 16, train loss: 0.9470896617490419
epoch: 17, train loss: 0.927201622602891
epoch: 17, eval loss: 0.8875106543302536, correct: 6837, total: 10000, acc = 0.6836999654769897
epoch: 18, train loss: 0.8975223132542202
epoch: 19, train loss: 0.8810242603019792
epoch: 19, eval loss: 0.8688296616077423, correct: 6832, total: 10000, acc = 0.6832000017166138
epoch: 20, train loss: 0.8482622784011218
epoch: 21, train loss: 0.8266285700457436
epoch: 21, eval loss: 0.7801274597644806, correct: 7205, total: 10000, acc = 0.7204999923706055
epoch: 22, train loss: 0.8038581859092323
epoch: 23, train loss: 0.7879118153027126
epoch: 23, eval loss: 0.7779350578784943, correct: 7203, total: 10000, acc = 0.7202999591827393
epoch: 24, train loss: 0.7542270896386127
epoch: 25, train loss: 0.7369782894241567
epoch: 25, eval loss: 0.7534965008497239, correct: 7362, total: 10000, acc = 0.7361999750137329
epoch: 26, train loss: 0.7095995545387268
epoch: 27, train loss: 0.6873777825005201
epoch: 27, eval loss: 0.7344318777322769, correct: 7381, total: 10000, acc = 0.738099992275238
epoch: 28, train loss: 0.6713967414534822
epoch: 29, train loss: 0.650338428969286
epoch: 29, eval loss: 0.677948921918869, correct: 7653, total: 10000, acc = 0.7652999758720398
epoch: 30, train loss: 0.6301205882004329
epoch: 31, train loss: 0.5990057824825754
epoch: 31, eval loss: 0.6719370454549789, correct: 7643, total: 10000, acc = 0.7642999887466431
epoch: 32, train loss: 0.590088236696866
epoch: 33, train loss: 0.5689327443132595
epoch: 33, eval loss: 0.6191721886396409, correct: 7807, total: 10000, acc = 0.7806999683380127
epoch: 34, train loss: 0.5426055670392756
epoch: 35, train loss: 0.5270413601276825
epoch: 35, eval loss: 0.6150132775306701, correct: 7879, total: 10000, acc = 0.7878999710083008
epoch: 36, train loss: 0.5215025428606539
epoch: 37, train loss: 0.4952395400222467
epoch: 37, eval loss: 0.628344652056694, correct: 7868, total: 10000, acc = 0.786799967288971
epoch: 38, train loss: 0.47989121687655545
epoch: 39, train loss: 0.46510300618045186
epoch: 39, eval loss: 0.5977057978510857, correct: 7944, total: 10000, acc = 0.7943999767303467
epoch: 40, train loss: 0.4441945254802704
epoch: 41, train loss: 0.4285763985648447
epoch: 41, eval loss: 0.5695438250899315, correct: 8023, total: 10000, acc = 0.802299976348877
epoch: 42, train loss: 0.41337763776584546
epoch: 43, train loss: 0.3940146170100387
epoch: 43, eval loss: 0.5688270673155784, correct: 8091, total: 10000, acc = 0.8090999722480774
epoch: 44, train loss: 0.37741332303504554
epoch: 45, train loss: 0.36565779605690313
epoch: 45, eval loss: 0.5831407308578491, correct: 8104, total: 10000, acc = 0.8104000091552734
epoch: 46, train loss: 0.3468657017362361
epoch: 47, train loss: 0.32949359198005834
epoch: 47, eval loss: 0.5751512110233307, correct: 8097, total: 10000, acc = 0.8096999526023865
epoch: 48, train loss: 0.3140165246262842
epoch: 49, train loss: 0.29480520498995877
epoch: 49, eval loss: 0.5712087765336037, correct: 8184, total: 10000, acc = 0.818399965763092
epoch: 50, train loss: 0.2766021394303867
epoch: 51, train loss: 0.26527753776433516
epoch: 51, eval loss: 0.5643855139613152, correct: 8218, total: 10000, acc = 0.8217999935150146
epoch: 52, train loss: 0.2525861115784061
epoch: 53, train loss: 0.23714738658496312
epoch: 53, eval loss: 0.5732526823878288, correct: 8249, total: 10000, acc = 0.8248999714851379
epoch: 54, train loss: 0.2238179413335664
epoch: 55, train loss: 0.2119908875652722
epoch: 55, eval loss: 0.5957901775836945, correct: 8261, total: 10000, acc = 0.8260999917984009
epoch: 56, train loss: 0.19989302222217833
epoch: 57, train loss: 0.1875186789096618
epoch: 57, eval loss: 0.5905491337180138, correct: 8290, total: 10000, acc = 0.8289999961853027
epoch: 58, train loss: 0.18436841180129926
epoch: 59, train loss: 0.17459663231762088
epoch: 59, eval loss: 0.589044263958931, correct: 8313, total: 10000, acc = 0.8312999606132507
finish training
TACC: Starting up job 3498327
TACC: Starting parallel tasks...
warning: variables which starts with __, is a module or class declaration are omitted
process rank 0 is bound to device 0
distributed environment is initialized
model is created
Files already downloaded and verified
Files already downloaded and verified
training and testing dataloaders are created
loss is created
optimizer is created
start training
warning: variables which starts with __, is a module or class declaration are omitted
process rank 2 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 3 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 4 is bound to device 0
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 5 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 7 is bound to device 3
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 6 is bound to device 2
Files already downloaded and verified
Files already downloaded and verified
warning: variables which starts with __, is a module or class declaration are omitted
process rank 1 is bound to device 1
Files already downloaded and verified
Files already downloaded and verified
epoch: 0, train loss: 2.1005014667705613
epoch: 1, train loss: 1.8539113086097094
epoch: 1, eval loss: 1.7973519027233125, correct: 3362, total: 10000, acc = 0.3361999988555908
epoch: 2, train loss: 1.7149482040989155
epoch: 3, train loss: 1.5927067617980801
epoch: 3, eval loss: 1.5848429083824158, correct: 4344, total: 10000, acc = 0.4343999922275543
epoch: 4, train loss: 1.4912729798531046
epoch: 5, train loss: 1.3957378158763962
epoch: 5, eval loss: 1.4951884388923644, correct: 4841, total: 10000, acc = 0.48409998416900635
epoch: 6, train loss: 1.3090402642074896
epoch: 7, train loss: 1.2566283296565621
epoch: 7, eval loss: 1.2464738070964814, correct: 5562, total: 10000, acc = 0.5561999678611755
epoch: 8, train loss: 1.2084139476017075
epoch: 9, train loss: 1.1706127719003327
epoch: 9, eval loss: 1.162048089504242, correct: 5876, total: 10000, acc = 0.5875999927520752
epoch: 10, train loss: 1.120817175933293
epoch: 11, train loss: 1.084984731309268
epoch: 11, eval loss: 1.0764922022819519, correct: 6155, total: 10000, acc = 0.6154999732971191
epoch: 12, train loss: 1.0559214432628787
epoch: 13, train loss: 1.0261321286765896
epoch: 13, eval loss: 1.0338306188583375, correct: 6334, total: 10000, acc = 0.6333999633789062
epoch: 14, train loss: 0.992842432187528
epoch: 15, train loss: 0.9660871296512837
epoch: 15, eval loss: 1.0059030145406722, correct: 6458, total: 10000, acc = 0.645799994468689
epoch: 16, train loss: 0.9467733100968965
epoch: 17, train loss: 0.9243187673237859
epoch: 17, eval loss: 0.9469569176435471, correct: 6610, total: 10000, acc = 0.6609999537467957
epoch: 18, train loss: 0.9059403721167116
epoch: 19, train loss: 0.8819177935318071
epoch: 19, eval loss: 0.9196836709976196, correct: 6727, total: 10000, acc = 0.6726999878883362
epoch: 20, train loss: 0.8721987532109631
epoch: 21, train loss: 0.8469706013494608
epoch: 21, eval loss: 0.8634845405817032, correct: 6976, total: 10000, acc = 0.6976000070571899
epoch: 22, train loss: 0.8352831839298716
epoch: 23, train loss: 0.8124590455269327
epoch: 23, eval loss: 0.8418784946203232, correct: 7034, total: 10000, acc = 0.7033999562263489
epoch: 24, train loss: 0.7961219853284408
epoch: 25, train loss: 0.7883704268202489
epoch: 25, eval loss: 0.8191130340099335, correct: 7116, total: 10000, acc = 0.7116000056266785
epoch: 26, train loss: 0.7733409623710477
epoch: 27, train loss: 0.7561956893424598
epoch: 27, eval loss: 0.8028618812561035, correct: 7200, total: 10000, acc = 0.7199999690055847
epoch: 28, train loss: 0.7479740460308231
epoch: 29, train loss: 0.7343520899208225
epoch: 29, eval loss: 0.7829996794462204, correct: 7256, total: 10000, acc = 0.725600004196167
epoch: 30, train loss: 0.7244430549290716
epoch: 31, train loss: 0.7121965617549663
epoch: 31, eval loss: 0.765428164601326, correct: 7299, total: 10000, acc = 0.7299000024795532
epoch: 32, train loss: 0.6988190838268825
epoch: 33, train loss: 0.6908610359746583
epoch: 33, eval loss: 0.7602580636739731, correct: 7395, total: 10000, acc = 0.7394999861717224
epoch: 34, train loss: 0.6785666395206841
epoch: 35, train loss: 0.6664504153387887
epoch: 35, eval loss: 0.7671193510293961, correct: 7345, total: 10000, acc = 0.734499990940094
epoch: 36, train loss: 0.6639333245705585
epoch: 37, train loss: 0.6509425913800999
epoch: 37, eval loss: 0.7612941324710846, correct: 7382, total: 10000, acc = 0.7382000088691711
epoch: 38, train loss: 0.6416311720196082
epoch: 39, train loss: 0.6312643265237614
epoch: 39, eval loss: 0.7380059510469437, correct: 7496, total: 10000, acc = 0.7495999932289124
epoch: 40, train loss: 0.620578939209179
epoch: 41, train loss: 0.6195461816933691
epoch: 41, eval loss: 0.7172901630401611, correct: 7550, total: 10000, acc = 0.7549999952316284
epoch: 42, train loss: 0.6013389248020795
epoch: 43, train loss: 0.6049416010477104
epoch: 43, eval loss: 0.7145429253578186, correct: 7569, total: 10000, acc = 0.7568999528884888
epoch: 44, train loss: 0.5950779300563189
epoch: 45, train loss: 0.5786038743598121
epoch: 45, eval loss: 0.7171747118234635, correct: 7569, total: 10000, acc = 0.7568999528884888
epoch: 46, train loss: 0.5752052083915594
epoch: 47, train loss: 0.5669339743195748
epoch: 47, eval loss: 0.7040806382894516, correct: 7601, total: 10000, acc = 0.7601000070571899
epoch: 48, train loss: 0.5596802952338238
epoch: 49, train loss: 0.5521421706189915
epoch: 49, eval loss: 0.7221358746290207, correct: 7592, total: 10000, acc = 0.7591999769210815
epoch: 50, train loss: 0.5504364164508119
epoch: 51, train loss: 0.5363630725412952
epoch: 51, eval loss: 0.710089972615242, correct: 7650, total: 10000, acc = 0.7649999856948853
epoch: 52, train loss: 0.5382009008709265
epoch: 53, train loss: 0.5292040118757559
epoch: 53, eval loss: 0.7044323921203614, correct: 7672, total: 10000, acc = 0.7671999931335449
epoch: 54, train loss: 0.5289747638970005
epoch: 55, train loss: 0.5239191630056926
epoch: 55, eval loss: 0.6983724802732467, correct: 7694, total: 10000, acc = 0.7694000005722046
epoch: 56, train loss: 0.5177402243930467
epoch: 57, train loss: 0.5132759012738053
epoch: 57, eval loss: 0.7066506981849671, correct: 7671, total: 10000, acc = 0.7670999765396118
epoch: 58, train loss: 0.5119742675095188
epoch: 59, train loss: 0.5074386891661858
epoch: 59, eval loss: 0.7012903690338135, correct: 7693, total: 10000, acc = 0.7692999839782715
finish training
from pathlib import Path
import pytest
import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
# Path to the 2.5D ViT training config, resolved relative to this test file.
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py')
def eval(engine):
    """Run one full evaluation pass over the engine's schedule.

    NOTE(review): this shadows the builtin ``eval``; the name is kept so the
    existing caller below keeps working, but a rename (e.g. ``evaluate``)
    would be preferable.

    Returns:
        (correct_sum, total_sum, avg_loss): number of correctly classified
        samples, number of evaluated samples, and the mean evaluation loss.
    """
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0
    # Gradients are not needed during evaluation; wrapping the stepping loop
    # in no_grad avoids building autograd graphs (matches the torch.no_grad()
    # usage of the 3D eval loop elsewhere in this repo).
    with torch.no_grad():
        for i in range(engine.schedule.num_steps):
            output, label, loss = engine.step()
            accumulated_loss += loss.detach().cpu().numpy()

            # The output is partitioned across the 2.5D mesh; gather it back
            # along the row, column and depth dimensions before argmax.
            output = _gather(
                output[0],
                ParallelMode.PARALLEL_2P5D_ROW,
                1
            )
            output = _gather(
                output,
                ParallelMode.PARALLEL_2P5D_COL,
                0,
            )
            output = _gather(
                output,
                ParallelMode.PARALLEL_2P5D_DEP,
                0,
            )
            output = torch.argmax(output, dim=-1)
            correct = torch.sum(label[0] == output)
            correct_sum += correct
            total_sum += label[0].size(0)

    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss
def train(engine):
    """Run a single training epoch and return the mean step loss."""
    engine.train()
    total_loss = 0
    num_steps = engine.schedule.num_steps
    for _ in range(num_steps):
        _, _, step_loss = engine.step()
        total_loss += step_loss.detach().cpu().numpy()
    return total_loss / num_steps
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2p5d_parallel_vision_transformer():
    """Train a 2.5D tensor-parallel ViT and log train/eval metrics."""
    # init dist
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
        CONFIG_PATH)
    logger = get_global_dist_logger()

    engine = Engine(model=model,
                    train_dataloader=train_dataloader,
                    test_dataloader=test_dataloader,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    schedule=schedule)

    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)
        logger.info(f'epoch {epoch} - train loss: {train_loss}')

        # Evaluate only on even epochs to keep the run time down.
        if epoch % 2 != 0:
            continue
        correct_sum, total_sum, eval_loss = eval(engine)
        logger.info(
            f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
            f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2p5d_parallel_vision_transformer()
\ No newline at end of file
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from pathlib import Path
import torch
from tqdm import tqdm
from colossalai import initialize
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.trainer import Trainer
from colossalai.trainer.metric import Accuracy3D
from colossalai.utils import print_rank_0
# Path to the 3D ViT training config, resolved relative to this script.
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_3d.py')
def _train_epoch(epoch, engine):
    """Run one training epoch, reporting loss and throughput as it goes."""
    logger = get_global_dist_logger()
    print_rank_0('[Epoch %d] training start' % (epoch), logger)
    engine.train()

    running_loss = 0
    step_count = 0
    sample_count = 0
    tick = time.time()
    epoch_start = tick

    steps = range(engine.schedule.num_steps)
    # Only rank 0 renders a progress bar to avoid duplicated output.
    if gpc.get_global_rank() == 0:
        steps = tqdm(steps, desc='[Epoch %d]' % epoch, miniters=1)

    for step in steps:
        cur_lr = engine.get_lr()
        _, targets, loss = engine.step()
        batch_size = targets[0].size(0)

        running_loss += loss.item()
        sample_count += batch_size
        step_count += 1

        batch_time = time.time() - tick
        tick = time.time()
        if gpc.get_global_rank() == 0:
            postfix = dict(lr='%g' % cur_lr,
                           loss='%.3f' % (running_loss / (step + 1)),
                           throughput='%.3f (images/sec)' %
                           (batch_size / (batch_time + 1e-12)))
            steps.set_postfix(**postfix)

    epoch_end = time.time()
    epoch_loss = running_loss / step_count
    epoch_throughput = sample_count / (epoch_end - epoch_start + 1e-12)
    print_rank_0(
        '[Epoch %d] Loss: %.3f | Throughput: %.3f (samples/sec)' %
        (epoch, epoch_loss, epoch_throughput), logger)
def _eval(epoch, engine):
    """Evaluate for one epoch and report loss plus 3D-parallel accuracy."""
    logger = get_global_dist_logger()
    engine.eval()

    loss_total = 0
    acc = Accuracy3D(True, ParallelMode.PARALLEL_3D_OUTPUT,
                     ParallelMode.PARALLEL_3D_WEIGHT)
    seen = 0
    with torch.no_grad():
        for _ in range(engine.schedule.num_steps):
            outputs, targets, loss = engine.step()
            # The schedule may hand back lists/tuples; unwrap to tensors.
            if isinstance(outputs, (list, tuple)):
                outputs = outputs[0]
            if isinstance(targets, (list, tuple)):
                targets = targets[0]
            loss_total += loss.item()
            acc.update(outputs, targets)
            seen += targets.size(0)

    print_rank_0(
        '[Epoch %d] Evaluation loss: %.3f | Acc: %.3f%%' %
        (epoch, loss_total / engine.schedule.num_steps,
         acc.get_accumulated_value() * 100), logger)
def train():
    """Build engine and trainer from CONFIG_PATH, then run the fit loop."""
    model, train_dataloader, test_dataloader, criterion, \
        optimizer, schedule, lr_scheduler = initialize(CONFIG_PATH)
    logger = get_global_dist_logger()

    engine_cfg = dict(model=model,
                      train_dataloader=train_dataloader,
                      test_dataloader=test_dataloader,
                      criterion=criterion,
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      schedule=schedule)
    engine = Engine(**engine_cfg)
    logger.info("Engine is built", ranks=[0])

    trainer = Trainer(engine=engine, hooks_cfg=gpc.config.hooks, verbose=True)
    logger.info("Trainer is built", ranks=[0])

    logger.info("Train start", ranks=[0])
    fit_cfg = dict(train_dataloader=train_dataloader,
                   test_dataloader=test_dataloader,
                   max_epochs=gpc.config.num_epochs,
                   display_progress=True,
                   test_interval=1)
    trainer.fit(**fit_cfg)


if __name__ == '__main__':
    train()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pathlib import Path
import pytest
import torch
from colossalai.builder import build_model
from colossalai.context import Config
# Path to the vanilla (non-parallel) ViT config next to this test.
CONFIG_PATH = Path(__file__).parent.joinpath('configs/vanilla_vit.py')
@pytest.mark.cpu
def test_with_vanilla_vit_config():
    """Build the vanilla ViT from its config and run one forward/backward pass."""
    config = Config.from_file(CONFIG_PATH)
    model = build_model(config.model)
    model.build_from_cfg()

    # Single random RGB image at the configured resolution.
    dummy_img = torch.randn(1, 3, config.IMG_SIZE, config.IMG_SIZE)
    model(dummy_img).mean().backward()


if __name__ == '__main__':
    test_with_vanilla_vit_config()
# Colossal-AI config: VanillaResNet-50 (bottleneck blocks, [3, 4, 6, 3]) on
# CIFAR-10, data parallelism only, SGD with cosine annealing for 200 epochs.
import os
from pathlib import Path

BATCH_SIZE = 128
IMG_SIZE = 32

# resnet 50
model = dict(
    type='VanillaResNet',
    block_type='ResNetBottleneck',
    layers=[3, 4, 6, 3],
    num_cls=10  # CIFAR-10 has 10 classes
)

# Training split: standard CIFAR-10 augmentation (random crop + flip) and
# per-channel normalization. The dataset root comes from the DATA env var.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

# Test split: no augmentation, same normalization statistics.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='SGD',
    lr=0.2,
    momentum=0.9,
    weight_decay=5e-4
)

loss = dict(
    type='CrossEntropyLoss',
)

# Data parallelism only: pipeline and tensor parallel sizes are both 1.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)

# Trainer hooks: per-epoch metric logging, accuracy/loss tracking,
# TensorBoard output, and a checkpoint every 5 epochs.
hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='AccuracyHook'),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=1
# )

lr_scheduler = dict(
    type='CosineAnnealingLR',
    T_max=200  # matches num_epochs below
)

num_epochs = 200
# Colossal-AI config: 2D tensor-parallel ViT on CIFAR-10 with parallel-mode
# fp16 (AMP) and 4-way '2d' (SUMMA) tensor parallelism.
import os
from pathlib import Path

from colossalai.engine import AMP_TYPE

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512  # transformer hidden size
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2  # side length of the 2D (SUMMA) process mesh
NUM_CLASSES = 10
DEPTH = 6  # number of ViT blocks

# Training split: standard CIFAR-10 augmentation; root from the DATA env var.
train_data = dict(
    dataset=dict(type='CIFAR10Dataset',
                 root=Path(os.environ['DATA']),
                 transform_pipeline=[
                     dict(type='Resize', size=IMG_SIZE),
                     dict(type='RandomCrop', size=IMG_SIZE, padding=4),
                     dict(type='RandomHorizontalFlip'),
                     dict(type='ToTensor'),
                     dict(type='Normalize',
                          mean=[0.4914, 0.4822, 0.4465],
                          std=[0.2023, 0.1994, 0.2010]),
                 ]),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        # num_workers=1,
        shuffle=True,
    ))

# Test split: no augmentation; evaluation uses its own (smaller) batch size.
test_data = dict(
    dataset=dict(type='CIFAR10Dataset',
                 root=Path(os.environ['DATA']),
                 train=False,
                 transform_pipeline=[
                     dict(type='Resize', size=IMG_SIZE),
                     dict(type='ToTensor'),
                     dict(type='Normalize',
                          mean=[0.4914, 0.4822, 0.4465],
                          std=[0.2023, 0.1994, 0.2010]),
                 ]),
    dataloader=dict(
        batch_size=400,
        pin_memory=True,
        # num_workers=1,
    ))

optimizer = dict(type='Adam', lr=0.001, weight_decay=0)

# 2D-parallel cross entropy to match the 2D-partitioned model output.
loss = dict(type='CrossEntropyLoss2D', )

# model = dict(
#     type='VanillaResNet',
#     block_type='ResNetBasicBlock',
#     layers=[2, 2, 2, 2],
#     num_cls=10
# )

# ViT assembled from per-component configs; every layer uses its 2D
# tensor-parallel variant (*2D types).
model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(type='ViTTokenFuser2D',
                          img_size=IMG_SIZE,
                          patch_size=PATCH_SIZE,
                          embed_dim=DIM,
                          drop_rate=0.1),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(type='VanillaViTDropPath', ),
        mlp_cfg=dict(type='ViTMLP2D',
                     in_features=DIM,
                     dropout_prob=0.1,
                     mlp_ratio=1),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='LogTimingByEpochHook'),
    dict(type='Accuracy2DHook'),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

# 4-way tensor parallelism in '2d' (SUMMA) mode; no pipeline stages.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2 ** 8)

lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5)

schedule = dict(num_microbatches=1)

num_epochs = 60
num_microbatches = 1

logging = dict(root_path='./logs')
#!/usr/bin/env sh
# Launch one distributed worker under SLURM.
# Usage: test.sh <test_file.py> <config_file.py>
# Relies on SLURM_PROCID / SLURM_NPROCS and HOST being set by the scheduler.
test_file=$1
config_file=$2
# Quote all expansions so paths with spaces don't get word-split or globbed.
python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500 --config "$config_file"
import colossalai
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.trainer import Trainer
def test_trainer():
    """Smoke-test the Trainer fit loop using the default initialization."""
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
    logger = get_global_dist_logger()

    engine = Engine(model=model,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    schedule=schedule)
    logger.info("engine is built", ranks=[0])

    trainer = Trainer(engine=engine, hooks_cfg=gpc.config.hooks, verbose=True)
    logger.info("trainer is built", ranks=[0])

    logger.info("start training", ranks=[0])
    trainer.fit(train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                max_epochs=gpc.config.num_epochs,
                display_progress=False,
                test_interval=5)


if __name__ == '__main__':
    test_trainer()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment