Commit 8aa22af0 authored by thomwolf's avatar thomwolf
Browse files

fixing model

parent 38f740a1
......@@ -12,8 +12,8 @@
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:09.239405Z",
"start_time": "2018-11-02T14:09:08.126668Z"
"end_time": "2018-11-03T02:09:37.498678Z",
"start_time": "2018-11-03T02:09:36.366672Z"
}
},
"outputs": [],
......@@ -26,8 +26,8 @@
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:09.370511Z",
"start_time": "2018-11-02T14:09:09.242527Z"
"end_time": "2018-11-03T02:09:37.621865Z",
"start_time": "2018-11-03T02:09:37.500988Z"
}
},
"outputs": [
......@@ -52,7 +52,7 @@
"max_seq_length=128\n",
"input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
"\n",
"layer_indexes = [-1]\n",
"layer_indexes = list(range(12))\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"tokenizer = tokenization.FullTokenizer(\n",
" vocab_file=vocab_file, do_lower_case=True)\n",
......@@ -70,8 +70,8 @@
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:12.514617Z",
"start_time": "2018-11-02T14:09:09.372137Z"
"end_time": "2018-11-03T02:09:40.831618Z",
"start_time": "2018-11-03T02:09:37.624063Z"
}
},
"outputs": [
......@@ -79,15 +79,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b266ae8>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e2c1160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
"INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
......@@ -126,8 +126,8 @@
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:17.745970Z",
"start_time": "2018-11-02T14:09:12.516953Z"
"end_time": "2018-11-03T02:09:46.413197Z",
"start_time": "2018-11-03T02:09:40.834621Z"
}
},
"outputs": [
......@@ -135,42 +135,53 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh, running initialization to predict.\n",
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
"INFO:tensorflow:Calling model_fn.\n",
"INFO:tensorflow:Running infer on CPU\n",
"INFO:tensorflow:Done calling model_fn.\n",
"INFO:tensorflow:Graph was finalized.\n",
"INFO:tensorflow:Running local_init_op.\n",
"INFO:tensorflow:Done running local_init_op.\n",
"extracting layer 0\n",
"extracting layer 1\n",
"extracting layer 2\n",
"extracting layer 3\n",
"extracting layer 4\n",
"extracting layer 5\n",
"extracting layer 6\n",
"extracting layer 7\n",
"extracting layer 8\n",
"extracting layer 9\n",
"extracting layer 10\n",
"extracting layer 11\n",
"INFO:tensorflow:prediction_loop marked as finished\n",
"INFO:tensorflow:prediction_loop marked as finished\n"
]
}
],
"source": [
"all_out = []\n",
"tensorflow_all_out = []\n",
"for result in estimator.predict(input_fn, yield_single_examples=True):\n",
" unique_id = int(result[\"unique_id\"])\n",
" feature = unique_id_to_feature[unique_id]\n",
" output_json = collections.OrderedDict()\n",
" output_json[\"linex_index\"] = unique_id\n",
" all_out_features = []\n",
" for (i, token) in enumerate(feature.tokens):\n",
" all_layers = []\n",
" for (j, layer_index) in enumerate(layer_indexes):\n",
" layer_output = result[\"layer_output_%d\" % j]\n",
" layers = collections.OrderedDict()\n",
" layers[\"index\"] = layer_index\n",
" layers[\"values\"] = [\n",
" round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
" ]\n",
" all_layers.append(layers)\n",
" out_features = collections.OrderedDict()\n",
" out_features[\"token\"] = token\n",
" out_features[\"layers\"] = all_layers\n",
" all_out_features.append(out_features)\n",
" output_json[\"features\"] = all_out_features\n",
" all_out.append(output_json)"
" tensorflow_all_out_features = []\n",
" # for (i, token) in enumerate(feature.tokens):\n",
" all_layers = []\n",
" for (j, layer_index) in enumerate(layer_indexes):\n",
" print(\"extracting layer {}\".format(j))\n",
" layer_output = result[\"layer_output_%d\" % j]\n",
" layers = collections.OrderedDict()\n",
" layers[\"index\"] = layer_index\n",
" layers[\"values\"] = layer_output\n",
" all_layers.append(layers)\n",
" tensorflow_out_features = collections.OrderedDict()\n",
" tensorflow_out_features[\"layers\"] = all_layers\n",
" tensorflow_all_out_features.append(tensorflow_out_features)\n",
"\n",
" output_json[\"features\"] = tensorflow_all_out_features\n",
" tensorflow_all_out.append(output_json)"
]
},
{
......@@ -178,8 +189,8 @@
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:17.780532Z",
"start_time": "2018-11-02T14:09:17.748778Z"
"end_time": "2018-11-03T02:09:46.460128Z",
"start_time": "2018-11-03T02:09:46.416138Z"
}
},
"outputs": [
......@@ -190,15 +201,28 @@
"1\n",
"2\n",
"odict_keys(['linex_index', 'features'])\n",
"14\n"
"number of tokens 1\n",
"number of layers 12\n"
]
},
{
"data": {
"text/plain": [
"(128, 768)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len(all_out))\n",
"print(len(all_out[0]))\n",
"print(all_out[0].keys())\n",
"print(len(all_out[0]['features']))"
"print(len(tensorflow_all_out))\n",
"print(len(tensorflow_all_out[0]))\n",
"print(tensorflow_all_out[0].keys())\n",
"print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
"print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
"tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
]
},
{
......@@ -206,34 +230,13 @@
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:17.818968Z",
"start_time": "2018-11-02T14:09:17.782121Z"
"end_time": "2018-11-03T02:09:46.498637Z",
"start_time": "2018-11-03T02:09:46.463115Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[-0.628111,\n",
" 0.193215,\n",
" -0.75185,\n",
" -0.040464,\n",
" -0.875331,\n",
" 0.15654,\n",
" 1.385444,\n",
" 1.066997,\n",
" -0.349549,\n",
" 0.270686]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"tensorflow_output = all_out[0]['features'][0]['layers'][0]['values']\n",
"tensorflow_output[:10]"
"tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
]
},
{
......@@ -248,12 +251,13 @@
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:17.954196Z",
"start_time": "2018-11-02T14:09:17.821115Z"
"end_time": "2018-11-03T02:09:46.660303Z",
"start_time": "2018-11-03T02:09:46.501325Z"
}
},
"outputs": [],
"source": [
"import extract_features_pytorch\n",
"from extract_features_pytorch import *"
]
},
......@@ -262,8 +266,8 @@
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:19.196475Z",
"start_time": "2018-11-02T14:09:17.956199Z"
"end_time": "2018-11-03T02:09:48.292135Z",
"start_time": "2018-11-03T02:09:46.661921Z"
}
},
"outputs": [
......@@ -574,7 +578,7 @@
"init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
"\n",
"device = torch.device(\"cpu\")\n",
"model = BertModel(bert_config)\n",
"model = extract_features_pytorch.BertModel(bert_config)\n",
"model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
"model.to(device)"
]
......@@ -584,8 +588,8 @@
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:19.236256Z",
"start_time": "2018-11-02T14:09:19.198407Z"
"end_time": "2018-11-03T02:09:48.332982Z",
"start_time": "2018-11-03T02:09:48.294056Z"
},
"code_folding": []
},
......@@ -896,9 +900,10 @@
"source": [
"all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
"all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
"all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
"all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
"\n",
"eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)\n",
"eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
"eval_sampler = SequentialSampler(eval_data)\n",
"eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
"\n",
......@@ -907,41 +912,86 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:19.671994Z",
"start_time": "2018-11-02T14:09:19.239454Z"
"end_time": "2018-11-03T02:09:54.371188Z",
"start_time": "2018-11-03T02:09:53.976875Z"
}
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 27227, 2001,\n",
" 1037, 13997, 11510, 102, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]])\n",
"tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]])\n",
"tensor([0])\n",
"layer 0 0\n",
"layer 1 1\n",
"layer 2 2\n",
"layer 3 3\n",
"layer 4 4\n",
"layer 5 5\n",
"layer 6 6\n",
"layer 7 7\n",
"layer 8 8\n",
"layer 9 9\n",
"layer 10 10\n",
"layer 11 11\n"
]
}
],
"source": [
"layer_indexes = list(range(12))\n",
"\n",
"pytorch_all_out = []\n",
"for input_ids, input_mask, example_indices in eval_dataloader:\n",
"for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
" print(input_ids)\n",
" print(input_mask)\n",
" print(example_indices)\n",
" input_ids = input_ids.to(device)\n",
" input_mask = input_mask.float().to(device)\n",
"\n",
" all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)\n",
" all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
"\n",
" for enc_layers, example_index in zip(all_encoder_layers, example_indices):\n",
" for b, example_index in enumerate(example_indices):\n",
" feature = features[example_index.item()]\n",
" unique_id = int(feature.unique_id)\n",
" # feature = unique_id_to_feature[unique_id]\n",
" output_json = collections.OrderedDict()\n",
" output_json[\"linex_index\"] = unique_id\n",
" all_out_features = []\n",
" for (i, token) in enumerate(feature.tokens):\n",
" all_layers = []\n",
" for (j, layer_index) in enumerate(layer_indexes):\n",
" layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()\n",
" layers = collections.OrderedDict()\n",
" layers[\"index\"] = layer_index\n",
" layers[\"values\"] = [\n",
" round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
" ]\n",
" all_layers.append(layers)\n",
" # for (i, token) in enumerate(feature.tokens):\n",
" all_layers = []\n",
" for (j, layer_index) in enumerate(layer_indexes):\n",
" print(\"layer\", j, layer_index)\n",
" layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
" layer_output = layer_output[b]\n",
" layers = collections.OrderedDict()\n",
" layers[\"index\"] = layer_index\n",
" layer_output = layer_output\n",
" layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
" all_layers.append(layers)\n",
"\n",
" out_features = collections.OrderedDict()\n",
" out_features[\"token\"] = token\n",
" out_features[\"layers\"] = all_layers\n",
" all_out_features.append(out_features)\n",
" output_json[\"features\"] = all_out_features\n",
......@@ -950,11 +1000,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:09:19.706616Z",
"start_time": "2018-11-02T14:09:19.673670Z"
"end_time": "2018-11-03T02:09:57.139854Z",
"start_time": "2018-11-03T02:09:57.104636Z"
}
},
"outputs": [
......@@ -965,84 +1015,127 @@
"1\n",
"2\n",
"odict_keys(['linex_index', 'features'])\n",
"14\n"
"number of tokens 1\n",
"number of layers 12\n",
"hidden_size 128\n"
]
},
{
"data": {
"text/plain": [
"(128, 768)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len(pytorch_all_out))\n",
"print(len(pytorch_all_out[0]))\n",
"print(pytorch_all_out[0].keys())\n",
"print(len(pytorch_all_out[0]['features']))"
"print(\"number of tokens\", len(pytorch_all_out))\n",
"print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
"print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
"pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:10:28.295669Z",
"start_time": "2018-11-02T14:10:28.263140Z"
"end_time": "2018-11-03T02:09:59.000058Z",
"start_time": "2018-11-03T02:09:58.967575Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[-0.016153,\n",
" -0.697252,\n",
" -0.298296,\n",
" -0.167194,\n",
" -0.219306,\n",
" 0.061712,\n",
" -0.006953,\n",
" 0.366519,\n",
" -0.031027,\n",
" -0.33547]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"(128, 768)\n",
"(128, 768)\n"
]
}
],
"source": [
"pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
"print(pytorch_outputs[0].shape)\n",
"print(pytorch_outputs[1].shape)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:59.462123Z",
"start_time": "2018-11-03T02:09:59.430932Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(128, 768)\n",
"(128, 768)\n"
]
}
],
"source": [
"pytorch_output = pytorch_all_out[0]['features'][0]['layers'][0]['values']\n",
"pytorch_output[:10]"
"print(tensorflow_outputs[0].shape)\n",
"print(tensorflow_outputs[1].shape)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-02T14:10:34.540457Z",
"start_time": "2018-11-02T14:10:34.510109Z"
"end_time": "2018-11-03T02:10:00.014784Z",
"start_time": "2018-11-03T02:09:59.983978Z"
}
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:10:09.582557Z",
"start_time": "2018-11-03T02:10:09.549308Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(128, 768) (128, 768)\n"
]
},
{
"data": {
"text/plain": [
"[-0.628111,\n",
" 0.193215,\n",
" -0.75185,\n",
" -0.040464,\n",
" -0.875331,\n",
" 0.15654,\n",
" 1.385444,\n",
" 1.066997,\n",
" -0.349549,\n",
" 0.270686]"
"4.1671223e-07"
]
},
"execution_count": 14,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tensorflow_output[:10]"
"i = 11\n",
"print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
"np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
]
},
{
......
......@@ -268,29 +268,31 @@ def main():
input_mask = input_mask.float().to(device)
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
all_encoder_layers = all_encoder_layers
for enc_layers, example_index in zip(all_encoder_layers, example_indices):
for b, example_index in enumerate(example_indices):
feature = features[example_index.item()]
unique_id = int(feature.unique_id)
# feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_features = []
all_out_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
layer_output = layer_output[b]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
round(x.item(), 6) for x in layer_output[i]
]
all_layers.append(layers)
features = collections.OrderedDict()
features["token"] = token
features["layers"] = all_layers
all_features.append(features)
output_json["features"] = all_features
out_features = collections.OrderedDict()
out_features["token"] = token
out_features["layers"] = all_layers
all_out_features.append(out_features)
output_json["features"] = all_out_features
writer.write(json.dumps(output_json) + "\n")
......
......@@ -27,8 +27,9 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
def gelu(x):
    """Apply the Gaussian Error Linear Unit activation element-wise.

    Computes ``x * Phi(x)``, where ``Phi`` is the standard normal CDF
    written with the error function: ``0.5 * (1 + erf(x / sqrt(2)))``.

    Args:
        x: a ``torch.Tensor`` (any input accepted by ``torch.erf``).

    Returns:
        A tensor with the same shape as ``x``.
    """
    # The CDF term must be multiplied by the input itself; returning the
    # CDF term alone yields a gate value in (0, 1), not the activation.
    #
    # OpenAI GPT uses the tanh approximation instead:
    #   0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
class BertConfig(object):
......@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
......@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
query_layer = self.query(hidden_states)
key_layer = self.key(hidden_states)
value_layer = self.value(hidden_states)
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(query_layer)
key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
value_layer = self.transpose_for_scores(value_layer)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
# `attention_scores` = [B, N, F, T]
attention_scores = torch.matmul(query_layer, key_layer)
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size)
# TODO clean up this (precompute)
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
......@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
# adder = (1.0 - attention_mask) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_scores += attention_mask
attention_scores = attention_scores_no_mask + attention_mask
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]
attention_probs = nn.Softmax(dim=-1)(attention_scores)
attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
attention_probs = self.dropout(attention_probs_no_drop)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
# aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
# aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
return context_layer
......@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
    """Project the attention output and add the residual connection.

    Args:
        hidden_states: the tensor to be projected by ``self.dense``.
        input_tensor: the tensor added back as a residual just before
            ``self.LayerNorm``.

    Returns:
        ``self.LayerNorm(self.dropout(self.dense(hidden_states)) + input_tensor)``.
    """
    # The dense projection is applied to `hidden_states`, exactly once.
    # `input_tensor` only takes part in the residual sum below; projecting
    # it instead would discard the `hidden_states` argument entirely.
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    return hidden_states
......@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
self.output = BERTSelfOutput(config)
def forward(self, input_tensor, attention_mask):
    """Run the self-attention sub-module followed by its output sub-module.

    Args:
        input_tensor: the input passed to ``self.self`` and reused as the
            second argument of ``self.output``.
        attention_mask: forwarded unchanged to ``self.self``.

    Returns:
        The value returned by ``self.output(self_output, input_tensor)``.
    """
    # Each sub-module runs exactly once. The self-attention result keeps its
    # own name so it is not confused with the final attention output.
    self_output = self.self(input_tensor, attention_mask)
    attention_output = self.output(self_output, input_tensor)
    return attention_output
......@@ -388,13 +394,16 @@ class BertModel(nn.Module):
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = (1.0 - attention_mask) * -10000.0
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
embedding_output = self.embeddings(input_ids, token_type_ids)
all_encoder_layers = self.encoder(embedding_output, attention_mask)
all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
sequence_output = all_encoder_layers[-1]
pooled_output = self.pooler(sequence_output)
# TODO DEbugging
# all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
return all_encoder_layers, pooled_output
class BertForSequenceClassification(nn.Module):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment