This repository was archived by the owner on Dec 16, 2022. It is now read-only.

[NOT FOR MERGING] Transfer learning bug #4209

Open · wants to merge 2 commits into main
Changes from 1 commit
Commit dc7911ca04291a37cc2dd5d991fa084888caba5a ("debugging"), committed by matt-gardner on May 13, 2020
1 change: 1 addition & 0 deletions allennlp/common/from_params.py
@@ -275,6 +275,7 @@ def pop_and_construct_arg(
f"The module from model at {archive_file} at path {module_path} "
f"was expected of type {annotation} but is of type {type(result)}"
)
print("_PRETRAINED FINISHED")
return result

popped_params = params.pop(name, default) if default != _NO_DEFAULT else params.pop(name)
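
The untouched line above relies on a sentinel object to tell "no default was given" apart from "the default is None". A minimal standalone sketch of that idiom, with illustrative names rather than AllenNLP's actual `pop_and_construct_arg` signature:

```python
# Sketch of the sentinel-default idiom used above (names are illustrative).
_NO_DEFAULT = object()  # unique marker meaning "caller supplied no default at all"


def pop_arg(params: dict, name: str, default=_NO_DEFAULT):
    # Only hand dict.pop a fallback when the caller actually provided one;
    # otherwise a missing required argument raises KeyError as it should.
    return params.pop(name, default) if default is not _NO_DEFAULT else params.pop(name)


assert pop_arg({"hidden_dim": 100}, "hidden_dim") == 100   # required key present
assert pop_arg({}, "dropout", default=0.0) == 0.0          # optional key falls back
```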
4 changes: 4 additions & 0 deletions allennlp/data/vocabulary.py
@@ -277,6 +277,7 @@ def from_instances(
of what the other parameters do.
"""
logger.info("Fitting token dictionary from dataset.")
print("FROM INSTANCES")
padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
@@ -361,6 +362,7 @@ def from_files_and_instances(
"""
vocab = cls.from_files(directory, padding_token, oov_token)
logger.info("Fitting token dictionary from dataset.")
print("FROM FILES AND INSTANCES")
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(instances):
instance.count_vocab_items(namespace_token_counts)
@@ -374,6 +376,7 @@ def from_files_and_instances(
tokens_to_add=tokens_to_add,
min_pretrained_embeddings=min_pretrained_embeddings,
)
print(f"\n\nVOCABULARY: {vocab}\n\n")
return vocab

@classmethod
@@ -446,6 +449,7 @@ def set_from_file(

def extend_from_instances(self, instances: Iterable["adi.Instance"]) -> None:
logger.info("Fitting token dictionary from dataset.")
print("EXTEND FROM INSTANCES")
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(instances):
instance.count_vocab_items(namespace_token_counts)
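
The three vocabulary entry points instrumented above (`from_instances`, `from_files_and_instances`, and `extend_from_instances`) all build the same per-namespace token counter before constructing or extending the vocabulary. A rough standalone sketch of that counting step, using a toy instance class instead of AllenNLP's real `Instance`:

```python
from collections import defaultdict
from typing import Dict, Iterable, List


class ToyInstance:
    """Stand-in for an AllenNLP Instance; real instances delegate counting to their fields."""

    def __init__(self, tokens: List[str], namespace: str = "tokens") -> None:
        self.tokens = tokens
        self.namespace = namespace

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        # Same pattern as above: bump a per-namespace count for every token seen.
        for token in self.tokens:
            counter[self.namespace][token] += 1


def count_namespaces(instances: Iterable[ToyInstance]) -> Dict[str, Dict[str, int]]:
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in instances:
        instance.count_vocab_items(namespace_token_counts)
    return namespace_token_counts


counts = count_namespaces([ToyInstance(["a", "b", "a"]), ToyInstance(["b"])])
print(dict(counts["tokens"]))  # {'a': 2, 'b': 2}
```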
1 change: 1 addition & 0 deletions allennlp/models/basic_classifier.py
@@ -92,6 +92,7 @@ def __init__(
self._accuracy = CategoricalAccuracy()
self._loss = torch.nn.CrossEntropyLoss()
initializer(self)
+ self.extend_embedder_vocab()

def forward( # type: ignore
self, tokens: TextFieldTensors, label: torch.IntTensor = None
8 changes: 8 additions & 0 deletions allennlp/modules/token_embedders/embedding.py
@@ -241,6 +241,7 @@ def extend_vocab(
to give a helpful error message when extend_vocab is implicitly called
by train or any other command.
"""
print("\n\nEXTENDING VOCAB\n\n")
# Caveat: For allennlp v0.8.1 and below, we weren't storing vocab_namespace as an attribute,
# knowing which is necessary at time of embedding vocab extension. So old archive models are
# currently unextendable.
@@ -255,6 +256,13 @@ def extend_vocab(
return

extended_num_embeddings = extended_vocab.get_vocab_size(vocab_namespace)
print(f"VOCAB: {extended_vocab}")
print(f"NAMESPACE: {vocab_namespace}")
print(f"EXTENDED_NUM_EMBEDDINGS: {extended_num_embeddings}")
print(f"NUM EMBEDDINGS: {self.num_embeddings}")
import traceback
for line in traceback.format_stack()[:-5]:
print(line)
if extended_num_embeddings == self.num_embeddings:
# It's already been extended. No need to initialize / read pretrained file in first place (no-op)
return
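
The extra prints above dump the extension state and then the full call stack, which makes it possible to see which code path ended up calling `extend_vocab`. A minimal standalone version of that stack-printing trick (the diff above trims the last five frames; here only the helper itself is trimmed):

```python
import traceback


def report_caller() -> None:
    # format_stack() returns the live call stack as formatted frame strings;
    # dropping the last entry hides this helper's own frame.
    for line in traceback.format_stack()[:-1]:
        print(line, end="")


def extend_vocab_stub() -> None:
    report_caller()


extend_vocab_stub()  # prints every frame that led to this call
```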
6 changes: 3 additions & 3 deletions allennlp/tests/transfer_learning/fixtures/esnli.jsonnet
@@ -17,15 +17,15 @@ local seq_encoder = {
}
}
},
"train_data_path": "tests/fixtures/data/esnli_train.jsonl",
"validation_data_path": "tests/fixtures/data/esnli_train.jsonl",
"train_data_path": "allennlp/tests/fixtures/data/esnli_train.jsonl",
"validation_data_path": "allennlp/tests/fixtures/data/esnli_train.jsonl",
"model": {
"type": "esim",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "embedding",
"pretrained_file": "tests/fixtures/embeddings/glove.6B.100d.sample.txt.gz", //"https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
"pretrained_file": "allennlp/tests/fixtures/embeddings/glove.6B.100d.sample.txt.gz", //"https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
"embedding_dim": embedding_dim,
"trainable": true
}
9 changes: 6 additions & 3 deletions allennlp/tests/transfer_learning/fixtures/movies.jsonnet
@@ -12,9 +12,12 @@ local pretrained = function(module_path, frozen=false) {"_pretrained": {
}
},

"train_data_path": "tests/fixtures/data/movies_train.jsonl",
"validation_data_path": "tests/fixtures/data/movies_train.jsonl",

"train_data_path": "allennlp/tests/fixtures/data/movies_train.jsonl",
"validation_data_path": "allennlp/tests/fixtures/data/movies_train.jsonl",
"vocabulary": {
"type": "extend",
"directory": "/tmp/taskA/vocabulary"
},
"model": {
"type": "basic_classifier",
"text_field_embedder": pretrained("_text_field_embedder"),
@@ -11,12 +11,12 @@
class TransferLearningTest(ModelTestCase):
def setUp(self):
super().setUp()
- self.set_up_model('tests/fixtures/esnli.jsonnet',
- 'tests/fixtures/esnli_train.jsonl')
+ self.set_up_model('allennlp/tests/fixtures/esnli.jsonnet',
+ 'allennlp/tests/fixtures/esnli_train.jsonl')

def test_taskA_end_to_end(self):
train_model_from_file("tests/transfer_learning/fixtures/esnli.jsonnet", serialization_dir="/tmp/taskA", force=True)
train_model_from_file("allennlp/tests/transfer_learning/fixtures/esnli.jsonnet", serialization_dir="/tmp/taskA", force=True)

def test_taskB_end_to_end(self):
train_model_from_file("tests/transfer_learning/fixtures/movies.jsonnet", serialization_dir="/tmp/taskB", force=True)
train_model_from_file("allennlp/tests/transfer_learning/fixtures/movies.jsonnet", serialization_dir="/tmp/taskB", force=True)