summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorEvan Shelhamer <shelhamer@imaginarynumber.net>2015-05-30 00:26:27 -0700
committerEvan Shelhamer <shelhamer@imaginarynumber.net>2015-05-30 00:26:27 -0700
commit50ab52cbbf738cc92b0eda042661b8c0172774c1 (patch)
treebf7cf8ef583a1d747a6952109fe2b2db4e3fa18d /tools
parentd3cc58033065977b129f1244fe3d50c17b1e8503 (diff)
parent23d28fdef8ba936b2a6a4a123de625084faec1dd (diff)
downloadcaffeonacl-50ab52cbbf738cc92b0eda042661b8c0172774c1.tar.gz
caffeonacl-50ab52cbbf738cc92b0eda042661b8c0172774c1.tar.bz2
caffeonacl-50ab52cbbf738cc92b0eda042661b8c0172774c1.zip
Merge pull request #2350 from drdan14/log-parser-python-improved
Python log parser improvements
Diffstat (limited to 'tools')
-rwxr-xr-xtools/extra/parse_log.py165
1 file changed, 102 insertions, 63 deletions
diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py
index 16ba077a..09ea216c 100755
--- a/tools/extra/parse_log.py
+++ b/tools/extra/parse_log.py
@@ -3,7 +3,7 @@
"""
Parse training log
-Competitor to parse_log.sh
+Evolved from parse_log.sh
"""
import os
@@ -11,18 +11,7 @@ import re
import extract_seconds
import argparse
import csv
-
-
-def get_line_type(line):
- """Return either 'test' or 'train' depending on line type
- """
-
- line_type = None
- if line.find('Train') != -1:
- line_type = 'train'
- elif line.find('Test') != -1:
- line_type = 'test'
- return line_type
+from collections import OrderedDict
def parse_log(path_to_log):
@@ -36,70 +25,112 @@ def parse_log(path_to_log):
for the two dict_lists
"""
- re_iteration = re.compile('Iteration (\d+)')
- re_accuracy = re.compile('output #\d+: accuracy = ([\.\d]+)')
- re_train_loss = re.compile('Iteration \d+, loss = ([\.\d]+)')
- re_output_loss = re.compile('output #\d+: loss = ([\.\d]+)')
- re_lr = re.compile('lr = ([\.\d]+)')
+ regex_iteration = re.compile('Iteration (\d+)')
+ regex_train_output = re.compile('Train net output #(\d+): (\S+) = ([\.\deE+-]+)')
+ regex_test_output = re.compile('Test net output #(\d+): (\S+) = ([\.\deE+-]+)')
+ regex_learning_rate = re.compile('lr = ([\.\d]+)')
# Pick out lines of interest
iteration = -1
- test_accuracy = -1
learning_rate = float('NaN')
train_dict_list = []
test_dict_list = []
- train_dict_names = ('NumIters', 'Seconds', 'TrainingLoss', 'LearningRate')
- test_dict_names = ('NumIters', 'Seconds', 'TestAccuracy', 'TestLoss')
+ train_row = None
+ test_row = None
logfile_year = extract_seconds.get_log_created_year(path_to_log)
with open(path_to_log) as f:
start_time = extract_seconds.get_start_time(f, logfile_year)
for line in f:
- iteration_match = re_iteration.search(line)
+ iteration_match = regex_iteration.search(line)
if iteration_match:
iteration = float(iteration_match.group(1))
if iteration == -1:
- # Only look for other stuff if we've found the first iteration
+ # Only start parsing for other stuff if we've found the first
+ # iteration
continue
time = extract_seconds.extract_datetime_from_line(line,
logfile_year)
seconds = (time - start_time).total_seconds()
- lr_match = re_lr.search(line)
- if lr_match:
- learning_rate = float(lr_match.group(1))
-
- accuracy_match = re_accuracy.search(line)
- if accuracy_match and get_line_type(line) == 'test':
- test_accuracy = float(accuracy_match.group(1))
-
- train_loss_match = re_train_loss.search(line)
- if train_loss_match:
- train_loss = float(train_loss_match.group(1))
- train_dict_list.append({'NumIters': iteration,
- 'Seconds': seconds,
- 'TrainingLoss': train_loss,
- 'LearningRate': learning_rate})
-
- output_loss_match = re_output_loss.search(line)
- if output_loss_match and get_line_type(line) == 'test':
- test_loss = float(output_loss_match.group(1))
- # NOTE: we assume that (1) accuracy always comes right before
- # loss for test data so the test_accuracy variable is already
- # correctly populated and (2) there's one and only one output
- # named "accuracy" for the test net
- test_dict_list.append({'NumIters': iteration,
- 'Seconds': seconds,
- 'TestAccuracy': test_accuracy,
- 'TestLoss': test_loss})
-
- return train_dict_list, train_dict_names, test_dict_list, test_dict_names
-
-
-def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
- test_dict_list, test_dict_names, verbose=False):
+ learning_rate_match = regex_learning_rate.search(line)
+ if learning_rate_match:
+ learning_rate = float(learning_rate_match.group(1))
+
+ train_dict_list, train_row = parse_line_for_net_output(
+ regex_train_output, train_row, train_dict_list,
+ line, iteration, seconds, learning_rate
+ )
+ test_dict_list, test_row = parse_line_for_net_output(
+ regex_test_output, test_row, test_dict_list,
+ line, iteration, seconds, learning_rate
+ )
+
+ fix_initial_nan_learning_rate(train_dict_list)
+ fix_initial_nan_learning_rate(test_dict_list)
+
+ return train_dict_list, test_dict_list
+
+
+def parse_line_for_net_output(regex_obj, row, row_dict_list,
+ line, iteration, seconds, learning_rate):
+ """Parse a single line for training or test output
+
+    Returns a tuple with (row_dict_list, row)
+ row: may be either a new row or an augmented version of the current row
+ row_dict_list: may be either the current row_dict_list or an augmented
+ version of the current row_dict_list
+ """
+
+ output_match = regex_obj.search(line)
+ if output_match:
+ if not row or row['NumIters'] != iteration:
+ # Push the last row and start a new one
+ if row:
+ # If we're on a new iteration, push the last row
+ # This will probably only happen for the first row; otherwise
+ # the full row checking logic below will push and clear full
+ # rows
+ row_dict_list.append(row)
+
+ row = OrderedDict([
+ ('NumIters', iteration),
+ ('Seconds', seconds),
+ ('LearningRate', learning_rate)
+ ])
+
+ # output_num is not used; may be used in the future
+ # output_num = output_match.group(1)
+ output_name = output_match.group(2)
+ output_val = output_match.group(3)
+ row[output_name] = float(output_val)
+
+ if row and len(row_dict_list) >= 1 and len(row) == len(row_dict_list[0]):
+ # The row is full, based on the fact that it has the same number of
+ # columns as the first row; append it to the list
+ row_dict_list.append(row)
+ row = None
+
+ return row_dict_list, row
+
+
+def fix_initial_nan_learning_rate(dict_list):
+ """Correct initial value of learning rate
+
+ Learning rate is normally not printed until after the initial test and
+ training step, which means the initial testing and training rows have
+ LearningRate = NaN. Fix this by copying over the LearningRate from the
+ second row, if it exists.
+ """
+
+ if len(dict_list) > 1:
+ dict_list[0]['LearningRate'] = dict_list[1]['LearningRate']
+
+
+def save_csv_files(logfile_path, output_dir, train_dict_list, test_dict_list,
+ delimiter=',', verbose=False):
"""Save CSV files to output_dir
If the input log file is, e.g., caffe.INFO, the names will be
@@ -108,18 +139,22 @@ def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
log_basename = os.path.basename(logfile_path)
train_filename = os.path.join(output_dir, log_basename + '.train')
- write_csv(train_filename, train_dict_list, train_dict_names, verbose)
+ write_csv(train_filename, train_dict_list, delimiter, verbose)
test_filename = os.path.join(output_dir, log_basename + '.test')
- write_csv(test_filename, test_dict_list, test_dict_names, verbose)
+ write_csv(test_filename, test_dict_list, delimiter, verbose)
-def write_csv(output_filename, dict_list, header_names, verbose=False):
+def write_csv(output_filename, dict_list, delimiter, verbose=False):
"""Write a CSV file
"""
+ dialect = csv.excel
+ dialect.delimiter = delimiter
+
with open(output_filename, 'w') as f:
- dict_writer = csv.DictWriter(f, header_names)
+ dict_writer = csv.DictWriter(f, fieldnames=dict_list[0].keys(),
+ dialect=dialect)
dict_writer.writeheader()
dict_writer.writerows(dict_list)
if verbose:
@@ -141,16 +176,20 @@ def parse_args():
action='store_true',
help='Print some extra info (e.g., output filenames)')
+ parser.add_argument('--delimiter',
+ default=',',
+ help=('Column delimiter in output files '
+ '(default: \'%(default)s\')'))
+
args = parser.parse_args()
return args
def main():
args = parse_args()
- train_dict_list, train_dict_names, test_dict_list, test_dict_names = \
- parse_log(args.logfile_path)
+ train_dict_list, test_dict_list = parse_log(args.logfile_path)
save_csv_files(args.logfile_path, args.output_dir, train_dict_list,
- train_dict_names, test_dict_list, test_dict_names)
+ test_dict_list, delimiter=args.delimiter)
if __name__ == '__main__':