Merge pull request #5546 from cshallue/master

Improvements to AstroNet and add AstroWaveNet

Merge pull request #5546 from cshallue/master
Improvements to AstroNet and add AstroWaveNet
69b01644 · Chris Shallue · GitHub · 91b2debd · 763663de · 69b01644
Unverified Commit 69b01644 authored Oct 16, 2018 by Chris Shallue Committed by GitHub Oct 16, 2018
18 changed files
--- a/research/astronet/astrowavenet/data/synthetic_transits.py
+++ b/research/astronet/astrowavenet/data/synthetic_transits.py
+# Copyright 2018 The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Synthetic transit inputs to the AstroWaveNet model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import tensorflow as tf
+from astronet.util import configdict
+from astrowavenet.data import base
+from astrowavenet.data import synthetic_transit_maker
+def _prepare_wavenet_inputs(light_curve, mask):
+  """Gathers synthetic transits into the format expected by AstroWaveNet."""
+  return {
+      "autoregressive_input": tf.expand_dims(light_curve, -1),
+      "conditioning_stack": tf.expand_dims(mask, -1),
+  }
+class SyntheticTransits(base.DatasetBuilder):
+  """Synthetic transit inputs to the AstroWaveNet model."""
+  @staticmethod
+  def default_config():
+    return configdict.ConfigDict({
+        "period_range": (0.5, 4),
+        "amplitude_range": (1, 1),
+        "threshold_ratio_range": (0, 0.99),
+        "phase_range": (0, 1),
+        "noise_sd_range": (0.1, 0.1),
+        "mask_probability": 0.1,
+        "light_curve_time_range": (0, 100),
+        "light_curve_num_points": 1000
+    })
+  def build(self, batch_size):
+    transit_maker = synthetic_transit_maker.SyntheticTransitMaker(
+        period_range=self.config.period_range,
+        amplitude_range=self.config.amplitude_range,
+        threshold_ratio_range=self.config.threshold_ratio_range,
+        phase_range=self.config.phase_range,
+        noise_sd_range=self.config.noise_sd_range)
+    t_start, t_end = self.config.light_curve_time_range
+    time = np.linspace(t_start, t_end, self.config.light_curve_num_points)
+    dataset = tf.data.Dataset.from_generator(
+        transit_maker.random_light_curve_generator(
+            time, mask_prob=self.config.mask_probability),
+        output_types=(tf.float32, tf.float32),
+        output_shapes=(tf.TensorShape((self.config.light_curve_num_points,)),
+                       tf.TensorShape((self.config.light_curve_num_points,))))
+    dataset = dataset.map(_prepare_wavenet_inputs)
+    dataset = dataset.batch(batch_size, drop_remainder=True)
+    dataset = dataset.prefetch(-1)
+    return dataset
--- a/research/astronet/astrowavenet/data/test_data/test-dataset.tfrecord
+++ b/research/astronet/astrowavenet/data/test_data/test-dataset.tfrecord
--- a/research/astronet/astrowavenet/trainer.py
+++ b/research/astronet/astrowavenet/trainer.py
+# Copyright 2018 The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Script for training and evaluating AstroWaveNet models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import json
+import os.path
+from absl import flags
+import tensorflow as tf
+from astronet.util import config_util
+from astronet.util import configdict
+from astronet.util import estimator_runner
+from astrowavenet import astrowavenet_model
+from astrowavenet import configurations
+from astrowavenet.data import kepler_light_curves
+from astrowavenet.data import synthetic_transits
+from astrowavenet.util import estimator_util
+FLAGS = flags.FLAGS
+flags.DEFINE_enum("dataset", None,
+                  ["synthetic_transits", "kepler_light_curves"],
+                  "Dataset for training and/or evaluation.")
+flags.DEFINE_string("model_dir", None, "Base output directory.")
+flags.DEFINE_string(
+    "train_files", None,
+    "Comma-separated list of file patterns matching the TFRecord files in the "
+    "training dataset.")
+flags.DEFINE_string(
+    "eval_files", None,
+    "Comma-separated list of file patterns matching the TFRecord files in the "
+    "evaluation dataset.")
+flags.DEFINE_string("config_name", "base",
+                    "Name of the AstroWaveNet configuration.")
+flags.DEFINE_string(
+    "config_overrides", "{}",
+    "JSON string or JSON file containing overrides to the base configuration.")
+flags.DEFINE_enum("schedule", None,
+                  ["train", "train_and_eval", "continuous_eval"],
+                  "Schedule for running the model.")
+flags.DEFINE_string("eval_name", "val", "Name of the evaluation task.")
+flags.DEFINE_integer("train_steps", None, "Total number of steps for training.")
+flags.DEFINE_integer("eval_steps", None, "Number of steps for each evaluation.")
+flags.DEFINE_integer(
+    "local_eval_frequency", 1000,
+    "The number of training steps in between evaluation runs. Only applies "
+    "when schedule == 'train_and_eval'.")
+flags.DEFINE_integer("save_summary_steps", None,
+                     "The frequency at which to save model summaries.")
+flags.DEFINE_integer("save_checkpoints_steps", None,
+                     "The frequency at which to save model checkpoints.")
+flags.DEFINE_integer("save_checkpoints_secs", None,
+                     "The frequency at which to save model checkpoints.")
+flags.DEFINE_integer("keep_checkpoint_max", 1,
+                     "The maximum number of model checkpoints to keep.")
+# ------------------------------------------------------------------------------
+# TPU-only flags
+# ------------------------------------------------------------------------------
+flags.DEFINE_boolean("use_tpu", False, "Whether to execute on TPU.")
+flags.DEFINE_string("master", None, "Address of the TensorFlow TPU master.")
+flags.DEFINE_integer("tpu_num_shards", 8, "Number of TPU shards.")
+flags.DEFINE_integer("tpu_iterations_per_loop", 1000,
+                     "Number of iterations per TPU training loop.")
+flags.DEFINE_integer(
+    "eval_batch_size", None,
+    "Batch size for TPU evaluation. Defaults to the training batch size.")
+def _create_run_config():
+  """Creates a TPU RunConfig if FLAGS.use_tpu is True, else a RunConfig."""
+  session_config = tf.ConfigProto(allow_soft_placement=True)
+  run_config_kwargs = {
+      "save_summary_steps": FLAGS.save_summary_steps,
+      "save_checkpoints_steps": FLAGS.save_checkpoints_steps,
+      "save_checkpoints_secs": FLAGS.save_checkpoints_secs,
+      "session_config": session_config,
+      "keep_checkpoint_max": FLAGS.keep_checkpoint_max
+  }
+  if FLAGS.use_tpu:
+    if not FLAGS.master:
+      raise ValueError("FLAGS.master must be set for TPUEstimator.")
+    tpu_config = tf.contrib.tpu.TPUConfig(
+        iterations_per_loop=FLAGS.tpu_iterations_per_loop,
+        num_shards=FLAGS.tpu_num_shards,
+        per_host_input_for_training=(FLAGS.tpu_num_shards <= 8))
+    run_config = tf.contrib.tpu.RunConfig(
+        tpu_config=tpu_config, master=FLAGS.master, **run_config_kwargs)
+  else:
+    if FLAGS.master:
+      raise ValueError("FLAGS.master should only be set for TPUEstimator.")
+    run_config = tf.estimator.RunConfig(**run_config_kwargs)
+  return run_config
+def _get_file_pattern(mode):
+  """Gets the value of the file pattern flag for the specified mode."""
+  flag_name = ("train_files"
+               if mode == tf.estimator.ModeKeys.TRAIN else "eval_files")
+  file_pattern = FLAGS[flag_name].value
+  if file_pattern is None:
+    raise ValueError("--{} is required for mode '{}'".format(flag_name, mode))
+  return file_pattern
+def _create_dataset_builder(mode, config_overrides=None):
+  """Creates a dataset builder for the input pipeline."""
+  if FLAGS.dataset == "synthetic_transits":
+    return synthetic_transits.SyntheticTransits(config_overrides)
+  file_pattern = _get_file_pattern(mode)
+  if FLAGS.dataset == "kepler_light_curves":
+    builder_class = kepler_light_curves.KeplerLightCurves
+  else:
+    raise ValueError("Unsupported dataset: {}".format(FLAGS.dataset))
+  return builder_class(
+      file_pattern,
+      mode,
+      config_overrides=config_overrides,
+      use_tpu=FLAGS.use_tpu)
+def _create_input_fn(mode, config_overrides=None):
+  """Creates an Estimator input_fn."""
+  builder = _create_dataset_builder(mode, config_overrides)
+  tf.logging.info("Dataset config for mode '%s': %s", mode,
+                  config_util.to_json(builder.config))
+  return estimator_util.create_input_fn(builder)
+def _create_eval_args(config_overrides=None):
+  """Builds eval_args for estimator_runner.evaluate()."""
+  if FLAGS.dataset == "synthetic_transits" and not FLAGS.eval_steps:
+    raise ValueError("Dataset '{}' requires --eval_steps for evaluation".format(
+        FLAGS.dataset))
+  input_fn = _create_input_fn(tf.estimator.ModeKeys.EVAL, config_overrides)
+  return {FLAGS.eval_name: (input_fn, FLAGS.eval_steps)}
+def main(argv):
+  del argv  # Unused.
+  config = configdict.ConfigDict(configurations.get_config(FLAGS.config_name))
+  config_overrides = json.loads(FLAGS.config_overrides)
+  for key in config_overrides:
+    if key not in ["dataset", "hparams"]:
+      raise ValueError("Unrecognized config override: {}".format(key))
+  config.hparams.update(config_overrides.get("hparams", {}))
+  # Log configs.
+  configs_json = [
+      ("config_overrides", config_util.to_json(config_overrides)),
+      ("config", config_util.to_json(config)),
+  ]
+  for config_name, config_json in configs_json:
+    tf.logging.info("%s: %s", config_name, config_json)
+  # Create the estimator.
+  run_config = _create_run_config()
+  estimator = estimator_util.create_estimator(
+      astrowavenet_model.AstroWaveNet, config.hparams, run_config,
+      FLAGS.model_dir, FLAGS.eval_batch_size)
+  if FLAGS.schedule in ["train", "train_and_eval"]:
+    # Save configs.
+    tf.gfile.MakeDirs(FLAGS.model_dir)
+    for config_name, config_json in configs_json:
+      filename = os.path.join(FLAGS.model_dir, "{}.json".format(config_name))
+      with tf.gfile.Open(filename, "w") as f:
+        f.write(config_json)
+    train_input_fn = _create_input_fn(tf.estimator.ModeKeys.TRAIN,
+                                      config_overrides.get("dataset"))
+    train_hooks = []
+    if FLAGS.schedule == "train":
+      estimator.train(
+          train_input_fn, hooks=train_hooks, max_steps=FLAGS.train_steps)
+    else:
+      assert FLAGS.schedule == "train_and_eval"
+      eval_args = _create_eval_args(config_overrides.get("dataset"))
+      for _ in estimator_runner.continuous_train_and_eval(
+          estimator=estimator,
+          train_input_fn=train_input_fn,
+          eval_args=eval_args,
+          local_eval_frequency=FLAGS.local_eval_frequency,
+          train_hooks=train_hooks,
+          train_steps=FLAGS.train_steps):
+        # continuous_train_and_eval() yields evaluation metrics after each
+        # FLAGS.local_eval_frequency. It also saves and logs them, so we don't
+        # do anything here.
+        pass
+  else:
+    assert FLAGS.schedule == "continuous_eval"
+    eval_args = _create_eval_args(config_overrides.get("dataset"))
+    for _ in estimator_runner.continuous_eval(
+        estimator=estimator, eval_args=eval_args,
+        train_steps=FLAGS.train_steps):
+      # continuous_train_and_eval() yields evaluation metrics after each
+      # checkpoint. It also saves and logs them, so we don't do anything here.
+      pass
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  flags.mark_flags_as_required(["dataset", "model_dir", "schedule"])
+  def _validate_schedule(flag_values):
+    """Validates the --schedule flag and the flags it interacts with."""
+    schedule = flag_values["schedule"]
+    save_checkpoints_steps = flag_values["save_checkpoints_steps"]
+    save_checkpoints_secs = flag_values["save_checkpoints_secs"]
+    if schedule in ["train", "train_and_eval"]:
+      if not (save_checkpoints_steps or save_checkpoints_secs):
+        raise flags.ValidationError(
+            "--schedule='%s' requires --save_checkpoints_steps or "
+            "--save_checkpoints_secs." % schedule)
+    return True
+  flags.register_multi_flags_validator(
+      ["schedule", "save_checkpoints_steps", "save_checkpoints_secs"],
+      _validate_schedule)
+  tf.app.run()
--- a/research/astronet/astrowavenet/util/BUILD
+++ b/research/astronet/astrowavenet/util/BUILD
+package(default_visibility = ["//visibility:public"])
+licenses(["notice"])  # Apache 2.0
+py_library(
+    name = "estimator_util",
+    srcs = ["estimator_util.py"],
+    srcs_version = "PY2AND3",
+    deps = ["//astronet/ops:training"],
+)
--- a/research/astronet/astrowavenet/util/estimator_util.py
+++ b/research/astronet/astrowavenet/util/estimator_util.py
+# Copyright 2018 The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Helper functions for creating a TensorFlow Estimator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import copy
+import tensorflow as tf
+from astronet.ops import training
+class _InputFn(object):
+  """Class that acts as a callable input function for Estimator train / eval."""
+  def __init__(self, dataset_builder):
+    """Initializes the input function.
+    Args:
+      dataset_builder: Instance of DatasetBuilder.
+    """
+    self._builder = dataset_builder
+  def __call__(self, params):
+    """Builds the input pipeline."""
+    return self._builder.build(batch_size=params["batch_size"])
+def create_input_fn(dataset_builder):
+  """Creates an input_fn that that builds an input pipeline.
+  Args:
+    dataset_builder: Instance of DatasetBuilder.
+  Returns:
+    A callable that builds an input pipeline and returns a tf.data.Dataset
+    object.
+  """
+  return _InputFn(dataset_builder)
+class _ModelFn(object):
+  """Class that acts as a callable model function for Estimator train / eval."""
+  def __init__(self, model_class, hparams, use_tpu=False):
+    """Initializes the model function.
+    Args:
+      model_class: Model class.
+      hparams: A HParams object containing hyperparameters for building and
+        training the model.
+      use_tpu: If True, a TPUEstimator will be returned. Otherwise an Estimator
+        will be returned.
+    """
+    self._model_class = model_class
+    self._base_hparams = hparams
+    self._use_tpu = use_tpu
+  def __call__(self, features, mode, params):
+    """Builds the model and returns an EstimatorSpec or TPUEstimatorSpec."""
+    hparams = copy.deepcopy(self._base_hparams)
+    if "batch_size" in params:
+      hparams.batch_size = params["batch_size"]
+    model = self._model_class(features, hparams, mode)
+    model.build()
+    # Possibly create train_op.
+    use_tpu = self._use_tpu
+    train_op = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      learning_rate = training.create_learning_rate(hparams, model.global_step)
+      optimizer = training.create_optimizer(hparams, learning_rate, use_tpu)
+      train_op = training.create_train_op(model, optimizer)
+    if use_tpu:
+      estimator = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, loss=model.total_loss, train_op=train_op)
+    else:
+      estimator = tf.estimator.EstimatorSpec(
+          mode=mode, loss=model.total_loss, train_op=train_op)
+    return estimator
+def create_model_fn(model_class, hparams, use_tpu=False):
+  """Wraps model_class as an Estimator or TPUEstimator model_fn.
+  Args:
+    model_class: AstroModel or a subclass.
+    hparams: ConfigDict of configuration parameters for building the model.
+    use_tpu: If True, a TPUEstimator model_fn is returned. Otherwise an
+      Estimator model_fn is returned.
+  Returns:
+    model_fn: A callable that constructs the model and returns a
+      TPUEstimatorSpec if use_tpu is True, otherwise an EstimatorSpec.
+  """
+  return _ModelFn(model_class, hparams, use_tpu)
+def create_estimator(model_class,
+                     hparams,
+                     run_config=None,
+                     model_dir=None,
+                     eval_batch_size=None):
+  """Wraps model_class as an Estimator or TPUEstimator.
+  If run_config is None or a tf.estimator.RunConfig, an Estimator is returned.
+  If run_config is a tf.contrib.tpu.RunConfig, a TPUEstimator is returned.
+  Args:
+    model_class: AstroWaveNet or a subclass.
+    hparams: ConfigDict of configuration parameters for building the model.
+    run_config: Optional tf.estimator.RunConfig or tf.contrib.tpu.RunConfig.
+    model_dir: Optional directory for saving the model. If not passed
+      explicitly, it must be specified in run_config.
+    eval_batch_size: Optional batch size for evaluation on TPU. Only applicable
+      if run_config is a tf.contrib.tpu.RunConfig. Defaults to
+      hparams.batch_size.
+  Returns:
+    An Estimator object if run_config is None or a tf.estimator.RunConfig, or a
+    TPUEstimator object if run_config is a tf.contrib.tpu.RunConfig.
+  Raises:
+    ValueError:
+      If model_dir is not passed explicitly or in run_config.model_dir, or if
+      eval_batch_size is specified and run_config is not a
+      tf.contrib.tpu.RunConfig.
+  """
+  if run_config is None:
+    run_config = tf.estimator.RunConfig()
+  else:
+    run_config = copy.deepcopy(run_config)
+  if not model_dir and not run_config.model_dir:
+    raise ValueError(
+        "model_dir must be passed explicitly or specified in run_config")
+  use_tpu = isinstance(run_config, tf.contrib.tpu.RunConfig)
+  model_fn = create_model_fn(model_class, hparams, use_tpu)
+  if use_tpu:
+    eval_batch_size = eval_batch_size or hparams.batch_size
+    estimator = tf.contrib.tpu.TPUEstimator(
+        model_fn=model_fn,
+        model_dir=model_dir,
+        config=run_config,
+        train_batch_size=hparams.batch_size,
+        eval_batch_size=eval_batch_size)
+  else:
+    if eval_batch_size is not None:
+      raise ValueError("eval_batch_size can only be specified for TPU.")
+    estimator = tf.estimator.Estimator(
+        model_fn=model_fn,
+        model_dir=model_dir,
+        config=run_config,
+        params={"batch_size": hparams.batch_size})
+  return estimator
--- a/research/astronet/light_curve_util/BUILD
+++ b/research/astronet/light_curve_util/BUILD
@@ -6,6 +6,7 @@ py_library(
    name = "kepler_io",
    srcs = ["kepler_io.py"],
    srcs_version = "PY2AND3",
+    deps = [":util"],
 )
 py_test(

--- a/research/astronet/light_curve_util/cc/python/median_filter_test.py
+++ b/research/astronet/light_curve_util/cc/python/median_filter_test.py
@@ -44,5 +44,5 @@ class MedianFilterTest(absltest.TestCase):
    np.testing.assert_almost_equal(result, expected)
-if __name__ == '__main__':
+if __name__ == "__main__":
  absltest.main()
--- a/research/astronet/light_curve_util/cc/python/phase_fold_test.py
+++ b/research/astronet/light_curve_util/cc/python/phase_fold_test.py
@@ -66,5 +66,5 @@ class PhaseFoldAndSortLightCurveTest(absltest.TestCase):
    np.testing.assert_almost_equal(folded_flux, expected_flux)
-if __name__ == '__main__':
+if __name__ == "__main__":
  absltest.main()
--- a/research/astronet/light_curve_util/cc/python/postproc.py
+++ b/research/astronet/light_curve_util/cc/python/postproc.py
@@ -24,7 +24,8 @@ def ValueErrorOnFalse(ok, *output_args):
  """Raises ValueError if not ok, otherwise returns the output arguments."""
  n_outputs = len(output_args)
  if n_outputs < 2:
-    raise ValueError("Expected 2 or more output_args. Got: %d" % n_outputs)
+    raise ValueError(
+        "Expected 2 or more output_args. Got: {}".format(n_outputs))
  if not ok:
    error = output_args[-1]

--- a/research/astronet/light_curve_util/cc/python/view_generator_test.py
+++ b/research/astronet/light_curve_util/cc/python/view_generator_test.py
@@ -76,5 +76,5 @@ class ViewGeneratorTest(absltest.TestCase):
    np.testing.assert_almost_equal(result, expected)
-if __name__ == '__main__':
+if __name__ == "__main__":
  absltest.main()
--- a/research/astronet/light_curve_util/kepler_io.py
+++ b/research/astronet/light_curve_util/kepler_io.py
@@ -23,10 +23,9 @@ import os.path
 from astropy.io import fits
 import numpy as np
+from light_curve_util import util
 from tensorflow import gfile
-LONG_CADENCE_TIME_DELTA_DAYS = 0.02043422  # Approximately 29.4 minutes.
 # Quarter index to filename prefix for long cadence Kepler data.
 # Reference: https://archive.stsci.edu/kepler/software/get_kepler.py
 LONG_CADENCE_QUARTER_PREFIXES = {
@@ -73,6 +72,14 @@ SHORT_CADENCE_QUARTER_PREFIXES = {
    17: ["2013121191144", "2013131215648"]
 }
+# Quarter order for different scrambling procedures.
+# Page 9: https://ntrs.nasa.gov/archive/nasa/casi.ntrs.nasa.gov/20170009549.pdf.
+SIMULATED_DATA_SCRAMBLE_ORDERS = {
+    "SCR1": [0, 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4, 17],
+    "SCR2": [0, 1, 2, 3, 4, 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 17],
+    "SCR3": [0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 17],
+}
 def kepler_filenames(base_dir,
                     kep_id,
@@ -112,7 +119,7 @@ def kepler_filenames(base_dir,
    A list of filenames.
  """
  # Pad the Kepler id with zeros to length 9.
-  kep_id = "%.9d" % int(kep_id)
+  kep_id = "{:09d}".format(int(kep_id))
  quarter_prefixes, cadence_suffix = ((LONG_CADENCE_QUARTER_PREFIXES, "llc")
                                      if long_cadence else
@@ -128,11 +135,10 @@ def kepler_filenames(base_dir,
  for quarter in quarters:
    for quarter_prefix in quarter_prefixes[quarter]:
      if injected_group:
-        base_name = "kplr%s-%s_INJECTED-%s_%s.fits" % (kep_id, quarter_prefix,
+        base_name = "kplr{}-{}_INJECTED-{}_{}.fits".format(
-                                                       injected_group,
+            kep_id, quarter_prefix, injected_group, cadence_suffix)
-                                                       cadence_suffix)
      else:
-        base_name = "kplr%s-%s_%s.fits" % (kep_id, quarter_prefix,
+        base_name = "kplr{}-{}_{}.fits".format(kep_id, quarter_prefix,
                                               cadence_suffix)
      filename = os.path.join(base_dir, base_name)
      # Not all stars have data for all quarters.
@@ -142,40 +148,86 @@ def kepler_filenames(base_dir,
  return filenames
+def scramble_light_curve(all_time, all_flux, all_quarters, scramble_type):
+  """Scrambles a light curve according to a given scrambling procedure.
+  Args:
+    all_time: List holding arrays of time values, each containing a quarter of
+      time data.
+    all_flux: List holding arrays of flux values, each containing a quarter of
+      flux data.
+    all_quarters: List of integers specifying which quarters are present in
+      the light curve (max is 18: Q0...Q17).
+    scramble_type: String specifying the scramble order, one of {'SCR1', 'SCR2',
+      'SCR3'}.
+  Returns:
+    scr_flux: Scrambled flux values; the same list as the input flux in another
+      order.
+    scr_time: Time values, re-partitioned to match sizes of the scr_flux lists.
+  """
+  order = SIMULATED_DATA_SCRAMBLE_ORDERS[scramble_type]
+  scr_flux = []
+  for quarter in order:
+    # Ignore missing quarters in the scramble order.
+    if quarter in all_quarters:
+      scr_flux.append(all_flux[all_quarters.index(quarter)])
+  scr_time = util.reshard_arrays(all_time, scr_flux)
+  return scr_time, scr_flux
 def read_kepler_light_curve(filenames,
                            light_curve_extension="LIGHTCURVE",
-                            invert=False):
+                            scramble_type=None,
+                            interpolate_missing_time=False):
  """Reads time and flux measurements for a Kepler target star.
  Args:
    filenames: A list of .fits files containing time and flux measurements.
    light_curve_extension: Name of the HDU 1 extension containing light curves.
-    invert: Whether to invert the flux measurements by multiplying by -1.
+    scramble_type: What scrambling procedure to use: 'SCR1', 'SCR2', or 'SCR3'
+      (pg 9: https://exoplanetarchive.ipac.caltech.edu/docs/KSCI-19114-002.pdf).
+    interpolate_missing_time: Whether to interpolate missing (NaN) time values.
+      This should only affect the output if scramble_type is specified (NaN time
+      values typically come with NaN flux values, which are removed anyway, but
+      scrambing decouples NaN time values from NaN flux values).
  Returns:
    all_time: A list of numpy arrays; the time values of the light curve.
-    all_flux: A list of numpy arrays corresponding to the time arrays in
+    all_flux: A list of numpy arrays; the flux values of the light curve.
-        all_time.
  """
  all_time = []
  all_flux = []
+  all_quarters = []
  for filename in filenames:
    with fits.open(gfile.Open(filename, "rb")) as hdu_list:
+      quarter = hdu_list["PRIMARY"].header["QUARTER"]
      light_curve = hdu_list[light_curve_extension].data
    time = light_curve.TIME
    flux = light_curve.PDCSAP_FLUX
+    if not time.size:
+      continue  # No data.
-    # Remove NaN flux values.
+    # Possibly interpolate missing time values.
-    valid_indices = np.where(np.isfinite(flux))
+    if interpolate_missing_time:
-    time = time[valid_indices]
+      time = util.interpolate_missing_time(time, light_curve.CADENCENO)
-    flux = flux[valid_indices]
-    if invert:
-      flux *= -1
-    if time.size:
    all_time.append(time)
    all_flux.append(flux)
+    all_quarters.append(quarter)
+  if scramble_type:
+    all_time, all_flux = scramble_light_curve(all_time, all_flux, all_quarters,
+                                              scramble_type)
+  # Remove timestamps with NaN time or flux values.
+  for i, (time, flux) in enumerate(zip(all_time, all_flux)):
+    flux_and_time_finite = np.logical_and(np.isfinite(flux), np.isfinite(time))
+    all_time[i] = time[flux_and_time_finite]
+    all_flux[i] = flux[flux_and_time_finite]
  return all_time, all_flux
--- a/research/astronet/light_curve_util/kepler_io_test.py
+++ b/research/astronet/light_curve_util/kepler_io_test.py
@@ -19,8 +19,10 @@ from __future__ import division
 from __future__ import print_function
 import os.path
 from absl import flags
 from absl.testing import absltest
+import numpy as np
 from light_curve_util import kepler_io
@@ -34,6 +36,26 @@ class KeplerIoTest(absltest.TestCase):
  def setUp(self):
    self.data_dir = os.path.join(FLAGS.test_srcdir, _DATA_DIR)
+  def testScrambleLightCurve(self):
+    all_flux = [[11, 12], [21], [np.nan, np.nan, 33], [41, 42]]
+    all_time = [[101, 102], [201], [301, 302, 303], [401, 402]]
+    all_quarters = [3, 4, 7, 14]
+    scramble_type = "SCR1"  # New quarters order will be [14,7,3,4].
+    scr_time, scr_flux = kepler_io.scramble_light_curve(
+        all_time, all_flux, all_quarters, scramble_type)
+    # NaNs are not removed in this function.
+    gold_flux = [[41, 42], [np.nan, np.nan, 33], [11, 12], [21]]
+    gold_time = [[101, 102], [201, 301, 302], [303, 401], [402]]
+    self.assertEqual(len(gold_flux), len(scr_flux))
+    self.assertEqual(len(gold_time), len(scr_time))
+    for i in range(len(gold_flux)):
+      np.testing.assert_array_equal(gold_flux[i], scr_flux[i])
+      np.testing.assert_array_equal(gold_time[i], scr_time[i])
  def testKeplerFilenames(self):
    # All quarters.
    filenames = kepler_io.kepler_filenames(
@@ -100,15 +122,17 @@ class KeplerIoTest(absltest.TestCase):
    filenames = kepler_io.kepler_filenames(
        self.data_dir, 11442793, check_existence=True)
    expected_filenames = [
-        os.path.join(self.data_dir, "0114/011442793/kplr011442793-%s_llc.fits")
+        os.path.join(self.data_dir,
-        % q for q in ["2009350155506", "2010009091648", "2010174085026"]
+                     "0114/011442793/kplr011442793-{}_llc.fits".format(q))
+        for q in ["2009350155506", "2010009091648", "2010174085026"]
    ]
    self.assertItemsEqual(expected_filenames, filenames)
  def testReadKeplerLightCurve(self):
    filenames = [
-        os.path.join(self.data_dir, "0114/011442793/kplr011442793-%s_llc.fits")
+        os.path.join(self.data_dir,
-        % q for q in ["2009350155506", "2010009091648", "2010174085026"]
+                     "0114/011442793/kplr011442793-{}_llc.fits".format(q))
+        for q in ["2009350155506", "2010009091648", "2010174085026"]
    ]
    all_time, all_flux = kepler_io.read_kepler_light_curve(filenames)
    self.assertLen(all_time, 3)
@@ -120,6 +144,55 @@ class KeplerIoTest(absltest.TestCase):
    self.assertLen(all_time[2], 4486)
    self.assertLen(all_flux[2], 4486)
+    for time, flux in zip(all_time, all_flux):
+      self.assertTrue(np.isfinite(time).all())
+      self.assertTrue(np.isfinite(flux).all())
+  def testReadKeplerLightCurveScrambled(self):
+    filenames = [
+        os.path.join(self.data_dir,
+                     "0114/011442793/kplr011442793-{}_llc.fits".format(q))
+        for q in ["2009350155506", "2010009091648", "2010174085026"]
+    ]
+    all_time, all_flux = kepler_io.read_kepler_light_curve(
+        filenames, scramble_type="SCR1")
+    self.assertLen(all_time, 3)
+    self.assertLen(all_flux, 3)
+    # Arrays are shorter than above due to separation of time and flux NaNs.
+    self.assertLen(all_time[0], 4344)
+    self.assertLen(all_flux[0], 4344)
+    self.assertLen(all_time[1], 4041)
+    self.assertLen(all_flux[1], 4041)
+    self.assertLen(all_time[2], 1008)
+    self.assertLen(all_flux[2], 1008)
+    for time, flux in zip(all_time, all_flux):
+      self.assertTrue(np.isfinite(time).all())
+      self.assertTrue(np.isfinite(flux).all())
+  def testReadKeplerLightCurveScrambledInterpolateMissingTime(self):
+    filenames = [
+        os.path.join(self.data_dir,
+                     "0114/011442793/kplr011442793-{}_llc.fits".format(q))
+        for q in ["2009350155506", "2010009091648", "2010174085026"]
+    ]
+    all_time, all_flux = kepler_io.read_kepler_light_curve(
+        filenames, scramble_type="SCR1", interpolate_missing_time=True)
+    self.assertLen(all_time, 3)
+    self.assertLen(all_flux, 3)
+    self.assertLen(all_time[0], 4486)
+    self.assertLen(all_flux[0], 4486)
+    self.assertLen(all_time[1], 4134)
+    self.assertLen(all_flux[1], 4134)
+    self.assertLen(all_time[2], 1008)
+    self.assertLen(all_flux[2], 1008)
+    for time, flux in zip(all_time, all_flux):
+      self.assertTrue(np.isfinite(time).all())
+      self.assertTrue(np.isfinite(flux).all())
 if __name__ == "__main__":
  FLAGS.test_srcdir = ""

--- a/research/astronet/light_curve_util/median_filter.py
+++ b/research/astronet/light_curve_util/median_filter.py
@@ -51,35 +51,35 @@ def median_filter(x, y, num_bins, bin_width=None, x_min=None, x_max=None):
    ValueError: If an argument has an inappropriate value.
  """
  if num_bins < 2:
-    raise ValueError("num_bins must be at least 2. Got: %d" % num_bins)
+    raise ValueError("num_bins must be at least 2. Got: {}".format(num_bins))
  # Validate the lengths of x and y.
  x_len = len(x)
  if x_len < 2:
-    raise ValueError("len(x) must be at least 2. Got: %s" % x_len)
+    raise ValueError("len(x) must be at least 2. Got: {}".format(x_len))
  if x_len != len(y):
-    raise ValueError("len(x) (got: %d) must equal len(y) (got: %d)" % (x_len,
+    raise ValueError("len(x) (got: {}) must equal len(y) (got: {})".format(
-                                                                       len(y)))
+        x_len, len(y)))
  # Validate x_min and x_max.
  x_min = x_min if x_min is not None else x[0]
  x_max = x_max if x_max is not None else x[-1]
  if x_min >= x_max:
-    raise ValueError("x_min (got: %d) must be less than x_max (got: %d)" %
+    raise ValueError("x_min (got: {}) must be less than x_max (got: {})".format(
-                     (x_min, x_max))
+        x_min, x_max))
  if x_min > x[-1]:
    raise ValueError(
-        "x_min (got: %d) must be less than or equal to the largest value of x "
+        "x_min (got: {}) must be less than or equal to the largest value of x "
-        "(got: %d)" % (x_min, x[-1]))
+        "(got: {})".format(x_min, x[-1]))
  # Validate bin_width.
  bin_width = bin_width if bin_width is not None else (x_max - x_min) / num_bins
  if bin_width <= 0:
-    raise ValueError("bin_width must be positive. Got: %d" % bin_width)
+    raise ValueError("bin_width must be positive. Got: {}".format(bin_width))
  if bin_width >= x_max - x_min:
    raise ValueError(
-        "bin_width (got: %d) must be less than x_max - x_min (got: %d)" %
+        "bin_width (got: {}) must be less than x_max - x_min (got: {})".format(
-        (bin_width, x_max - x_min))
+            bin_width, x_max - x_min))
  bin_spacing = (x_max - x_min - bin_width) / (num_bins - 1)

--- a/research/astronet/light_curve_util/median_filter_test.py
+++ b/research/astronet/light_curve_util/median_filter_test.py
@@ -124,5 +124,5 @@ class MedianFilterTest(absltest.TestCase):
    np.testing.assert_array_equal([7, 1, 5, 2, 3], result)
-if __name__ == '__main__':
+if __name__ == "__main__":
  absltest.main()
--- a/research/astronet/light_curve_util/periodic_event.py
+++ b/research/astronet/light_curve_util/periodic_event.py
--- a/research/astronet/light_curve_util/util.py
+++ b/research/astronet/light_curve_util/util.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
+import scipy.interpolate
 from six.moves import range  # pylint:disable=redefined-builtin
@@ -130,6 +131,46 @@ def remove_events(all_time,
  return output_time, output_flux
+def interpolate_missing_time(time, cadence_no=None, fill_value="extrapolate"):
+  """Interpolates missing (NaN or Inf) time values.
+  Args:
+    time: A numpy array of monotonically increasing values, with missing values
+      denoted by NaN or Inf.
+    cadence_no: Optional numpy array of cadence numbers corresponding to the
+      time values. If not provided, missing time values are assumed to be evenly
+      spaced between present time values.
+    fill_value: Specifies how missing time values should be treated at the
+      beginning and end of the array. See scipy.interpolate.interp1d.
+  Returns:
+    A numpy array of the same length as the input time array, with NaN/Inf
+    values replaced with interpolated values.
+  Raises:
+    ValueError: If fewer than 2 values of time are finite.
+  """
+  if cadence_no is None:
+    cadence_no = np.arange(len(time))
+  is_finite = np.isfinite(time)
+  num_finite = np.sum(is_finite)
+  if num_finite < 2:
+    raise ValueError(
+        "Cannot interpolate time with fewer than 2 finite values. Got "
+        "len(time) = {} with {} finite values.".format(len(time), num_finite))
+  interpolate_fn = scipy.interpolate.interp1d(
+      cadence_no[is_finite],
+      time[is_finite],
+      copy=False,
+      bounds_error=False,
+      fill_value=fill_value,
+      assume_sorted=True)
+  return interpolate_fn(cadence_no)
 def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline):
  """Linearly interpolates spline values across masked points.
@@ -145,8 +186,8 @@ def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline):
        points linearly interpolated.
  """
  interp_spline = []
-  for time, masked_time, masked_spline in zip(
+  for time, masked_time, masked_spline in zip(all_time, all_masked_time,
-      all_time, all_masked_time, all_masked_spline):
+                                              all_masked_spline):
    if masked_time.size:
      interp_spline.append(np.interp(time, masked_time, masked_spline))
    else:
@@ -154,6 +195,76 @@ def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline):
  return interp_spline
+def reshard_arrays(xs, ys):
+  """Reshards arrays in xs to match the lengths of arrays in ys.
+  Args:
+    xs: List of 1d numpy arrays with the same total length as ys.
+    ys: List of 1d numpy arrays with the same total length as xs.
+  Returns:
+    A list of numpy arrays containing the same elements as xs, in the same
+    order, but with array lengths matching the pairwise array in ys.
+  Raises:
+    ValueError: If xs and ys do not have the same total length.
+  """
+  # Compute indices of boundaries between segments of ys, plus the end boundary.
+  boundaries = np.cumsum([len(y) for y in ys])
+  concat_x = np.concatenate(xs)
+  if len(concat_x) != boundaries[-1]:
+    raise ValueError(
+        "xs and ys do not have the same total length ({} vs. {}).".format(
+            len(concat_x), boundaries[-1]))
+  boundaries = boundaries[:-1]  # Remove exclusive end boundary.
+  return np.split(concat_x, boundaries)
+def uniform_cadence_light_curve(cadence_no, time, flux):
+  """Combines data into a single light curve with uniform cadence numbers.
+  Args:
+    cadence_no: numpy array; the cadence numbers of the light curve.
+    time: numpy array; the time values of the light curve.
+    flux: numpy array; the flux values of the light curve.
+  Returns:
+    cadence_no: numpy array; the cadence numbers of the light curve with no
+      gaps. It starts and ends at the minimum and maximum cadence numbers in the
+      input light curve, respectively.
+    time: numpy array; the time values of the light curve. Missing data points
+      have value zero and correspond to a False value in the mask.
+    flux: numpy array; the time values of the light curve. Missing data points
+      have value zero and correspond to a False value in the mask.
+    mask: Boolean numpy array; False indicates missing data points, where
+      missing data points are those that have no corresponding cadence number in
+      the input or those where at least one of the cadence number, time value,
+      or flux value is NaN/Inf.
+  Raises:
+    ValueError: If there are duplicate cadence numbers in the input.
+  """
+  min_cadence_no = np.min(cadence_no)
+  max_cadence_no = np.max(cadence_no)
+  out_cadence_no = np.arange(
+      min_cadence_no, max_cadence_no + 1, dtype=cadence_no.dtype)
+  out_time = np.zeros_like(out_cadence_no, dtype=time.dtype)
+  out_flux = np.zeros_like(out_cadence_no, dtype=flux.dtype)
+  out_mask = np.zeros_like(out_cadence_no, dtype=np.bool)
+  for c, t, f in zip(cadence_no, time, flux):
+    if np.isfinite(c) and np.isfinite(t) and np.isfinite(f):
+      i = int(c - min_cadence_no)
+      if out_mask[i]:
+        raise ValueError("Duplicate cadence number: {}".format(c))
+      out_time[i] = t
+      out_flux[i] = f
+      out_mask[i] = True
+  return out_cadence_no, out_time, out_flux, out_mask
 def count_transit_points(time, event):
  """Computes the number of points in each transit of a given event.
@@ -174,8 +285,8 @@ def count_transit_points(time, event):
  # Tiny periods or erroneous time values could make this loop take forever.
  if (t_max - t_min) / event.period > 10**6:
    raise ValueError(
-        "Too many transits! Time range is [%.2f, %.2f] and period is %.2e." %
+        "Too many transits! Time range is [{:.4f}, {:.4f}] and period is "
-        (t_min, t_max, event.period))
+        "{:.4e}.".format(t_min, t_max, event.period))
  # Make sure t0 is in [t_min, t_min + period).
  t0 = np.mod(event.t0 - t_min, event.period) + t_min

--- a/research/astronet/light_curve_util/util_test.py
+++ b/research/astronet/light_curve_util/util_test.py
@@ -176,6 +176,61 @@ class LightCurveUtilTest(absltest.TestCase):
    self.assertSequenceAlmostEqual([16, 17, 18, 19], output_time[0])
    self.assertSequenceAlmostEqual([160, 170, 180, 190], output_flux[0])
+  def testInterpolateMissingTime(self):
+    # Fewer than 2 finite values.
+    with self.assertRaises(ValueError):
+      util.interpolate_missing_time(np.array([]))
+    with self.assertRaises(ValueError):
+      util.interpolate_missing_time(np.array([5.0]))
+    with self.assertRaises(ValueError):
+      util.interpolate_missing_time(np.array([5.0, np.nan]))
+    with self.assertRaises(ValueError):
+      util.interpolate_missing_time(np.array([np.nan, np.nan, np.nan]))
+    # Small time arrays.
+    self.assertSequenceAlmostEqual([0.5, 0.6],
+                                   util.interpolate_missing_time(
+                                       np.array([0.5, 0.6])))
+    self.assertSequenceAlmostEqual([0.5, 0.6, 0.7],
+                                   util.interpolate_missing_time(
+                                       np.array([0.5, np.nan, 0.7])))
+    # Time array of length 20 with some values NaN.
+    time = np.array([
+        np.nan, 0.5, 1.0, 1.5, 2.0, 2.5, np.nan, 3.5, 4.0, 4.5, 5.0, np.nan,
+        np.nan, np.nan, np.nan, 7.5, 8.0, 8.5, np.nan, np.nan
+    ])
+    interp_time = util.interpolate_missing_time(time)
+    self.assertSequenceAlmostEqual([
+        0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
+        7.0, 7.5, 8.0, 8.5, 9.0, 9.5
+    ], interp_time)
+    # Fill with 0.0 for missing values at the beginning and end.
+    interp_time = util.interpolate_missing_time(time, fill_value=0.0)
+    self.assertSequenceAlmostEqual([
+        0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
+        7.0, 7.5, 8.0, 8.5, 0.0, 0.0
+    ], interp_time)
+    # Interpolate with cadences.
+    cadences = np.array([
+        100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+        114, 115, 116, 117, 118, 119
+    ])
+    interp_time = util.interpolate_missing_time(time, cadences)
+    self.assertSequenceAlmostEqual([
+        0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
+        7.0, 7.5, 8.0, 8.5, 9.0, 9.5
+    ], interp_time)
+    # Interpolate with missing cadences.
+    time = np.array([0.6, 0.7, np.nan, np.nan, np.nan, 1.3, 1.4, 1.5])
+    cadences = np.array([106, 107, 108, 109, 110, 113, 114, 115])
+    interp_time = util.interpolate_missing_time(time, cadences)
+    self.assertSequenceAlmostEqual([0.6, 0.7, 0.8, 0.9, 1.0, 1.3, 1.4, 1.5],
+                                   interp_time)
  def testInterpolateMaskedSpline(self):
    all_time = [
        np.arange(0, 10, dtype=np.float),
@@ -198,6 +253,53 @@ class LightCurveUtilTest(absltest.TestCase):
        [120, 122, 124, 126, 128, 130, 132, 132, 132, 132], interp_spline[1])
    self.assertTrue(np.all(np.isnan(interp_spline[2])))
+  def testReshardArrays(self):
+    xs = [
+        np.array([1, 2, 3]),
+        np.array([4]),
+        np.array([5, 6, 7, 8, 9]),
+        np.array([]),
+    ]
+    ys = [
+        np.array([]),
+        np.array([10, 20]),
+        np.array([30, 40, 50, 60]),
+        np.array([70]),
+        np.array([80, 90]),
+    ]
+    reshard_xs = util.reshard_arrays(xs, ys)
+    self.assertEqual(5, len(reshard_xs))
+    np.testing.assert_array_equal([], reshard_xs[0])
+    np.testing.assert_array_equal([1, 2], reshard_xs[1])
+    np.testing.assert_array_equal([3, 4, 5, 6], reshard_xs[2])
+    np.testing.assert_array_equal([7], reshard_xs[3])
+    np.testing.assert_array_equal([8, 9], reshard_xs[4])
+    with self.assertRaisesRegexp(ValueError,
+                                 "xs and ys do not have the same total length"):
+      util.reshard_arrays(xs, [np.array([10, 20, 30]), np.array([40, 50])])
+  def testUniformCadenceLightCurve(self):
+    input_cadence_no = np.array([13, 4, 5, 6, 8, 9, 11, 12])
+    input_time = np.array([130, 40, 50, 60, 80, 90, 110, 120])
+    input_flux = np.array([1300, 400, 500, 600, 800, np.nan, 1100, 1200])
+    cadence_no, time, flux, mask = util.uniform_cadence_light_curve(
+        input_cadence_no, input_time, input_flux)
+    np.testing.assert_array_equal([4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+                                  cadence_no)
+    np.testing.assert_array_equal([40, 50, 60, 0, 80, 0, 0, 110, 120, 130],
+                                  time)
+    np.testing.assert_array_equal(
+        [400, 500, 600, 0, 800, 0, 0, 1100, 1200, 1300], flux)
+    np.testing.assert_array_equal([1, 1, 1, 0, 1, 0, 0, 1, 1, 1], mask)
+    # Add duplicate cadence number.
+    input_cadence_no = np.concatenate([input_cadence_no, np.array([13, 14])])
+    input_time = np.concatenate([input_time, np.array([130, 140])])
+    input_flux = np.concatenate([input_flux, np.array([1300, 1400])])
+    with self.assertRaisesRegexp(ValueError, "Duplicate cadence number"):
+      util.uniform_cadence_light_curve(input_cadence_no, input_time, input_flux)
  def testCountTransitPoints(self):
    time = np.concatenate([
        np.arange(0, 10, 0.1, dtype=np.float),

--- a/research/astronet/third_party/kepler_spline/kepler_spline.py
+++ b/research/astronet/third_party/kepler_spline/kepler_spline.py