Commit ddee8c74 authored by Chris Shallue's avatar Chris Shallue Committed by Christopher Shallue
Browse files

Modularize light curve and TCE preprocessing functions for easier re-use.

PiperOrigin-RevId: 201839789
parent 672ac40b
......@@ -8,10 +8,11 @@ py_binary(
deps = [":preprocess"],
)
py_binary(
py_library(
name = "preprocess",
srcs = ["preprocess.py"],
deps = [
"//astronet/util:example_util",
"//light_curve_util:kepler_io",
"//light_curve_util:median_filter",
"//light_curve_util:util",
......
......@@ -131,25 +131,6 @@ _LABEL_COLUMN = "av_training_set"
_ALLOWED_LABELS = {"PC", "AFP", "NTP"}
def _set_float_feature(ex, name, value):
  """Sets the value of a float feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values, each coercible to float.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].float_list.value.extend([float(v) for v in value])
def _set_bytes_feature(ex, name, value):
  """Sets the value of a bytes feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values; each is str()-converted and latin-1 encoded.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].bytes_list.value.extend([
      str(v).encode("latin-1") for v in value])
def _set_int64_feature(ex, name, value):
  """Sets the value of an int64 feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values, each coercible to int.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].int64_list.value.extend([int(v) for v in value])
def _process_tce(tce):
"""Processes the light curve for a Kepler TCE and returns an Example proto.
......@@ -158,39 +139,11 @@ def _process_tce(tce):
Returns:
A tensorflow.train.Example proto containing TCE features.
Raises:
IOError: If the light curve files for this Kepler ID cannot be found.
"""
# Read and process the light curve.
time, flux = preprocess.read_and_process_light_curve(tce.kepid,
FLAGS.kepler_data_dir)
time, flux = preprocess.phase_fold_and_sort_light_curve(
time, flux, tce.tce_period, tce.tce_time0bk)
# Generate the local and global views.
global_view = preprocess.global_view(time, flux, tce.tce_period)
local_view = preprocess.local_view(time, flux, tce.tce_period,
tce.tce_duration)
# Make output proto.
ex = tf.train.Example()
# Set time series features.
_set_float_feature(ex, "global_view", global_view)
_set_float_feature(ex, "local_view", local_view)
# Set other columns.
for col_name, value in tce.items():
if np.issubdtype(type(value), np.integer):
_set_int64_feature(ex, col_name, [value])
else:
try:
_set_float_feature(ex, col_name, [float(value)])
except ValueError:
_set_bytes_feature(ex, col_name, [value])
return ex
all_time, all_flux = preprocess.read_light_curve(tce.kepid,
FLAGS.kepler_data_dir)
time, flux = preprocess.process_light_curve(all_time, all_flux)
return preprocess.generate_example_for_tce(time, flux, tce)
def _process_file_shard(tce_table, file_name):
......
......@@ -21,29 +21,28 @@ from __future__ import print_function
import numpy as np
import tensorflow as tf
from astronet.util import example_util
from light_curve_util import kepler_io
from light_curve_util import median_filter
from light_curve_util import util
from third_party.kepler_spline import kepler_spline
def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
"""Reads a light curve, fits a B-spline and divides the curve by the spline.
def read_light_curve(kepid, kepler_data_dir):
"""Reads a Kepler light curve.
Args:
kepid: Kepler id of the target star.
kepler_data_dir: Base directory containing Kepler data. See
kepler_io.kepler_filenames().
max_gap_width: Gap size (in days) above which the light curve is split for
the fitting of B-splines.
Returns:
time: 1D NumPy array; the time values of the light curve.
flux: 1D NumPy array; the normalized flux values of the light curve.
all_time: A list of numpy arrays; the time values of the raw light curve.
all_flux: A list of numpy arrays corresponding to the time arrays in
all_time.
Raises:
IOError: If the light curve files for this Kepler ID cannot be found.
ValueError: If the spline could not be fit.
"""
# Read the Kepler light curve.
file_names = kepler_io.kepler_filenames(kepler_data_dir, kepid)
......@@ -51,21 +50,26 @@ def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
raise IOError("Failed to find .fits files in %s for Kepler ID %s" %
(kepler_data_dir, kepid))
all_time, all_flux = kepler_io.read_kepler_light_curve(file_names)
return kepler_io.read_kepler_light_curve(file_names)
# Split on gaps.
all_time, all_flux = util.split(all_time, all_flux, gap_width=max_gap_width)
# Logarithmically sample candidate break point spacings between 0.5 and 20
# days.
bkspaces = np.logspace(np.log10(0.5), np.log10(20), num=20)
def process_light_curve(all_time, all_flux):
"""Removes low-frequency variability from a light curve.
# Generate spline.
spline = kepler_spline.choose_kepler_spline(
all_time, all_flux, bkspaces, penalty_coeff=1.0, verbose=False)[0]
Args:
all_time: A list of numpy arrays; the time values of the raw light curve.
all_flux: A list of numpy arrays corresponding to the time arrays in
all_time.
Returns:
time: 1D NumPy array; the time values of the light curve.
flux: 1D NumPy array; the normalized flux values of the light curve.
"""
# Split on gaps.
all_time, all_flux = util.split(all_time, all_flux, gap_width=0.75)
if spline is None:
raise ValueError("Failed to fit spline with Kepler ID %s", kepid)
# Fit a piecewise-cubic spline with default arguments.
spline = kepler_spline.fit_kepler_spline(all_time, all_flux, verbose=False)[0]
# Concatenate the piecewise light curve and spline.
time = np.concatenate(all_time)
......@@ -77,7 +81,6 @@ def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
# there. Instead we just remove them.
finite_i = np.isfinite(spline)
if not np.all(finite_i):
tf.logging.warn("Incomplete spline with Kepler ID %s", kepid)
time = time[finite_i]
flux = flux[finite_i]
spline = spline[finite_i]
......@@ -202,3 +205,38 @@ def local_view(time,
bin_width=duration * bin_width_factor,
t_min=max(-period / 2, -duration * num_durations),
t_max=min(period / 2, duration * num_durations))
def generate_example_for_tce(time, flux, tce):
  """Builds a tf.train.Example proto for a single input TCE.

  Args:
    time: 1D NumPy array of light-curve time values.
    flux: 1D NumPy array of normalized light-curve flux values.
    tce: Dict-like object containing at least the keys 'tce_period',
      'tce_duration' and 'tce_time0bk'. Every item it contains is copied into
      the output proto as a feature.

  Returns:
    A tf.train.Example with 'global_view' and 'local_view' time-series
    features plus one feature per entry of `tce`.
  """
  period = tce["tce_period"]
  duration = tce["tce_duration"]
  epoch = tce["tce_time0bk"]

  # Phase-fold the light curve about the transit epoch and sort by phase.
  folded_time, folded_flux = phase_fold_and_sort_light_curve(
      time, flux, period, epoch)

  # Compute the two binned representations of the folded curve.
  full_orbit_view = global_view(folded_time, folded_flux, period)
  transit_view = local_view(folded_time, folded_flux, period, duration)

  ex = tf.train.Example()
  example_util.set_float_feature(ex, "global_view", full_orbit_view)
  example_util.set_float_feature(ex, "local_view", transit_view)

  # Copy every column of the TCE row into the proto as a length-1 feature.
  for name, value in tce.items():
    example_util.set_feature(ex, name, [value])
  return ex
......@@ -159,8 +159,7 @@ def choose_kepler_spline(all_time,
Args:
all_time: List of 1D numpy arrays; the time values of the light curve.
all_flux: List of 1D numpy arrays; the flux (brightness) values of the light
curve.
all_flux: List of 1D numpy arrays; the flux values of the light curve.
bkspaces: List of break-point spacings to try.
maxiter: Maximum number of attempts to fit each spline after removing badly
fit points.
......@@ -276,3 +275,48 @@ def choose_kepler_spline(all_time,
]
return best_spline, metadata
def fit_kepler_spline(all_time,
                      all_flux,
                      bkspace_min=0.5,
                      bkspace_max=20,
                      bkspace_num=20,
                      maxiter=5,
                      penalty_coeff=1.0,
                      verbose=True):
  """Fits a Kepler spline, searching log-spaced breakpoint spacings.

  Args:
    all_time: List of 1D numpy arrays; the time values of the light curve.
    all_flux: List of 1D numpy arrays; the flux values of the light curve.
    bkspace_min: Minimum breakpoint spacing to try.
    bkspace_max: Maximum breakpoint spacing to try.
    bkspace_num: Number of breakpoint spacings to try.
    maxiter: Maximum number of attempts to fit each spline after removing
      badly fit points.
    penalty_coeff: Coefficient of the penalty term for using more parameters
      in the Bayesian Information Criterion. Decreasing this value will allow
      more parameters to be used (i.e. smaller break-point spacing), and
      vice-versa.
    verbose: Whether to log individual spline errors. Note that if many
      spacings are tried (particularly small ones) this may cause logging
      pollution when called for many light curves.

  Returns:
    spline: List of numpy arrays; values of the best-fit spline corresponding
      to the input flux arrays.
    metadata: Object containing metadata about the spline fit.
  """
  # Candidate break-point spacings, sampled uniformly in log space between
  # bkspace_min and bkspace_max.
  candidate_spacings = np.logspace(
      np.log10(bkspace_min), np.log10(bkspace_max), num=bkspace_num)

  best_spline, metadata = choose_kepler_spline(
      all_time,
      all_flux,
      candidate_spacings,
      maxiter=maxiter,
      penalty_coeff=penalty_coeff,
      verbose=verbose)
  return best_spline, metadata
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment