Commit ddee8c74 authored by Chris Shallue's avatar Chris Shallue Committed by Christopher Shallue
Browse files

Modularize light curve and TCE preprocessing functions for easier re-use.

PiperOrigin-RevId: 201839789
parent 672ac40b
......@@ -8,10 +8,11 @@ py_binary(
deps = [":preprocess"],
)
py_binary(
py_library(
name = "preprocess",
srcs = ["preprocess.py"],
deps = [
"//astronet/util:example_util",
"//light_curve_util:kepler_io",
"//light_curve_util:median_filter",
"//light_curve_util:util",
......
......@@ -131,25 +131,6 @@ _LABEL_COLUMN = "av_training_set"
_ALLOWED_LABELS = {"PC", "AFP", "NTP"}
def _set_float_feature(ex, name, value):
  """Sets the value of a float feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values, each coercible to float.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].float_list.value.extend([float(v) for v in value])
def _set_bytes_feature(ex, name, value):
  """Sets the value of a bytes feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values; each is str()-converted and latin-1 encoded.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].bytes_list.value.extend([
      str(v).encode("latin-1") for v in value])
def _set_int64_feature(ex, name, value):
  """Sets the value of an int64 feature in a tensorflow.train.Example proto.

  Args:
    ex: A tensorflow.train.Example proto, modified in place.
    name: Name of the feature to set; must not already exist in `ex`.
    value: Iterable of values, each coercible to int.

  Raises:
    ValueError: If `name` is already a feature in `ex`.
  """
  # Raise explicitly rather than using `assert`, which is stripped under -O.
  if name in ex.features.feature:
    raise ValueError("Duplicate feature: %s" % name)
  ex.features.feature[name].int64_list.value.extend([int(v) for v in value])
def _process_tce(tce):
"""Processes the light curve for a Kepler TCE and returns an Example proto.
......@@ -158,39 +139,11 @@ def _process_tce(tce):
Returns:
A tensorflow.train.Example proto containing TCE features.
Raises:
IOError: If the light curve files for this Kepler ID cannot be found.
"""
# Read and process the light curve.
time, flux = preprocess.read_and_process_light_curve(tce.kepid,
FLAGS.kepler_data_dir)
time, flux = preprocess.phase_fold_and_sort_light_curve(
time, flux, tce.tce_period, tce.tce_time0bk)
# Generate the local and global views.
global_view = preprocess.global_view(time, flux, tce.tce_period)
local_view = preprocess.local_view(time, flux, tce.tce_period,
tce.tce_duration)
# Make output proto.
ex = tf.train.Example()
# Set time series features.
_set_float_feature(ex, "global_view", global_view)
_set_float_feature(ex, "local_view", local_view)
# Set other columns.
for col_name, value in tce.items():
if np.issubdtype(type(value), np.integer):
_set_int64_feature(ex, col_name, [value])
else:
try:
_set_float_feature(ex, col_name, [float(value)])
except ValueError:
_set_bytes_feature(ex, col_name, [value])
return ex
all_time, all_flux = preprocess.read_light_curve(tce.kepid,
FLAGS.kepler_data_dir)
time, flux = preprocess.process_light_curve(all_time, all_flux)
return preprocess.generate_example_for_tce(time, flux, tce)
def _process_file_shard(tce_table, file_name):
......
......@@ -21,29 +21,28 @@ from __future__ import print_function
import numpy as np
import tensorflow as tf
from astronet.util import example_util
from light_curve_util import kepler_io
from light_curve_util import median_filter
from light_curve_util import util
from third_party.kepler_spline import kepler_spline
def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
"""Reads a light curve, fits a B-spline and divides the curve by the spline.
def read_light_curve(kepid, kepler_data_dir):
"""Reads a Kepler light curve.
Args:
kepid: Kepler id of the target star.
kepler_data_dir: Base directory containing Kepler data. See
kepler_io.kepler_filenames().
max_gap_width: Gap size (in days) above which the light curve is split for
the fitting of B-splines.
Returns:
time: 1D NumPy array; the time values of the light curve.
flux: 1D NumPy array; the normalized flux values of the light curve.
all_time: A list of numpy arrays; the time values of the raw light curve.
all_flux: A list of numpy arrays corresponding to the time arrays in
all_time.
Raises:
IOError: If the light curve files for this Kepler ID cannot be found.
ValueError: If the spline could not be fit.
"""
# Read the Kepler light curve.
file_names = kepler_io.kepler_filenames(kepler_data_dir, kepid)
......@@ -51,21 +50,26 @@ def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
raise IOError("Failed to find .fits files in %s for Kepler ID %s" %
(kepler_data_dir, kepid))
all_time, all_flux = kepler_io.read_kepler_light_curve(file_names)
return kepler_io.read_kepler_light_curve(file_names)
# Split on gaps.
all_time, all_flux = util.split(all_time, all_flux, gap_width=max_gap_width)
# Logarithmically sample candidate break point spacings between 0.5 and 20
# days.
bkspaces = np.logspace(np.log10(0.5), np.log10(20), num=20)
def process_light_curve(all_time, all_flux):
"""Removes low-frequency variability from a light curve.
# Generate spline.
spline = kepler_spline.choose_kepler_spline(
all_time, all_flux, bkspaces, penalty_coeff=1.0, verbose=False)[0]
Args:
all_time: A list of numpy arrays; the time values of the raw light curve.
all_flux: A list of numpy arrays corresponding to the time arrays in
all_time.
Returns:
time: 1D NumPy array; the time values of the light curve.
flux: 1D NumPy array; the normalized flux values of the light curve.
"""
# Split on gaps.
all_time, all_flux = util.split(all_time, all_flux, gap_width=0.75)
if spline is None:
raise ValueError("Failed to fit spline with Kepler ID %s", kepid)
# Fit a piecewise-cubic spline with default arguments.
spline = kepler_spline.fit_kepler_spline(all_time, all_flux, verbose=False)[0]
# Concatenate the piecewise light curve and spline.
time = np.concatenate(all_time)
......@@ -77,7 +81,6 @@ def read_and_process_light_curve(kepid, kepler_data_dir, max_gap_width=0.75):
# there. Instead we just remove them.
finite_i = np.isfinite(spline)
if not np.all(finite_i):
tf.logging.warn("Incomplete spline with Kepler ID %s", kepid)
time = time[finite_i]
flux = flux[finite_i]
spline = spline[finite_i]
......@@ -202,3 +205,38 @@ def local_view(time,
bin_width=duration * bin_width_factor,
t_min=max(-period / 2, -duration * num_durations),
t_max=min(period / 2, duration * num_durations))
def generate_example_for_tce(time, flux, tce):
  """Builds a tf.train.Example proto for a single input TCE.

  Args:
    time: 1D NumPy array of light-curve time values.
    flux: 1D NumPy array of normalized light-curve flux values.
    tce: Dict-like object containing at least the keys 'tce_period',
      'tce_duration' and 'tce_time0bk'. Every item it contains is copied into
      the output proto as a feature.

  Returns:
    A tf.train.Example with 'global_view' and 'local_view' time-series
    features plus one feature per entry of `tce`.
  """
  period = tce["tce_period"]
  duration = tce["tce_duration"]
  epoch = tce["tce_time0bk"]

  # Phase-fold the light curve about the transit epoch and sort by phase.
  folded_time, folded_flux = phase_fold_and_sort_light_curve(
      time, flux, period, epoch)

  # Compute the two binned representations of the folded curve.
  full_orbit_view = global_view(folded_time, folded_flux, period)
  transit_view = local_view(folded_time, folded_flux, period, duration)

  ex = tf.train.Example()
  example_util.set_float_feature(ex, "global_view", full_orbit_view)
  example_util.set_float_feature(ex, "local_view", transit_view)

  # Copy every column of the TCE row into the proto as a length-1 feature.
  for name, value in tce.items():
    example_util.set_feature(ex, name, [value])
  return ex
......@@ -159,8 +159,7 @@ def choose_kepler_spline(all_time,
Args:
all_time: List of 1D numpy arrays; the time values of the light curve.
all_flux: List of 1D numpy arrays; the flux (brightness) values of the light
curve.
all_flux: List of 1D numpy arrays; the flux values of the light curve.
bkspaces: List of break-point spacings to try.
maxiter: Maximum number of attempts to fit each spline after removing badly
fit points.
......@@ -276,3 +275,48 @@ def choose_kepler_spline(all_time,
]
return best_spline, metadata
def fit_kepler_spline(all_time,
                      all_flux,
                      bkspace_min=0.5,
                      bkspace_max=20,
                      bkspace_num=20,
                      maxiter=5,
                      penalty_coeff=1.0,
                      verbose=True):
  """Fits a Kepler spline, searching log-spaced breakpoint spacings.

  Args:
    all_time: List of 1D numpy arrays; the time values of the light curve.
    all_flux: List of 1D numpy arrays; the flux values of the light curve.
    bkspace_min: Minimum breakpoint spacing to try.
    bkspace_max: Maximum breakpoint spacing to try.
    bkspace_num: Number of breakpoint spacings to try.
    maxiter: Maximum number of attempts to fit each spline after removing
      badly fit points.
    penalty_coeff: Coefficient of the penalty term for using more parameters
      in the Bayesian Information Criterion. Decreasing this value will allow
      more parameters to be used (i.e. smaller break-point spacing), and
      vice-versa.
    verbose: Whether to log individual spline errors. Note that if many
      spacings are tried (particularly small ones) this may cause logging
      pollution when called for many light curves.

  Returns:
    spline: List of numpy arrays; values of the best-fit spline corresponding
      to the input flux arrays.
    metadata: Object containing metadata about the spline fit.
  """
  # Candidate break-point spacings, sampled uniformly in log space between
  # bkspace_min and bkspace_max.
  candidate_spacings = np.logspace(
      np.log10(bkspace_min), np.log10(bkspace_max), num=bkspace_num)

  best_spline, metadata = choose_kepler_spline(
      all_time,
      all_flux,
      candidate_spacings,
      maxiter=maxiter,
      penalty_coeff=penalty_coeff,
      verbose=verbose)
  return best_spline, metadata
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment