Unverified Commit 69b01644 authored by Chris Shallue's avatar Chris Shallue Committed by GitHub
Browse files

Merge pull request #5546 from cshallue/master

Improvements to AstroNet and add AstroWaveNet
parents 91b2debd 763663de
# Copyright 2018 The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Synthetic transit inputs to the AstroWaveNet model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from astronet.util import configdict
from astrowavenet.data import base
from astrowavenet.data import synthetic_transit_maker
def _prepare_wavenet_inputs(light_curve, mask):
"""Gathers synthetic transits into the format expected by AstroWaveNet."""
return {
"autoregressive_input": tf.expand_dims(light_curve, -1),
"conditioning_stack": tf.expand_dims(mask, -1),
}
class SyntheticTransits(base.DatasetBuilder):
"""Synthetic transit inputs to the AstroWaveNet model."""
@staticmethod
def default_config():
return configdict.ConfigDict({
"period_range": (0.5, 4),
"amplitude_range": (1, 1),
"threshold_ratio_range": (0, 0.99),
"phase_range": (0, 1),
"noise_sd_range": (0.1, 0.1),
"mask_probability": 0.1,
"light_curve_time_range": (0, 100),
"light_curve_num_points": 1000
})
def build(self, batch_size):
transit_maker = synthetic_transit_maker.SyntheticTransitMaker(
period_range=self.config.period_range,
amplitude_range=self.config.amplitude_range,
threshold_ratio_range=self.config.threshold_ratio_range,
phase_range=self.config.phase_range,
noise_sd_range=self.config.noise_sd_range)
t_start, t_end = self.config.light_curve_time_range
time = np.linspace(t_start, t_end, self.config.light_curve_num_points)
dataset = tf.data.Dataset.from_generator(
transit_maker.random_light_curve_generator(
time, mask_prob=self.config.mask_probability),
output_types=(tf.float32, tf.float32),
output_shapes=(tf.TensorShape((self.config.light_curve_num_points,)),
tf.TensorShape((self.config.light_curve_num_points,))))
dataset = dataset.map(_prepare_wavenet_inputs)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(-1)
return dataset
# Copyright 2018 The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script for training and evaluating AstroWaveNet models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os.path
from absl import flags
import tensorflow as tf
from astronet.util import config_util
from astronet.util import configdict
from astronet.util import estimator_runner
from astrowavenet import astrowavenet_model
from astrowavenet import configurations
from astrowavenet.data import kepler_light_curves
from astrowavenet.data import synthetic_transits
from astrowavenet.util import estimator_util
FLAGS = flags.FLAGS
flags.DEFINE_enum("dataset", None,
["synthetic_transits", "kepler_light_curves"],
"Dataset for training and/or evaluation.")
flags.DEFINE_string("model_dir", None, "Base output directory.")
flags.DEFINE_string(
"train_files", None,
"Comma-separated list of file patterns matching the TFRecord files in the "
"training dataset.")
flags.DEFINE_string(
"eval_files", None,
"Comma-separated list of file patterns matching the TFRecord files in the "
"evaluation dataset.")
flags.DEFINE_string("config_name", "base",
"Name of the AstroWaveNet configuration.")
flags.DEFINE_string(
"config_overrides", "{}",
"JSON string or JSON file containing overrides to the base configuration.")
flags.DEFINE_enum("schedule", None,
["train", "train_and_eval", "continuous_eval"],
"Schedule for running the model.")
flags.DEFINE_string("eval_name", "val", "Name of the evaluation task.")
flags.DEFINE_integer("train_steps", None, "Total number of steps for training.")
flags.DEFINE_integer("eval_steps", None, "Number of steps for each evaluation.")
flags.DEFINE_integer(
"local_eval_frequency", 1000,
"The number of training steps in between evaluation runs. Only applies "
"when schedule == 'train_and_eval'.")
flags.DEFINE_integer("save_summary_steps", None,
"The frequency at which to save model summaries.")
flags.DEFINE_integer("save_checkpoints_steps", None,
"The frequency at which to save model checkpoints.")
flags.DEFINE_integer("save_checkpoints_secs", None,
"The frequency at which to save model checkpoints.")
flags.DEFINE_integer("keep_checkpoint_max", 1,
"The maximum number of model checkpoints to keep.")
# ------------------------------------------------------------------------------
# TPU-only flags
# ------------------------------------------------------------------------------
flags.DEFINE_boolean("use_tpu", False, "Whether to execute on TPU.")
flags.DEFINE_string("master", None, "Address of the TensorFlow TPU master.")
flags.DEFINE_integer("tpu_num_shards", 8, "Number of TPU shards.")
flags.DEFINE_integer("tpu_iterations_per_loop", 1000,
"Number of iterations per TPU training loop.")
flags.DEFINE_integer(
"eval_batch_size", None,
"Batch size for TPU evaluation. Defaults to the training batch size.")
def _create_run_config():
"""Creates a TPU RunConfig if FLAGS.use_tpu is True, else a RunConfig."""
session_config = tf.ConfigProto(allow_soft_placement=True)
run_config_kwargs = {
"save_summary_steps": FLAGS.save_summary_steps,
"save_checkpoints_steps": FLAGS.save_checkpoints_steps,
"save_checkpoints_secs": FLAGS.save_checkpoints_secs,
"session_config": session_config,
"keep_checkpoint_max": FLAGS.keep_checkpoint_max
}
if FLAGS.use_tpu:
if not FLAGS.master:
raise ValueError("FLAGS.master must be set for TPUEstimator.")
tpu_config = tf.contrib.tpu.TPUConfig(
iterations_per_loop=FLAGS.tpu_iterations_per_loop,
num_shards=FLAGS.tpu_num_shards,
per_host_input_for_training=(FLAGS.tpu_num_shards <= 8))
run_config = tf.contrib.tpu.RunConfig(
tpu_config=tpu_config, master=FLAGS.master, **run_config_kwargs)
else:
if FLAGS.master:
raise ValueError("FLAGS.master should only be set for TPUEstimator.")
run_config = tf.estimator.RunConfig(**run_config_kwargs)
return run_config
def _get_file_pattern(mode):
"""Gets the value of the file pattern flag for the specified mode."""
flag_name = ("train_files"
if mode == tf.estimator.ModeKeys.TRAIN else "eval_files")
file_pattern = FLAGS[flag_name].value
if file_pattern is None:
raise ValueError("--{} is required for mode '{}'".format(flag_name, mode))
return file_pattern
def _create_dataset_builder(mode, config_overrides=None):
"""Creates a dataset builder for the input pipeline."""
if FLAGS.dataset == "synthetic_transits":
return synthetic_transits.SyntheticTransits(config_overrides)
file_pattern = _get_file_pattern(mode)
if FLAGS.dataset == "kepler_light_curves":
builder_class = kepler_light_curves.KeplerLightCurves
else:
raise ValueError("Unsupported dataset: {}".format(FLAGS.dataset))
return builder_class(
file_pattern,
mode,
config_overrides=config_overrides,
use_tpu=FLAGS.use_tpu)
def _create_input_fn(mode, config_overrides=None):
"""Creates an Estimator input_fn."""
builder = _create_dataset_builder(mode, config_overrides)
tf.logging.info("Dataset config for mode '%s': %s", mode,
config_util.to_json(builder.config))
return estimator_util.create_input_fn(builder)
def _create_eval_args(config_overrides=None):
"""Builds eval_args for estimator_runner.evaluate()."""
if FLAGS.dataset == "synthetic_transits" and not FLAGS.eval_steps:
raise ValueError("Dataset '{}' requires --eval_steps for evaluation".format(
FLAGS.dataset))
input_fn = _create_input_fn(tf.estimator.ModeKeys.EVAL, config_overrides)
return {FLAGS.eval_name: (input_fn, FLAGS.eval_steps)}
def main(argv):
del argv # Unused.
config = configdict.ConfigDict(configurations.get_config(FLAGS.config_name))
config_overrides = json.loads(FLAGS.config_overrides)
for key in config_overrides:
if key not in ["dataset", "hparams"]:
raise ValueError("Unrecognized config override: {}".format(key))
config.hparams.update(config_overrides.get("hparams", {}))
# Log configs.
configs_json = [
("config_overrides", config_util.to_json(config_overrides)),
("config", config_util.to_json(config)),
]
for config_name, config_json in configs_json:
tf.logging.info("%s: %s", config_name, config_json)
# Create the estimator.
run_config = _create_run_config()
estimator = estimator_util.create_estimator(
astrowavenet_model.AstroWaveNet, config.hparams, run_config,
FLAGS.model_dir, FLAGS.eval_batch_size)
if FLAGS.schedule in ["train", "train_and_eval"]:
# Save configs.
tf.gfile.MakeDirs(FLAGS.model_dir)
for config_name, config_json in configs_json:
filename = os.path.join(FLAGS.model_dir, "{}.json".format(config_name))
with tf.gfile.Open(filename, "w") as f:
f.write(config_json)
train_input_fn = _create_input_fn(tf.estimator.ModeKeys.TRAIN,
config_overrides.get("dataset"))
train_hooks = []
if FLAGS.schedule == "train":
estimator.train(
train_input_fn, hooks=train_hooks, max_steps=FLAGS.train_steps)
else:
assert FLAGS.schedule == "train_and_eval"
eval_args = _create_eval_args(config_overrides.get("dataset"))
for _ in estimator_runner.continuous_train_and_eval(
estimator=estimator,
train_input_fn=train_input_fn,
eval_args=eval_args,
local_eval_frequency=FLAGS.local_eval_frequency,
train_hooks=train_hooks,
train_steps=FLAGS.train_steps):
# continuous_train_and_eval() yields evaluation metrics after each
# FLAGS.local_eval_frequency. It also saves and logs them, so we don't
# do anything here.
pass
else:
assert FLAGS.schedule == "continuous_eval"
eval_args = _create_eval_args(config_overrides.get("dataset"))
for _ in estimator_runner.continuous_eval(
estimator=estimator, eval_args=eval_args,
train_steps=FLAGS.train_steps):
# continuous_train_and_eval() yields evaluation metrics after each
# checkpoint. It also saves and logs them, so we don't do anything here.
pass
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
flags.mark_flags_as_required(["dataset", "model_dir", "schedule"])
def _validate_schedule(flag_values):
"""Validates the --schedule flag and the flags it interacts with."""
schedule = flag_values["schedule"]
save_checkpoints_steps = flag_values["save_checkpoints_steps"]
save_checkpoints_secs = flag_values["save_checkpoints_secs"]
if schedule in ["train", "train_and_eval"]:
if not (save_checkpoints_steps or save_checkpoints_secs):
raise flags.ValidationError(
"--schedule='%s' requires --save_checkpoints_steps or "
"--save_checkpoints_secs." % schedule)
return True
flags.register_multi_flags_validator(
["schedule", "save_checkpoints_steps", "save_checkpoints_secs"],
_validate_schedule)
tf.app.run()
package(default_visibility = ["//visibility:public"])
licenses(["notice"]) # Apache 2.0
py_library(
name = "estimator_util",
srcs = ["estimator_util.py"],
srcs_version = "PY2AND3",
deps = ["//astronet/ops:training"],
)
# Copyright 2018 The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating a TensorFlow Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import tensorflow as tf
from astronet.ops import training
class _InputFn(object):
"""Class that acts as a callable input function for Estimator train / eval."""
def __init__(self, dataset_builder):
"""Initializes the input function.
Args:
dataset_builder: Instance of DatasetBuilder.
"""
self._builder = dataset_builder
def __call__(self, params):
"""Builds the input pipeline."""
return self._builder.build(batch_size=params["batch_size"])
def create_input_fn(dataset_builder):
"""Creates an input_fn that that builds an input pipeline.
Args:
dataset_builder: Instance of DatasetBuilder.
Returns:
A callable that builds an input pipeline and returns a tf.data.Dataset
object.
"""
return _InputFn(dataset_builder)
class _ModelFn(object):
"""Class that acts as a callable model function for Estimator train / eval."""
def __init__(self, model_class, hparams, use_tpu=False):
"""Initializes the model function.
Args:
model_class: Model class.
hparams: A HParams object containing hyperparameters for building and
training the model.
use_tpu: If True, a TPUEstimator will be returned. Otherwise an Estimator
will be returned.
"""
self._model_class = model_class
self._base_hparams = hparams
self._use_tpu = use_tpu
def __call__(self, features, mode, params):
"""Builds the model and returns an EstimatorSpec or TPUEstimatorSpec."""
hparams = copy.deepcopy(self._base_hparams)
if "batch_size" in params:
hparams.batch_size = params["batch_size"]
model = self._model_class(features, hparams, mode)
model.build()
# Possibly create train_op.
use_tpu = self._use_tpu
train_op = None
if mode == tf.estimator.ModeKeys.TRAIN:
learning_rate = training.create_learning_rate(hparams, model.global_step)
optimizer = training.create_optimizer(hparams, learning_rate, use_tpu)
train_op = training.create_train_op(model, optimizer)
if use_tpu:
estimator = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, loss=model.total_loss, train_op=train_op)
else:
estimator = tf.estimator.EstimatorSpec(
mode=mode, loss=model.total_loss, train_op=train_op)
return estimator
def create_model_fn(model_class, hparams, use_tpu=False):
"""Wraps model_class as an Estimator or TPUEstimator model_fn.
Args:
model_class: AstroModel or a subclass.
hparams: ConfigDict of configuration parameters for building the model.
use_tpu: If True, a TPUEstimator model_fn is returned. Otherwise an
Estimator model_fn is returned.
Returns:
model_fn: A callable that constructs the model and returns a
TPUEstimatorSpec if use_tpu is True, otherwise an EstimatorSpec.
"""
return _ModelFn(model_class, hparams, use_tpu)
def create_estimator(model_class,
hparams,
run_config=None,
model_dir=None,
eval_batch_size=None):
"""Wraps model_class as an Estimator or TPUEstimator.
If run_config is None or a tf.estimator.RunConfig, an Estimator is returned.
If run_config is a tf.contrib.tpu.RunConfig, a TPUEstimator is returned.
Args:
model_class: AstroWaveNet or a subclass.
hparams: ConfigDict of configuration parameters for building the model.
run_config: Optional tf.estimator.RunConfig or tf.contrib.tpu.RunConfig.
model_dir: Optional directory for saving the model. If not passed
explicitly, it must be specified in run_config.
eval_batch_size: Optional batch size for evaluation on TPU. Only applicable
if run_config is a tf.contrib.tpu.RunConfig. Defaults to
hparams.batch_size.
Returns:
An Estimator object if run_config is None or a tf.estimator.RunConfig, or a
TPUEstimator object if run_config is a tf.contrib.tpu.RunConfig.
Raises:
ValueError:
If model_dir is not passed explicitly or in run_config.model_dir, or if
eval_batch_size is specified and run_config is not a
tf.contrib.tpu.RunConfig.
"""
if run_config is None:
run_config = tf.estimator.RunConfig()
else:
run_config = copy.deepcopy(run_config)
if not model_dir and not run_config.model_dir:
raise ValueError(
"model_dir must be passed explicitly or specified in run_config")
use_tpu = isinstance(run_config, tf.contrib.tpu.RunConfig)
model_fn = create_model_fn(model_class, hparams, use_tpu)
if use_tpu:
eval_batch_size = eval_batch_size or hparams.batch_size
estimator = tf.contrib.tpu.TPUEstimator(
model_fn=model_fn,
model_dir=model_dir,
config=run_config,
train_batch_size=hparams.batch_size,
eval_batch_size=eval_batch_size)
else:
if eval_batch_size is not None:
raise ValueError("eval_batch_size can only be specified for TPU.")
estimator = tf.estimator.Estimator(
model_fn=model_fn,
model_dir=model_dir,
config=run_config,
params={"batch_size": hparams.batch_size})
return estimator
...@@ -6,6 +6,7 @@ py_library( ...@@ -6,6 +6,7 @@ py_library(
name = "kepler_io", name = "kepler_io",
srcs = ["kepler_io.py"], srcs = ["kepler_io.py"],
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [":util"],
) )
py_test( py_test(
......
...@@ -44,5 +44,5 @@ class MedianFilterTest(absltest.TestCase): ...@@ -44,5 +44,5 @@ class MedianFilterTest(absltest.TestCase):
np.testing.assert_almost_equal(result, expected) np.testing.assert_almost_equal(result, expected)
if __name__ == '__main__': if __name__ == "__main__":
absltest.main() absltest.main()
...@@ -66,5 +66,5 @@ class PhaseFoldAndSortLightCurveTest(absltest.TestCase): ...@@ -66,5 +66,5 @@ class PhaseFoldAndSortLightCurveTest(absltest.TestCase):
np.testing.assert_almost_equal(folded_flux, expected_flux) np.testing.assert_almost_equal(folded_flux, expected_flux)
if __name__ == '__main__': if __name__ == "__main__":
absltest.main() absltest.main()
...@@ -24,7 +24,8 @@ def ValueErrorOnFalse(ok, *output_args): ...@@ -24,7 +24,8 @@ def ValueErrorOnFalse(ok, *output_args):
"""Raises ValueError if not ok, otherwise returns the output arguments.""" """Raises ValueError if not ok, otherwise returns the output arguments."""
n_outputs = len(output_args) n_outputs = len(output_args)
if n_outputs < 2: if n_outputs < 2:
raise ValueError("Expected 2 or more output_args. Got: %d" % n_outputs) raise ValueError(
"Expected 2 or more output_args. Got: {}".format(n_outputs))
if not ok: if not ok:
error = output_args[-1] error = output_args[-1]
......
...@@ -76,5 +76,5 @@ class ViewGeneratorTest(absltest.TestCase): ...@@ -76,5 +76,5 @@ class ViewGeneratorTest(absltest.TestCase):
np.testing.assert_almost_equal(result, expected) np.testing.assert_almost_equal(result, expected)
if __name__ == '__main__': if __name__ == "__main__":
absltest.main() absltest.main()
...@@ -23,10 +23,9 @@ import os.path ...@@ -23,10 +23,9 @@ import os.path
from astropy.io import fits from astropy.io import fits
import numpy as np import numpy as np
from light_curve_util import util
from tensorflow import gfile from tensorflow import gfile
LONG_CADENCE_TIME_DELTA_DAYS = 0.02043422 # Approximately 29.4 minutes.
# Quarter index to filename prefix for long cadence Kepler data. # Quarter index to filename prefix for long cadence Kepler data.
# Reference: https://archive.stsci.edu/kepler/software/get_kepler.py # Reference: https://archive.stsci.edu/kepler/software/get_kepler.py
LONG_CADENCE_QUARTER_PREFIXES = { LONG_CADENCE_QUARTER_PREFIXES = {
...@@ -73,6 +72,14 @@ SHORT_CADENCE_QUARTER_PREFIXES = { ...@@ -73,6 +72,14 @@ SHORT_CADENCE_QUARTER_PREFIXES = {
17: ["2013121191144", "2013131215648"] 17: ["2013121191144", "2013131215648"]
} }
# Quarter order for different scrambling procedures.
# Page 9: https://ntrs.nasa.gov/archive/nasa/casi.ntrs.nasa.gov/20170009549.pdf.
SIMULATED_DATA_SCRAMBLE_ORDERS = {
"SCR1": [0, 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4, 17],
"SCR2": [0, 1, 2, 3, 4, 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 17],
"SCR3": [0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 17],
}
def kepler_filenames(base_dir, def kepler_filenames(base_dir,
kep_id, kep_id,
...@@ -98,21 +105,21 @@ def kepler_filenames(base_dir, ...@@ -98,21 +105,21 @@ def kepler_filenames(base_dir,
Args: Args:
base_dir: Base directory containing Kepler data. base_dir: Base directory containing Kepler data.
kep_id: Id of the Kepler target star. May be an int or a possibly zero- kep_id: Id of the Kepler target star. May be an int or a possibly zero-
padded string. padded string.
long_cadence: Whether to read a long cadence (~29.4 min / measurement) light long_cadence: Whether to read a long cadence (~29.4 min / measurement) light
curve as opposed to a short cadence (~1 min / measurement) light curve. curve as opposed to a short cadence (~1 min / measurement) light curve.
quarters: Optional list of integers in [0, 17]; the quarters of the Kepler quarters: Optional list of integers in [0, 17]; the quarters of the Kepler
mission to return. mission to return.
injected_group: Optional string indicating injected light curves. One of injected_group: Optional string indicating injected light curves. One of
"inj1", "inj2", "inj3". "inj1", "inj2", "inj3".
check_existence: If True, only return filenames corresponding to files that check_existence: If True, only return filenames corresponding to files that
exist (not all stars have data for all quarters). exist (not all stars have data for all quarters).
Returns: Returns:
A list of filenames. A list of filenames.
""" """
# Pad the Kepler id with zeros to length 9. # Pad the Kepler id with zeros to length 9.
kep_id = "%.9d" % int(kep_id) kep_id = "{:09d}".format(int(kep_id))
quarter_prefixes, cadence_suffix = ((LONG_CADENCE_QUARTER_PREFIXES, "llc") quarter_prefixes, cadence_suffix = ((LONG_CADENCE_QUARTER_PREFIXES, "llc")
if long_cadence else if long_cadence else
...@@ -128,12 +135,11 @@ def kepler_filenames(base_dir, ...@@ -128,12 +135,11 @@ def kepler_filenames(base_dir,
for quarter in quarters: for quarter in quarters:
for quarter_prefix in quarter_prefixes[quarter]: for quarter_prefix in quarter_prefixes[quarter]:
if injected_group: if injected_group:
base_name = "kplr%s-%s_INJECTED-%s_%s.fits" % (kep_id, quarter_prefix, base_name = "kplr{}-{}_INJECTED-{}_{}.fits".format(
injected_group, kep_id, quarter_prefix, injected_group, cadence_suffix)
cadence_suffix)
else: else:
base_name = "kplr%s-%s_%s.fits" % (kep_id, quarter_prefix, base_name = "kplr{}-{}_{}.fits".format(kep_id, quarter_prefix,
cadence_suffix) cadence_suffix)
filename = os.path.join(base_dir, base_name) filename = os.path.join(base_dir, base_name)
# Not all stars have data for all quarters. # Not all stars have data for all quarters.
if not check_existence or gfile.Exists(filename): if not check_existence or gfile.Exists(filename):
...@@ -142,40 +148,86 @@ def kepler_filenames(base_dir, ...@@ -142,40 +148,86 @@ def kepler_filenames(base_dir,
return filenames return filenames
def scramble_light_curve(all_time, all_flux, all_quarters, scramble_type):
"""Scrambles a light curve according to a given scrambling procedure.
Args:
all_time: List holding arrays of time values, each containing a quarter of
time data.
all_flux: List holding arrays of flux values, each containing a quarter of
flux data.
all_quarters: List of integers specifying which quarters are present in
the light curve (max is 18: Q0...Q17).
scramble_type: String specifying the scramble order, one of {'SCR1', 'SCR2',
'SCR3'}.
Returns:
scr_flux: Scrambled flux values; the same list as the input flux in another
order.
scr_time: Time values, re-partitioned to match sizes of the scr_flux lists.
"""
order = SIMULATED_DATA_SCRAMBLE_ORDERS[scramble_type]
scr_flux = []
for quarter in order:
# Ignore missing quarters in the scramble order.
if quarter in all_quarters:
scr_flux.append(all_flux[all_quarters.index(quarter)])
scr_time = util.reshard_arrays(all_time, scr_flux)
return scr_time, scr_flux
def read_kepler_light_curve(filenames, def read_kepler_light_curve(filenames,
light_curve_extension="LIGHTCURVE", light_curve_extension="LIGHTCURVE",
invert=False): scramble_type=None,
interpolate_missing_time=False):
"""Reads time and flux measurements for a Kepler target star. """Reads time and flux measurements for a Kepler target star.
Args: Args:
filenames: A list of .fits files containing time and flux measurements. filenames: A list of .fits files containing time and flux measurements.
light_curve_extension: Name of the HDU 1 extension containing light curves. light_curve_extension: Name of the HDU 1 extension containing light curves.
invert: Whether to invert the flux measurements by multiplying by -1. scramble_type: What scrambling procedure to use: 'SCR1', 'SCR2', or 'SCR3'
(pg 9: https://exoplanetarchive.ipac.caltech.edu/docs/KSCI-19114-002.pdf).
interpolate_missing_time: Whether to interpolate missing (NaN) time values.
This should only affect the output if scramble_type is specified (NaN time
values typically come with NaN flux values, which are removed anyway, but
scrambing decouples NaN time values from NaN flux values).
Returns: Returns:
all_time: A list of numpy arrays; the time values of the light curve. all_time: A list of numpy arrays; the time values of the light curve.
all_flux: A list of numpy arrays corresponding to the time arrays in all_flux: A list of numpy arrays; the flux values of the light curve.
all_time.
""" """
all_time = [] all_time = []
all_flux = [] all_flux = []
all_quarters = []
for filename in filenames: for filename in filenames:
with fits.open(gfile.Open(filename, "rb")) as hdu_list: with fits.open(gfile.Open(filename, "rb")) as hdu_list:
quarter = hdu_list["PRIMARY"].header["QUARTER"]
light_curve = hdu_list[light_curve_extension].data light_curve = hdu_list[light_curve_extension].data
time = light_curve.TIME
flux = light_curve.PDCSAP_FLUX
# Remove NaN flux values. time = light_curve.TIME
valid_indices = np.where(np.isfinite(flux)) flux = light_curve.PDCSAP_FLUX
time = time[valid_indices] if not time.size:
flux = flux[valid_indices] continue # No data.
# Possibly interpolate missing time values.
if interpolate_missing_time:
time = util.interpolate_missing_time(time, light_curve.CADENCENO)
all_time.append(time)
all_flux.append(flux)
all_quarters.append(quarter)
if invert: if scramble_type:
flux *= -1 all_time, all_flux = scramble_light_curve(all_time, all_flux, all_quarters,
scramble_type)
if time.size: # Remove timestamps with NaN time or flux values.
all_time.append(time) for i, (time, flux) in enumerate(zip(all_time, all_flux)):
all_flux.append(flux) flux_and_time_finite = np.logical_and(np.isfinite(flux), np.isfinite(time))
all_time[i] = time[flux_and_time_finite]
all_flux[i] = flux[flux_and_time_finite]
return all_time, all_flux return all_time, all_flux
...@@ -19,8 +19,10 @@ from __future__ import division ...@@ -19,8 +19,10 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import os.path import os.path
from absl import flags from absl import flags
from absl.testing import absltest from absl.testing import absltest
import numpy as np
from light_curve_util import kepler_io from light_curve_util import kepler_io
...@@ -34,6 +36,26 @@ class KeplerIoTest(absltest.TestCase): ...@@ -34,6 +36,26 @@ class KeplerIoTest(absltest.TestCase):
def setUp(self): def setUp(self):
self.data_dir = os.path.join(FLAGS.test_srcdir, _DATA_DIR) self.data_dir = os.path.join(FLAGS.test_srcdir, _DATA_DIR)
def testScrambleLightCurve(self):
all_flux = [[11, 12], [21], [np.nan, np.nan, 33], [41, 42]]
all_time = [[101, 102], [201], [301, 302, 303], [401, 402]]
all_quarters = [3, 4, 7, 14]
scramble_type = "SCR1" # New quarters order will be [14,7,3,4].
scr_time, scr_flux = kepler_io.scramble_light_curve(
all_time, all_flux, all_quarters, scramble_type)
# NaNs are not removed in this function.
gold_flux = [[41, 42], [np.nan, np.nan, 33], [11, 12], [21]]
gold_time = [[101, 102], [201, 301, 302], [303, 401], [402]]
self.assertEqual(len(gold_flux), len(scr_flux))
self.assertEqual(len(gold_time), len(scr_time))
for i in range(len(gold_flux)):
np.testing.assert_array_equal(gold_flux[i], scr_flux[i])
np.testing.assert_array_equal(gold_time[i], scr_time[i])
def testKeplerFilenames(self): def testKeplerFilenames(self):
# All quarters. # All quarters.
filenames = kepler_io.kepler_filenames( filenames = kepler_io.kepler_filenames(
...@@ -100,15 +122,17 @@ class KeplerIoTest(absltest.TestCase): ...@@ -100,15 +122,17 @@ class KeplerIoTest(absltest.TestCase):
filenames = kepler_io.kepler_filenames( filenames = kepler_io.kepler_filenames(
self.data_dir, 11442793, check_existence=True) self.data_dir, 11442793, check_existence=True)
expected_filenames = [ expected_filenames = [
os.path.join(self.data_dir, "0114/011442793/kplr011442793-%s_llc.fits") os.path.join(self.data_dir,
% q for q in ["2009350155506", "2010009091648", "2010174085026"] "0114/011442793/kplr011442793-{}_llc.fits".format(q))
for q in ["2009350155506", "2010009091648", "2010174085026"]
] ]
self.assertItemsEqual(expected_filenames, filenames) self.assertItemsEqual(expected_filenames, filenames)
def testReadKeplerLightCurve(self): def testReadKeplerLightCurve(self):
filenames = [ filenames = [
os.path.join(self.data_dir, "0114/011442793/kplr011442793-%s_llc.fits") os.path.join(self.data_dir,
% q for q in ["2009350155506", "2010009091648", "2010174085026"] "0114/011442793/kplr011442793-{}_llc.fits".format(q))
for q in ["2009350155506", "2010009091648", "2010174085026"]
] ]
all_time, all_flux = kepler_io.read_kepler_light_curve(filenames) all_time, all_flux = kepler_io.read_kepler_light_curve(filenames)
self.assertLen(all_time, 3) self.assertLen(all_time, 3)
...@@ -120,6 +144,55 @@ class KeplerIoTest(absltest.TestCase): ...@@ -120,6 +144,55 @@ class KeplerIoTest(absltest.TestCase):
self.assertLen(all_time[2], 4486) self.assertLen(all_time[2], 4486)
self.assertLen(all_flux[2], 4486) self.assertLen(all_flux[2], 4486)
for time, flux in zip(all_time, all_flux):
self.assertTrue(np.isfinite(time).all())
self.assertTrue(np.isfinite(flux).all())
def testReadKeplerLightCurveScrambled(self):
filenames = [
os.path.join(self.data_dir,
"0114/011442793/kplr011442793-{}_llc.fits".format(q))
for q in ["2009350155506", "2010009091648", "2010174085026"]
]
all_time, all_flux = kepler_io.read_kepler_light_curve(
filenames, scramble_type="SCR1")
self.assertLen(all_time, 3)
self.assertLen(all_flux, 3)
# Arrays are shorter than above due to separation of time and flux NaNs.
self.assertLen(all_time[0], 4344)
self.assertLen(all_flux[0], 4344)
self.assertLen(all_time[1], 4041)
self.assertLen(all_flux[1], 4041)
self.assertLen(all_time[2], 1008)
self.assertLen(all_flux[2], 1008)
for time, flux in zip(all_time, all_flux):
self.assertTrue(np.isfinite(time).all())
self.assertTrue(np.isfinite(flux).all())
def testReadKeplerLightCurveScrambledInterpolateMissingTime(self):
filenames = [
os.path.join(self.data_dir,
"0114/011442793/kplr011442793-{}_llc.fits".format(q))
for q in ["2009350155506", "2010009091648", "2010174085026"]
]
all_time, all_flux = kepler_io.read_kepler_light_curve(
filenames, scramble_type="SCR1", interpolate_missing_time=True)
self.assertLen(all_time, 3)
self.assertLen(all_flux, 3)
self.assertLen(all_time[0], 4486)
self.assertLen(all_flux[0], 4486)
self.assertLen(all_time[1], 4134)
self.assertLen(all_flux[1], 4134)
self.assertLen(all_time[2], 1008)
self.assertLen(all_flux[2], 1008)
for time, flux in zip(all_time, all_flux):
self.assertTrue(np.isfinite(time).all())
self.assertTrue(np.isfinite(flux).all())
if __name__ == "__main__": if __name__ == "__main__":
FLAGS.test_srcdir = "" FLAGS.test_srcdir = ""
......
...@@ -32,16 +32,16 @@ def median_filter(x, y, num_bins, bin_width=None, x_min=None, x_max=None): ...@@ -32,16 +32,16 @@ def median_filter(x, y, num_bins, bin_width=None, x_min=None, x_max=None):
Args: Args:
x: 1D array of x-coordinates sorted in ascending order. Must have at least 2 x: 1D array of x-coordinates sorted in ascending order. Must have at least 2
elements, and all elements cannot be the same value. elements, and all elements cannot be the same value.
y: 1D array of y-coordinates with the same size as x. y: 1D array of y-coordinates with the same size as x.
num_bins: The number of intervals to divide the x-axis into. Must be at num_bins: The number of intervals to divide the x-axis into. Must be at
least 2. least 2.
bin_width: The width of each bin on the x-axis. Must be positive, and less bin_width: The width of each bin on the x-axis. Must be positive, and less
than x_max - x_min. Defaults to (x_max - x_min) / num_bins. than x_max - x_min. Defaults to (x_max - x_min) / num_bins.
x_min: The inclusive leftmost value to consider on the x-axis. Must be less x_min: The inclusive leftmost value to consider on the x-axis. Must be less
than or equal to the largest value of x. Defaults to min(x). than or equal to the largest value of x. Defaults to min(x).
x_max: The exclusive rightmost value to consider on the x-axis. Must be x_max: The exclusive rightmost value to consider on the x-axis. Must be
greater than x_min. Defaults to max(x). greater than x_min. Defaults to max(x).
Returns: Returns:
1D NumPy array of size num_bins containing the median y-values of uniformly 1D NumPy array of size num_bins containing the median y-values of uniformly
...@@ -51,35 +51,35 @@ def median_filter(x, y, num_bins, bin_width=None, x_min=None, x_max=None): ...@@ -51,35 +51,35 @@ def median_filter(x, y, num_bins, bin_width=None, x_min=None, x_max=None):
ValueError: If an argument has an inappropriate value. ValueError: If an argument has an inappropriate value.
""" """
if num_bins < 2: if num_bins < 2:
raise ValueError("num_bins must be at least 2. Got: %d" % num_bins) raise ValueError("num_bins must be at least 2. Got: {}".format(num_bins))
# Validate the lengths of x and y. # Validate the lengths of x and y.
x_len = len(x) x_len = len(x)
if x_len < 2: if x_len < 2:
raise ValueError("len(x) must be at least 2. Got: %s" % x_len) raise ValueError("len(x) must be at least 2. Got: {}".format(x_len))
if x_len != len(y): if x_len != len(y):
raise ValueError("len(x) (got: %d) must equal len(y) (got: %d)" % (x_len, raise ValueError("len(x) (got: {}) must equal len(y) (got: {})".format(
len(y))) x_len, len(y)))
# Validate x_min and x_max. # Validate x_min and x_max.
x_min = x_min if x_min is not None else x[0] x_min = x_min if x_min is not None else x[0]
x_max = x_max if x_max is not None else x[-1] x_max = x_max if x_max is not None else x[-1]
if x_min >= x_max: if x_min >= x_max:
raise ValueError("x_min (got: %d) must be less than x_max (got: %d)" % raise ValueError("x_min (got: {}) must be less than x_max (got: {})".format(
(x_min, x_max)) x_min, x_max))
if x_min > x[-1]: if x_min > x[-1]:
raise ValueError( raise ValueError(
"x_min (got: %d) must be less than or equal to the largest value of x " "x_min (got: {}) must be less than or equal to the largest value of x "
"(got: %d)" % (x_min, x[-1])) "(got: {})".format(x_min, x[-1]))
# Validate bin_width. # Validate bin_width.
bin_width = bin_width if bin_width is not None else (x_max - x_min) / num_bins bin_width = bin_width if bin_width is not None else (x_max - x_min) / num_bins
if bin_width <= 0: if bin_width <= 0:
raise ValueError("bin_width must be positive. Got: %d" % bin_width) raise ValueError("bin_width must be positive. Got: {}".format(bin_width))
if bin_width >= x_max - x_min: if bin_width >= x_max - x_min:
raise ValueError( raise ValueError(
"bin_width (got: %d) must be less than x_max - x_min (got: %d)" % "bin_width (got: {}) must be less than x_max - x_min (got: {})".format(
(bin_width, x_max - x_min)) bin_width, x_max - x_min))
bin_spacing = (x_max - x_min - bin_width) / (num_bins - 1) bin_spacing = (x_max - x_min - bin_width) / (num_bins - 1)
......
...@@ -124,5 +124,5 @@ class MedianFilterTest(absltest.TestCase): ...@@ -124,5 +124,5 @@ class MedianFilterTest(absltest.TestCase):
np.testing.assert_array_equal([7, 1, 5, 2, 3], result) np.testing.assert_array_equal([7, 1, 5, 2, 3], result)
if __name__ == '__main__': if __name__ == "__main__":
absltest.main() absltest.main()
...@@ -62,7 +62,7 @@ class Event(object): ...@@ -62,7 +62,7 @@ class Event(object):
other_event: An Event. other_event: An Event.
period_rtol: Relative tolerance in matching the periods. period_rtol: Relative tolerance in matching the periods.
t0_durations: Tolerance in matching the t0 values, in units of the other t0_durations: Tolerance in matching the t0 values, in units of the other
Event's duration. Event's duration.
Returns: Returns:
True if this Event is the same as other_event, within the given tolerance. True if this Event is the same as other_event, within the given tolerance.
......
...@@ -19,6 +19,7 @@ from __future__ import division ...@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import scipy.interpolate
from six.moves import range # pylint:disable=redefined-builtin from six.moves import range # pylint:disable=redefined-builtin
...@@ -50,9 +51,9 @@ def split(all_time, all_flux, gap_width=0.75): ...@@ -50,9 +51,9 @@ def split(all_time, all_flux, gap_width=0.75):
Args: Args:
all_time: Numpy array or sequence of numpy arrays; each is a sequence of all_time: Numpy array or sequence of numpy arrays; each is a sequence of
time values. time values.
all_flux: Numpy array or sequence of numpy arrays; each is a sequence of all_flux: Numpy array or sequence of numpy arrays; each is a sequence of
flux values of the corresponding time array. flux values of the corresponding time array.
gap_width: Minimum gap size (in time units) for a split. gap_width: Minimum gap size (in time units) for a split.
Returns: Returns:
...@@ -90,9 +91,9 @@ def remove_events(all_time, ...@@ -90,9 +91,9 @@ def remove_events(all_time,
Args: Args:
all_time: Numpy array or sequence of numpy arrays; each is a sequence of all_time: Numpy array or sequence of numpy arrays; each is a sequence of
time values. time values.
all_flux: Numpy array or sequence of numpy arrays; each is a sequence of all_flux: Numpy array or sequence of numpy arrays; each is a sequence of
flux values of the corresponding time array. flux values of the corresponding time array.
events: List of Event objects to remove. events: List of Event objects to remove.
width_factor: Fractional multiplier of the duration of each event to remove. width_factor: Fractional multiplier of the duration of each event to remove.
include_empty_segments: Whether to include empty segments in the output. include_empty_segments: Whether to include empty segments in the output.
...@@ -130,23 +131,63 @@ def remove_events(all_time, ...@@ -130,23 +131,63 @@ def remove_events(all_time,
return output_time, output_flux return output_time, output_flux
def interpolate_missing_time(time, cadence_no=None, fill_value="extrapolate"):
"""Interpolates missing (NaN or Inf) time values.
Args:
time: A numpy array of monotonically increasing values, with missing values
denoted by NaN or Inf.
cadence_no: Optional numpy array of cadence numbers corresponding to the
time values. If not provided, missing time values are assumed to be evenly
spaced between present time values.
fill_value: Specifies how missing time values should be treated at the
beginning and end of the array. See scipy.interpolate.interp1d.
Returns:
A numpy array of the same length as the input time array, with NaN/Inf
values replaced with interpolated values.
Raises:
ValueError: If fewer than 2 values of time are finite.
"""
if cadence_no is None:
cadence_no = np.arange(len(time))
is_finite = np.isfinite(time)
num_finite = np.sum(is_finite)
if num_finite < 2:
raise ValueError(
"Cannot interpolate time with fewer than 2 finite values. Got "
"len(time) = {} with {} finite values.".format(len(time), num_finite))
interpolate_fn = scipy.interpolate.interp1d(
cadence_no[is_finite],
time[is_finite],
copy=False,
bounds_error=False,
fill_value=fill_value,
assume_sorted=True)
return interpolate_fn(cadence_no)
def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline): def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline):
"""Linearly interpolates spline values across masked points. """Linearly interpolates spline values across masked points.
Args: Args:
all_time: List of numpy arrays; each is a sequence of time values. all_time: List of numpy arrays; each is a sequence of time values.
all_masked_time: List of numpy arrays; each is a sequence of time values all_masked_time: List of numpy arrays; each is a sequence of time values
with some values missing (masked). with some values missing (masked).
all_masked_spline: List of numpy arrays; the masked spline values all_masked_spline: List of numpy arrays; the masked spline values
corresponding to all_masked_time. corresponding to all_masked_time.
Returns: Returns:
interp_spline: List of numpy arrays; each is the masked spline with missing interp_spline: List of numpy arrays; each is the masked spline with missing
points linearly interpolated. points linearly interpolated.
""" """
interp_spline = [] interp_spline = []
for time, masked_time, masked_spline in zip( for time, masked_time, masked_spline in zip(all_time, all_masked_time,
all_time, all_masked_time, all_masked_spline): all_masked_spline):
if masked_time.size: if masked_time.size:
interp_spline.append(np.interp(time, masked_time, masked_spline)) interp_spline.append(np.interp(time, masked_time, masked_spline))
else: else:
...@@ -154,6 +195,76 @@ def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline): ...@@ -154,6 +195,76 @@ def interpolate_masked_spline(all_time, all_masked_time, all_masked_spline):
return interp_spline return interp_spline
def reshard_arrays(xs, ys):
"""Reshards arrays in xs to match the lengths of arrays in ys.
Args:
xs: List of 1d numpy arrays with the same total length as ys.
ys: List of 1d numpy arrays with the same total length as xs.
Returns:
A list of numpy arrays containing the same elements as xs, in the same
order, but with array lengths matching the pairwise array in ys.
Raises:
ValueError: If xs and ys do not have the same total length.
"""
# Compute indices of boundaries between segments of ys, plus the end boundary.
boundaries = np.cumsum([len(y) for y in ys])
concat_x = np.concatenate(xs)
if len(concat_x) != boundaries[-1]:
raise ValueError(
"xs and ys do not have the same total length ({} vs. {}).".format(
len(concat_x), boundaries[-1]))
boundaries = boundaries[:-1] # Remove exclusive end boundary.
return np.split(concat_x, boundaries)
def uniform_cadence_light_curve(cadence_no, time, flux):
"""Combines data into a single light curve with uniform cadence numbers.
Args:
cadence_no: numpy array; the cadence numbers of the light curve.
time: numpy array; the time values of the light curve.
flux: numpy array; the flux values of the light curve.
Returns:
cadence_no: numpy array; the cadence numbers of the light curve with no
gaps. It starts and ends at the minimum and maximum cadence numbers in the
input light curve, respectively.
time: numpy array; the time values of the light curve. Missing data points
have value zero and correspond to a False value in the mask.
flux: numpy array; the time values of the light curve. Missing data points
have value zero and correspond to a False value in the mask.
mask: Boolean numpy array; False indicates missing data points, where
missing data points are those that have no corresponding cadence number in
the input or those where at least one of the cadence number, time value,
or flux value is NaN/Inf.
Raises:
ValueError: If there are duplicate cadence numbers in the input.
"""
min_cadence_no = np.min(cadence_no)
max_cadence_no = np.max(cadence_no)
out_cadence_no = np.arange(
min_cadence_no, max_cadence_no + 1, dtype=cadence_no.dtype)
out_time = np.zeros_like(out_cadence_no, dtype=time.dtype)
out_flux = np.zeros_like(out_cadence_no, dtype=flux.dtype)
out_mask = np.zeros_like(out_cadence_no, dtype=np.bool)
for c, t, f in zip(cadence_no, time, flux):
if np.isfinite(c) and np.isfinite(t) and np.isfinite(f):
i = int(c - min_cadence_no)
if out_mask[i]:
raise ValueError("Duplicate cadence number: {}".format(c))
out_time[i] = t
out_flux[i] = f
out_mask[i] = True
return out_cadence_no, out_time, out_flux, out_mask
def count_transit_points(time, event): def count_transit_points(time, event):
"""Computes the number of points in each transit of a given event. """Computes the number of points in each transit of a given event.
...@@ -174,8 +285,8 @@ def count_transit_points(time, event): ...@@ -174,8 +285,8 @@ def count_transit_points(time, event):
# Tiny periods or erroneous time values could make this loop take forever. # Tiny periods or erroneous time values could make this loop take forever.
if (t_max - t_min) / event.period > 10**6: if (t_max - t_min) / event.period > 10**6:
raise ValueError( raise ValueError(
"Too many transits! Time range is [%.2f, %.2f] and period is %.2e." % "Too many transits! Time range is [{:.4f}, {:.4f}] and period is "
(t_min, t_max, event.period)) "{:.4e}.".format(t_min, t_max, event.period))
# Make sure t0 is in [t_min, t_min + period). # Make sure t0 is in [t_min, t_min + period).
t0 = np.mod(event.t0 - t_min, event.period) + t_min t0 = np.mod(event.t0 - t_min, event.period) + t_min
......
...@@ -176,6 +176,61 @@ class LightCurveUtilTest(absltest.TestCase): ...@@ -176,6 +176,61 @@ class LightCurveUtilTest(absltest.TestCase):
self.assertSequenceAlmostEqual([16, 17, 18, 19], output_time[0]) self.assertSequenceAlmostEqual([16, 17, 18, 19], output_time[0])
self.assertSequenceAlmostEqual([160, 170, 180, 190], output_flux[0]) self.assertSequenceAlmostEqual([160, 170, 180, 190], output_flux[0])
def testInterpolateMissingTime(self):
# Fewer than 2 finite values.
with self.assertRaises(ValueError):
util.interpolate_missing_time(np.array([]))
with self.assertRaises(ValueError):
util.interpolate_missing_time(np.array([5.0]))
with self.assertRaises(ValueError):
util.interpolate_missing_time(np.array([5.0, np.nan]))
with self.assertRaises(ValueError):
util.interpolate_missing_time(np.array([np.nan, np.nan, np.nan]))
# Small time arrays.
self.assertSequenceAlmostEqual([0.5, 0.6],
util.interpolate_missing_time(
np.array([0.5, 0.6])))
self.assertSequenceAlmostEqual([0.5, 0.6, 0.7],
util.interpolate_missing_time(
np.array([0.5, np.nan, 0.7])))
# Time array of length 20 with some values NaN.
time = np.array([
np.nan, 0.5, 1.0, 1.5, 2.0, 2.5, np.nan, 3.5, 4.0, 4.5, 5.0, np.nan,
np.nan, np.nan, np.nan, 7.5, 8.0, 8.5, np.nan, np.nan
])
interp_time = util.interpolate_missing_time(time)
self.assertSequenceAlmostEqual([
0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
7.0, 7.5, 8.0, 8.5, 9.0, 9.5
], interp_time)
# Fill with 0.0 for missing values at the beginning and end.
interp_time = util.interpolate_missing_time(time, fill_value=0.0)
self.assertSequenceAlmostEqual([
0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
7.0, 7.5, 8.0, 8.5, 0.0, 0.0
], interp_time)
# Interpolate with cadences.
cadences = np.array([
100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
114, 115, 116, 117, 118, 119
])
interp_time = util.interpolate_missing_time(time, cadences)
self.assertSequenceAlmostEqual([
0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5,
7.0, 7.5, 8.0, 8.5, 9.0, 9.5
], interp_time)
# Interpolate with missing cadences.
time = np.array([0.6, 0.7, np.nan, np.nan, np.nan, 1.3, 1.4, 1.5])
cadences = np.array([106, 107, 108, 109, 110, 113, 114, 115])
interp_time = util.interpolate_missing_time(time, cadences)
self.assertSequenceAlmostEqual([0.6, 0.7, 0.8, 0.9, 1.0, 1.3, 1.4, 1.5],
interp_time)
def testInterpolateMaskedSpline(self): def testInterpolateMaskedSpline(self):
all_time = [ all_time = [
np.arange(0, 10, dtype=np.float), np.arange(0, 10, dtype=np.float),
...@@ -198,6 +253,53 @@ class LightCurveUtilTest(absltest.TestCase): ...@@ -198,6 +253,53 @@ class LightCurveUtilTest(absltest.TestCase):
[120, 122, 124, 126, 128, 130, 132, 132, 132, 132], interp_spline[1]) [120, 122, 124, 126, 128, 130, 132, 132, 132, 132], interp_spline[1])
self.assertTrue(np.all(np.isnan(interp_spline[2]))) self.assertTrue(np.all(np.isnan(interp_spline[2])))
def testReshardArrays(self):
xs = [
np.array([1, 2, 3]),
np.array([4]),
np.array([5, 6, 7, 8, 9]),
np.array([]),
]
ys = [
np.array([]),
np.array([10, 20]),
np.array([30, 40, 50, 60]),
np.array([70]),
np.array([80, 90]),
]
reshard_xs = util.reshard_arrays(xs, ys)
self.assertEqual(5, len(reshard_xs))
np.testing.assert_array_equal([], reshard_xs[0])
np.testing.assert_array_equal([1, 2], reshard_xs[1])
np.testing.assert_array_equal([3, 4, 5, 6], reshard_xs[2])
np.testing.assert_array_equal([7], reshard_xs[3])
np.testing.assert_array_equal([8, 9], reshard_xs[4])
with self.assertRaisesRegexp(ValueError,
"xs and ys do not have the same total length"):
util.reshard_arrays(xs, [np.array([10, 20, 30]), np.array([40, 50])])
def testUniformCadenceLightCurve(self):
input_cadence_no = np.array([13, 4, 5, 6, 8, 9, 11, 12])
input_time = np.array([130, 40, 50, 60, 80, 90, 110, 120])
input_flux = np.array([1300, 400, 500, 600, 800, np.nan, 1100, 1200])
cadence_no, time, flux, mask = util.uniform_cadence_light_curve(
input_cadence_no, input_time, input_flux)
np.testing.assert_array_equal([4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
cadence_no)
np.testing.assert_array_equal([40, 50, 60, 0, 80, 0, 0, 110, 120, 130],
time)
np.testing.assert_array_equal(
[400, 500, 600, 0, 800, 0, 0, 1100, 1200, 1300], flux)
np.testing.assert_array_equal([1, 1, 1, 0, 1, 0, 0, 1, 1, 1], mask)
# Add duplicate cadence number.
input_cadence_no = np.concatenate([input_cadence_no, np.array([13, 14])])
input_time = np.concatenate([input_time, np.array([130, 140])])
input_flux = np.concatenate([input_flux, np.array([1300, 1400])])
with self.assertRaisesRegexp(ValueError, "Duplicate cadence number"):
util.uniform_cadence_light_curve(input_cadence_no, input_time, input_flux)
def testCountTransitPoints(self): def testCountTransitPoints(self):
time = np.concatenate([ time = np.concatenate([
np.arange(0, 10, 0.1, dtype=np.float), np.arange(0, 10, 0.1, dtype=np.float),
......
...@@ -37,9 +37,9 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3): ...@@ -37,9 +37,9 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3):
flux: Numpy array; the flux (brightness) values of the light curve. flux: Numpy array; the flux (brightness) values of the light curve.
bkspace: Spline break point spacing in time units. bkspace: Spline break point spacing in time units.
maxiter: Maximum number of attempts to fit the spline after removing badly maxiter: Maximum number of attempts to fit the spline after removing badly
fit points. fit points.
outlier_cut: The maximum number of standard deviations from the median outlier_cut: The maximum number of standard deviations from the median
spline residual before a point is considered an outlier. spline residual before a point is considered an outlier.
Returns: Returns:
spline: The values of the fitted spline corresponding to the input time spline: The values of the fitted spline corresponding to the input time
...@@ -54,7 +54,8 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3): ...@@ -54,7 +54,8 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3):
""" """
if len(time) < 4: if len(time) < 4:
raise InsufficientPointsError( raise InsufficientPointsError(
"Cannot fit a spline on less than 4 points. Got %d points." % len(time)) "Cannot fit a spline on less than 4 points. Got {} points.".format(
len(time)))
# Rescale time into [0, 1]. # Rescale time into [0, 1].
t_min = np.min(time) t_min = np.min(time)
...@@ -91,7 +92,7 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3): ...@@ -91,7 +92,7 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3):
# and we consider this a fatal error. # and we consider this a fatal error.
raise InsufficientPointsError( raise InsufficientPointsError(
"Cannot fit a spline on less than 4 points. After removing " "Cannot fit a spline on less than 4 points. After removing "
"outliers, got %d points." % np.sum(mask)) "outliers, got {} points.".format(np.sum(mask)))
try: try:
with warnings.catch_warnings(): with warnings.catch_warnings():
...@@ -106,9 +107,9 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3): ...@@ -106,9 +107,9 @@ def kepler_spline(time, flux, bkspace=1.5, maxiter=5, outlier_cut=3):
spline = curve.value(time)[0] spline = curve.value(time)[0]
except (IndexError, TypeError) as e: except (IndexError, TypeError) as e:
raise SplineError( raise SplineError(
"Fitting spline failed with error: '%s'. This might be caused by the " "Fitting spline failed with error: '{}'. This might be caused by the "
"breakpoint spacing being too small, and/or there being insufficient " "breakpoint spacing being too small, and/or there being insufficient "
"points to fit the spline in one of the intervals." % e) "points to fit the spline in one of the intervals.".format(e))
return spline, mask return spline, mask
...@@ -118,15 +119,15 @@ class SplineMetadata(object): ...@@ -118,15 +119,15 @@ class SplineMetadata(object):
Attributes: Attributes:
light_curve_mask: List of boolean numpy arrays indicating which points in light_curve_mask: List of boolean numpy arrays indicating which points in
the light curve were used to fit the best-fit spline. the light curve were used to fit the best-fit spline.
bkspace: The break-point spacing used for the best-fit spline. bkspace: The break-point spacing used for the best-fit spline.
bad_bkspaces: List of break-point spacing values that failed. bad_bkspaces: List of break-point spacing values that failed.
likelihood_term: The likelihood term of the Bayesian Information Criterion; likelihood_term: The likelihood term of the Bayesian Information Criterion;
-2*ln(L), where L is the likelihood of the data given the model. -2*ln(L), where L is the likelihood of the data given the model.
penalty_term: The penalty term for the number of parameters in the penalty_term: The penalty term for the number of parameters in the Bayesian
Bayesian Information Criterion. Information Criterion.
bic: The value of the Bayesian Information Criterion; equal to bic: The value of the Bayesian Information Criterion; equal to
likelihood_term + penalty_coeff * penalty_term. likelihood_term + penalty_coeff * penalty_term.
""" """
def __init__(self): def __init__(self):
...@@ -162,14 +163,13 @@ def choose_kepler_spline(all_time, ...@@ -162,14 +163,13 @@ def choose_kepler_spline(all_time,
all_flux: List of 1D numpy arrays; the flux values of the light curve. all_flux: List of 1D numpy arrays; the flux values of the light curve.
bkspaces: List of break-point spacings to try. bkspaces: List of break-point spacings to try.
maxiter: Maximum number of attempts to fit each spline after removing badly maxiter: Maximum number of attempts to fit each spline after removing badly
fit points. fit points.
penalty_coeff: Coefficient of the penalty term for using more parameters in penalty_coeff: Coefficient of the penalty term for using more parameters in
the Bayesian Information Criterion. Decreasing this value will allow the Bayesian Information Criterion. Decreasing this value will allow more
more parameters to be used (i.e. smaller break-point spacing), and parameters to be used (i.e. smaller break-point spacing), and vice-versa.
vice-versa.
verbose: Whether to log individual spline errors. Note that if bkspaces verbose: Whether to log individual spline errors. Note that if bkspaces
contains many values (particularly small ones) then this may cause contains many values (particularly small ones) then this may cause logging
logging pollution if calling this function for many light curves. pollution if calling this function for many light curves.
Returns: Returns:
spline: List of numpy arrays; values of the best-fit spline corresponding to spline: List of numpy arrays; values of the best-fit spline corresponding to
...@@ -227,7 +227,7 @@ def choose_kepler_spline(all_time, ...@@ -227,7 +227,7 @@ def choose_kepler_spline(all_time,
# It's expected to get a SplineError occasionally for small values of # It's expected to get a SplineError occasionally for small values of
# bkspace. Skip this bkspace. # bkspace. Skip this bkspace.
if verbose: if verbose:
warnings.warn("Bad bkspace %.4f: %s" % (bkspace, e)) warnings.warn("Bad bkspace {}: {}".format(bkspace, e))
metadata.bad_bkspaces.append(bkspace) metadata.bad_bkspaces.append(bkspace)
bad_bkspace = True bad_bkspace = True
break break
...@@ -294,14 +294,13 @@ def fit_kepler_spline(all_time, ...@@ -294,14 +294,13 @@ def fit_kepler_spline(all_time,
bkspace_max: Maximum breakpoint spacing to try. bkspace_max: Maximum breakpoint spacing to try.
bkspace_num: Number of breakpoint spacings to try. bkspace_num: Number of breakpoint spacings to try.
maxiter: Maximum number of attempts to fit each spline after removing badly maxiter: Maximum number of attempts to fit each spline after removing badly
fit points. fit points.
penalty_coeff: Coefficient of the penalty term for using more parameters in penalty_coeff: Coefficient of the penalty term for using more parameters in
the Bayesian Information Criterion. Decreasing this value will allow the Bayesian Information Criterion. Decreasing this value will allow more
more parameters to be used (i.e. smaller break-point spacing), and parameters to be used (i.e. smaller break-point spacing), and vice-versa.
vice-versa.
verbose: Whether to log individual spline errors. Note that if bkspaces verbose: Whether to log individual spline errors. Note that if bkspaces
contains many values (particularly small ones) then this may cause contains many values (particularly small ones) then this may cause logging
logging pollution if calling this function for many light curves. pollution if calling this function for many light curves.
Returns: Returns:
spline: List of numpy arrays; values of the best-fit spline corresponding to spline: List of numpy arrays; values of the best-fit spline corresponding to
......
...@@ -17,7 +17,7 @@ def robust_mean(y, cut): ...@@ -17,7 +17,7 @@ def robust_mean(y, cut):
Args: Args:
y: 1D numpy array. Assumed to be normally distributed with outliers. y: 1D numpy array. Assumed to be normally distributed with outliers.
cut: Points more than this number of standard deviations from the median are cut: Points more than this number of standard deviations from the median are
ignored. ignored.
Returns: Returns:
mean: A robust estimate of the mean of y. mean: A robust estimate of the mean of y.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment