Source code for meld.vault

import contextlib
import os
import time
import cPickle as pickle
import netCDF4 as cdf
import numpy as np
import shutil
from meld.system import state


[docs]class DataStore(object):
    """
    Class to handle storing data from MELD runs.

    :param n_atoms: number of atoms
    :param n_replicas: number of replicas
    :param block_size: size of netCDF blocks and frequency to do backups

    Data will be stored in the 'Data' subdirectory. Backups will be stored
    in 'Data/Backup'.

    Some information is stored as python pickled files:

    - data_store.dat -- the DataStore object
    - communicator.dat -- the MPICommunicator object
    - remd_runner.dat -- the MasterReplicaExchangeRunner object

    Other data (positions, velocities, etc.) is stored in the results.nc file.
    """
    #
    # data paths
    #
    data_dir = 'Data'
    backup_dir = os.path.join(data_dir, 'Backup')
    blocks_dir = os.path.join(data_dir, 'Blocks')

    data_store_filename = 'data_store.dat'
    data_store_path = os.path.join(data_dir, data_store_filename)
    data_store_backup_path = os.path.join(backup_dir, data_store_filename)

    communicator_filename = 'communicator.dat'
    communicator_path = os.path.join(data_dir, communicator_filename)
    communicator_backup_path = os.path.join(backup_dir, communicator_filename)

    remd_runner_filename = 'remd_runner.dat'
    remd_runner_path = os.path.join(data_dir, remd_runner_filename)
    remd_runner_backup_path = os.path.join(backup_dir, remd_runner_filename)

    system_filename = 'system.dat'
    system_path = os.path.join(data_dir, system_filename)
    system_backup_path = os.path.join(backup_dir, system_filename)

    run_options_filename = 'run_options.dat'
    run_options_path = os.path.join(data_dir, run_options_filename)
    run_options_backup_path = os.path.join(backup_dir, run_options_filename)

    net_cdf_filename_template = 'block_{:06d}.nc'
    net_cdf_path_template = os.path.join(blocks_dir, net_cdf_filename_template)

    traj_filename = 'trajectory.pdb'
    traj_path = os.path.join(data_dir, traj_filename)
    traj_backup_path = os.path.join(backup_dir, traj_filename)

    def __init__(self, n_atoms, n_replicas, pdb_writer, block_size=100):
        self._n_atoms = n_atoms
        self._n_replicas = n_replicas
        self._block_size = block_size
        self._cdf_data_set = None
        self._readonly_mode = False
        self._pdb_writer = pdb_writer
        self._current_stage = None
        self._current_block = None
        self._max_safe_block = -1

    def __getstate__(self):
        # don't save some fields to disk
        excluded = ['_cdf_data_set']
        return dict((k, v) for (k, v) in self.__dict__.iteritems()
                    if k not in excluded)

    def __setstate__(self, state):
        # set _cdf_data_set to None
        self.__dict__ = state
        self._cdf_data_set = None

    def __del__(self):
        # close the _cdf_data_set when we go out of scope
        if self._cdf_data_set:
            self._cdf_data_set.close()

    #
    # properties
    #
    @property
    def n_replicas(self):
        return self._n_replicas

    @property
    def n_atoms(self):
        return self._n_atoms

    #
    # public methods
    #
[docs]    def initialize(self, mode):
        """
        Prepare to use the DataStore object.

        :param mode: mode to open in.

        Available modes are:

        - 'w' -- create a new directory structure and initialize the netCDF file
        - 'a' -- append to the existing files
        - 'r' -- open the file in read-only mode
        """
        if mode == 'w':
            if os.path.exists(self.data_dir):
                raise RuntimeError('Data directory already exists')
            os.mkdir(self.data_dir)
            os.mkdir(self.blocks_dir)
            os.mkdir(self.backup_dir)
            self._current_block = 0
            self._current_stage = 0
            self._create_cdf_file()
        elif mode == 'a':
            block_path = self.net_cdf_path_template.format(self._current_block)
            if os.path.exists(block_path):
                self._cdf_data_set = cdf.Dataset(block_path, 'a')
            else:
                self._create_cdf_file()
        elif mode == 'r':
            self._current_block = 0
            self._readonly_mode = True
            self._load_cdf_file_readonly()
        else:
            raise RuntimeError('Unknown value for mode={}'.format(mode))
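    # Example (a sketch, not part of meld.vault): setting up a fresh store.
    # `writer` stands in for whatever PDB-writing helper the system provides;
    # the atom and replica counts below are illustrative.
    #
    #     store = DataStore(n_atoms=500, n_replicas=8, pdb_writer=writer,
    #                       block_size=100)
    #     store.initialize(mode='w')   # creates Data/, Data/Blocks, Data/Backup
    #     store.save_data_store()      # pickle the store itself for later runs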
[docs]    def close(self):
        """Close the DataStore"""
        if self._cdf_data_set:
            self._cdf_data_set.close()
            self._cdf_data_set = None
[docs]    def save_data_store(self):
        """Save this object to disk."""
        with open(self.data_store_path, 'w') as store_file:
            pickle.dump(self, store_file)
    @classmethod
[docs]    def load_data_store(cls, load_backup=False):
        """Load the DataStore object from disk."""
        path = cls.data_store_backup_path if load_backup else cls.data_store_path
        with open(path) as store_file:
            return pickle.load(store_file)
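    # Example (a sketch): the usual pattern for analysis scripts is to
    # unpickle the store and then open it read-only.  In read-only mode the
    # other pickled objects are loaded from their Data/Backup copies, so
    # only safely backed-up data is visible.
    #
    #     store = DataStore.load_data_store()
    #     store.initialize(mode='r')
    #     print store.n_replicas, store.n_atoms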
[docs]    def save_communicator(self, comm):
        """Save the communicator to disk"""
        self._can_save()
        with open(self.communicator_path, 'w') as comm_file:
            pickle.dump(comm, comm_file)
[docs]    def load_communicator(self):
        """Load the communicator from disk"""
        if self._readonly_mode:
            path = self.communicator_backup_path
        else:
            path = self.communicator_path
        with open(path) as comm_file:
            return pickle.load(comm_file)
[docs]    def save_positions(self, positions, stage):
        """
        Save the positions to disk.

        :param positions: n_replicas x n_atoms x 3 array
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['positions'][..., stage] = positions
[docs]    def load_positions(self, stage):
        """
        Load positions from disk.

        :param stage: int stage to load
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['positions'][..., stage]
[docs]    def load_positions_random_access(self, stage):
        """
        Load positions from disk.

        :param stage: int stage to load

        This differs from :meth:`load_positions` in that you can load
        positions from any stage, while :meth:`load_positions` can only
        move forward in time. However, this comes at a performance penalty.
        """
        # get the block for this stage
        block = self._block_for_stage(stage)

        # if it's the current block, then just return the positions
        if block == self._current_block:
            return self._cdf_data_set.variables['positions'][..., stage]
        # otherwise open the file, grab the positions, and then close it
        else:
            path = self.net_cdf_path_template.format(block)
            with contextlib.closing(cdf.Dataset(path, 'r')) as dataset:
                return dataset.variables['positions'][..., stage]
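    # Example (a sketch; the stage numbers are illustrative and assume those
    # blocks have already been written): random access is convenient for
    # spot checks, while the sequential loaders are preferred in loops.
    #
    #     pos_start = store.load_positions_random_access(0)
    #     pos_later = store.load_positions_random_access(500)
    #     # each is an n_replicas x n_atoms x 3 array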
[docs]    def load_all_positions(self):
        """
        Load all positions from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_positions(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
[docs]    def iterate_positions(self, start=None, end=None):
        """
        Iterate over the positions from disk.

        :param start: first stage to return (default 0)
        :param end: stage to stop at (default max_safe_frame())
        """
        if start is None:
            start = 0
        if end is None:
            end = self.max_safe_frame()
        for i in range(start, end):
            yield self.load_positions(i)
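    # Example (a sketch): iterating keeps only one stage in memory at a
    # time, unlike load_all_positions.  Here we accumulate a running mean
    # of the replica-0 positions; `mean_pos` is illustrative.
    #
    #     mean_pos = None
    #     for n, pos in enumerate(store.iterate_positions()):
    #         mean_pos = pos[0] if mean_pos is None else mean_pos + pos[0]
    #     mean_pos /= (n + 1)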
[docs]    def save_velocities(self, velocities, stage):
        """
        Save velocities to disk.

        :param velocities: n_replicas x n_atoms x 3 array
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['velocities'][..., stage] = velocities
[docs]    def load_velocities(self, stage):
        """
        Load velocities from disk.

        :param stage: int stage to load
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['velocities'][..., stage]
[docs]    def load_all_velocities(self):
        """
        Load all velocities from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_velocities(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
[docs]    def save_states(self, states, stage):
        """
        Save states to disk.

        :param states: list of SystemState objects to store
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        positions = np.array([s.positions for s in states])
        velocities = np.array([s.velocities for s in states])
        alphas = np.array([s.alpha for s in states])
        energies = np.array([s.energy for s in states])
        self.save_positions(positions, stage)
        self.save_velocities(velocities, stage)
        self.save_alphas(alphas, stage)
        self.save_energies(energies, stage)
[docs]    def load_states(self, stage):
        """
        Load states from disk.

        :param stage: integer stage to load
        :return: list of SystemState objects
        """
        self._handle_load_stage(stage)
        positions = self.load_positions(stage)
        velocities = self.load_velocities(stage)
        alphas = self.load_alphas(stage)
        energies = self.load_energies(stage)
        states = []
        for i in range(self._n_replicas):
            s = state.SystemState(positions[i], velocities[i],
                                  alphas[i], energies[i])
            states.append(s)
        return states
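    # Example (a sketch): round-tripping states.  save_states decomposes
    # each SystemState into per-quantity arrays, and load_states rebuilds
    # the objects, so the attributes survive the round trip.
    #
    #     states = store.load_states(stage=0)
    #     print states[0].positions.shape   # (n_atoms, 3)
    #     print states[0].alpha, states[0].energy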
    def append_traj(self, state, stage):
        """Append a PDB frame for the given state to the trajectory file."""
        pdb_string = self._pdb_writer.get_pdb_string(state.positions, stage)
        with open(self.traj_path, 'a') as traj_file:
            traj_file.write(pdb_string)
[docs]    def save_alphas(self, alphas, stage):
        """
        Save alphas to disk.

        :param alphas: n_replicas array
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['alphas'][..., stage] = alphas
[docs]    def load_alphas(self, stage):
        """
        Load alphas from disk.

        :param stage: int stage to load from disk
        :return: n_replicas array
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['alphas'][..., stage]
[docs]    def load_all_alphas(self):
        """
        Load all alphas from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_alphas(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
[docs]    def save_energies(self, energies, stage):
        """
        Save energies to disk.

        :param energies: n_replicas array
        :param stage: int stage to save
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['energies'][..., stage] = energies
[docs]    def load_energies(self, stage):
        """
        Load energies from disk.

        :param stage: int stage to load
        :return: n_replicas array
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['energies'][..., stage]
[docs]    def load_all_energies(self):
        """
        Load all energies from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_energies(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
    def save_energy_matrix(self, energy_matrix, stage):
        """Save the n_replicas x n_replicas energy matrix for a stage."""
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['energy_matrix'][..., stage] = energy_matrix

    def load_energy_matrix(self, stage):
        """Load the n_replicas x n_replicas energy matrix for a stage."""
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['energy_matrix'][..., stage]

    def load_all_energy_matrices(self):
        """
        Load all energy matrices from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_energy_matrix(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
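    # Example (a sketch): the energy matrix stores cross-replica energies;
    # the exact indexing convention for element [i, j] is set by the
    # replica-exchange runner, not by this class.
    #
    #     emat = store.load_energy_matrix(stage=0)   # n_replicas x n_replicas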
[docs]    def save_permutation_vector(self, perm_vec, stage):
        """
        Save permutation vector to disk.

        :param perm_vec: n_replicas array of int
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['permutation_vectors'][..., stage] = perm_vec
[docs]    def load_permutation_vector(self, stage):
        """
        Load permutation vector from disk.

        :param stage: int stage to load
        :return: n_replicas array of int
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['permutation_vectors'][..., stage]
[docs]    def load_all_permutation_vectors(self):
        """
        Load all permutation vectors from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_permutation_vector(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
[docs]    def iterate_permutation_vectors(self, start=None, end=None):
        """
        Iterate over the permutation vectors from disk.

        :param start: first stage to return (default 0)
        :param end: stage to stop at (default max_safe_frame())
        """
        if start is None:
            start = 0
        if end is None:
            end = self.max_safe_frame()
        for i in range(start, end):
            yield self.load_permutation_vector(i)
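    # Example (a sketch): permutation vectors record how replicas were
    # shuffled at each exchange step, so they can be used to trace a
    # walker through replica space.  The indexing convention is defined
    # by the replica-exchange runner; this only shows the shape.
    #
    #     for perm in store.iterate_permutation_vectors():
    #         assert len(perm) == store.n_replicas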
[docs]    def save_acceptance_probabilities(self, accept_probs, stage):
        """
        Save acceptance probabilities vector to disk.

        :param accept_probs: n_replica_pairs array of float
        :param stage: int stage to store
        """
        self._can_save()
        self._handle_save_stage(stage)
        self._cdf_data_set.variables['acceptance_probabilities'][..., stage] = accept_probs
[docs]    def load_acceptance_probabilities(self, stage):
        """
        Load acceptance probability vector from disk.

        :param stage: int stage to load
        :return: n_replica_pairs array of float
        """
        self._handle_load_stage(stage)
        return self._cdf_data_set.variables['acceptance_probabilities'][..., stage]
[docs]    def load_all_acceptance_probabilities(self):
        """
        Load all acceptance probabilities from disk.

        Warning, this could use a lot of memory.
        """
        return np.concatenate([np.array(self.load_acceptance_probabilities(i))[..., np.newaxis]
                               for i in range(self.max_safe_frame())], axis=-1)
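    # Example (a sketch): average acceptance rate per neighboring pair
    # over all safe stages.  Rows index replica pairs, columns index stages.
    #
    #     probs = store.load_all_acceptance_probabilities()
    #     print probs.mean(axis=-1)   # one value per replica pair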
[docs]    def save_remd_runner(self, runner):
        """Save replica runner to disk"""
        self._can_save()
        with open(self.remd_runner_path, 'w') as runner_file:
            pickle.dump(runner, runner_file)
[docs]    def load_remd_runner(self):
        """Load replica runner from disk"""
        path = self.remd_runner_backup_path if self._readonly_mode else self.remd_runner_path
        with open(path) as runner_file:
            return pickle.load(runner_file)
    def save_system(self, system):
        """Save the system to disk."""
        self._can_save()
        with open(self.system_path, 'w') as system_file:
            pickle.dump(system, system_file)

    def load_system(self):
        """Load the system from disk."""
        path = self.system_backup_path if self._readonly_mode else self.system_path
        with open(path) as system_file:
            return pickle.load(system_file)

    def save_run_options(self, run_options):
        """Save the run options to disk."""
        self._can_save()
        with open(self.run_options_path, 'w') as options_file:
            pickle.dump(run_options, options_file)

    def load_run_options(self):
        """Load the run options from disk."""
        path = self.run_options_backup_path if self._readonly_mode else self.run_options_path
        with open(path) as options_file:
            return pickle.load(options_file)
[docs]    def backup(self, stage):
        """
        Backup all files to Data/Backup.

        :param stage: int stage

        Backup will occur if `stage % block_size == 0`
        """
        self._can_save()
        if not stage % self._block_size:
            self._backup(self.communicator_path, self.communicator_backup_path)
            self._backup(self.data_store_path, self.data_store_backup_path)
            self._backup(self.remd_runner_path, self.remd_runner_backup_path)
            self._backup(self.system_path, self.system_backup_path)
            self._backup(self.run_options_path, self.run_options_backup_path)
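    # Example (a sketch): a typical driver loop couples saving and backup;
    # `run_one_stage` is a hypothetical stand-in for the actual REMD step.
    #
    #     for stage in range(n_stages):
    #         states = run_one_stage(states)
    #         store.save_states(states, stage)
    #         store.backup(stage)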
    #
    # private methods
    #
    def _create_cdf_file(self):
        # create the file
        path = self.net_cdf_path_template.format(self._current_block)
        self._cdf_data_set = cdf.Dataset(path, 'w', format='NETCDF4')

        # setup dimensions
        self._cdf_data_set.createDimension('n_replicas', self._n_replicas)
        self._cdf_data_set.createDimension('n_replica_pairs', self._n_replicas - 1)
        self._cdf_data_set.createDimension('n_atoms', self._n_atoms)
        self._cdf_data_set.createDimension('cartesian', 3)
        self._cdf_data_set.createDimension('timesteps', None)

        # setup variables
        self._cdf_data_set.createVariable('positions', float,
                                          ['n_replicas', 'n_atoms', 'cartesian', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('velocities', float,
                                          ['n_replicas', 'n_atoms', 'cartesian', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('alphas', float,
                                          ['n_replicas', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('energies', float,
                                          ['n_replicas', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('permutation_vectors', int,
                                          ['n_replicas', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('energy_matrix', float,
                                          ['n_replicas', 'n_replicas', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)
        self._cdf_data_set.createVariable('acceptance_probabilities', float,
                                          ['n_replica_pairs', 'timesteps'],
                                          zlib=True, fletcher32=True,
                                          shuffle=True, complevel=9)

    def _backup(self, src, dest):
        if os.path.exists(src):
            try:
                shutil.copy(src, dest)
            except IOError:
                # if we encounter an error, wait five seconds and try again
                time.sleep(5)
                shutil.copy(src, dest)

    def _can_save(self):
        if self._readonly_mode:
            raise RuntimeError('Cannot save in read-only mode.')

    def _handle_save_stage(self, stage):
        if stage < self._current_stage:
            raise RuntimeError('Cannot go back in time')
        self._current_stage = stage
        block_index = self._block_for_stage(stage)
        if block_index > self._current_block:
            self.close()
            self._max_safe_block = self._current_block
            self._current_block = block_index
            self._create_cdf_file()

    def _handle_load_stage(self, stage):
        block_index = self._block_for_stage(stage)
        if self._readonly_mode:
            if block_index > self._max_safe_block:
                raise RuntimeError('Tried to read an unsafe block')
        else:
            if block_index < self._current_block:
                raise RuntimeError('Cannot go back in time')
        if block_index != self._current_block:
            self.close()
            self._current_block = block_index
            self._load_cdf_file_readonly()

    def _block_for_stage(self, stage):
        # integer division: stages [0, block_size) map to block 0, and so on
        return stage // self._block_size

    def _load_cdf_file_readonly(self):
        path = self.net_cdf_path_template.format(self._current_block)
        self._cdf_data_set = cdf.Dataset(path, 'r')

    def max_safe_frame(self):
        return (self._max_safe_block + 1) * self._block_size
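# Worked example of the block arithmetic (comments only, not part of the
# module): with the default block_size of 100, stages 0-99 live in
# Data/Blocks/block_000000.nc, stages 100-199 in block_000001.nc, and so
# on, since _block_for_stage(stage) == stage // 100.  A block only counts
# as safe once a later block has been started, so after writing stage 250
# the store has _current_block == 2 and _max_safe_block == 1, giving
# max_safe_frame() == (1 + 1) * 100 == 200: readers see stages 0-199.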