Source code for schrodinger.application.matsci.hdfutils

"""
Contains functionality related to hdf files

Copyright Schrodinger, LLC. All rights reserved.
"""

import contextlib
import pickle

import pandas

from schrodinger.utils import mmutil

PICKLE_EXT = '.pkl'
HDF_EXT = '.h5'


class HDFIO:
    """
    Class describing HDF5 input and output methods
    """

    def readHDF(self, filename, key):
        """
        Reads an HDF file

        :param filename: The filename
        :type filename: str

        :param key: The key to read
        :type key: str

        :rtype: pandas.core.frame.DataFrame
        :return: Pandas data frame stored in the passed key
        """
        return pandas.read_hdf(filename, key=key)

    def getKeys(self, filename):
        """
        Gets the keys in the HDF file

        :param filename: The filename
        :type filename: str

        :rtype: list
        :return: list of keys in the file
        """
        with pandas.HDFStore(filename, mode='r') as data_h5:
            return list(data_h5.keys())

    def setData(self, filename, data):
        """
        Puts the pandas dataframes in the HDF file.

        :param filename: The filename
        :type filename: str

        :param data: Dictionary where keys are the HDF keys and values are
            pandas dataframes
        :type data: dict
        """
        with pandas.HDFStore(filename, mode='w') as data_h5:
            for key, value in data.items():
                data_h5.put(key=key, value=value)

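# Example (illustrative sketch) of using HDFIO directly; the filename
# 'example.h5' and the dataframe are hypothetical:
#
#     frame = pandas.DataFrame({'temperature': [300.0, 350.0]})
#     HDFIO().setData('example.h5', {'thermo/temperature': frame})
#     HDFIO().readHDF('example.h5', 'thermo/temperature')  # returns the frame
#     HDFIO().getKeys('example.h5')  # lists the stored keys
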
class HDFProxyPickleIO:
    """
    Class describing pickle input and output methods which act as a proxy
    for an HDF5 file
    """

    def readHDF(self, filename, key):
        """
        Reads a pickle file

        :param filename: The filename
        :type filename: str

        :param key: The key to read
        :type key: str

        :rtype: pandas.core.frame.DataFrame
        :return: Pandas data frame stored in the passed key
        """
        with open(filename, 'rb') as pkl_file:
            return pickle.load(pkl_file)[key]

    def getKeys(self, filename):
        """
        Gets the keys of data stored in the pickle file

        :param filename: The filename
        :type filename: str

        :rtype: list
        :return: list of keys in the file
        """
        with open(filename, 'rb') as pkl_file:
            return list(pickle.load(pkl_file).keys())

    def setData(self, filename, data):
        """
        Puts the pandas dataframes in the pickle file.

        :param filename: The filename
        :type filename: str

        :param data: Dictionary where keys are the HDF keys and values are
            pandas dataframes
        :type data: dict
        """
        with open(filename, 'wb') as pkl_file:
            pickle.dump(data, pkl_file)

def generate_hierarchial_key(*args):
    """
    Generates a hierarchical key for an HDF file

    :type args: list
    :param args: the list of strings which will form the key

    :rtype: str
    :return: the hierarchical key
    """
    # We use '/' to join the parts to maintain the hierarchical data
    # format of the HDF file
    return '/'.join(args)

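# For example (illustrative), generate_hierarchial_key('stage1', 'energies')
# returns 'stage1/energies'; pandas.HDFStore treats the '/'-separated key as
# a nested group path inside the HDF file.
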
def get_filename(basename):
    """
    Gets the filename with the proper extension

    :param basename: The basename
    :type basename: str

    :returns: The filename
    :rtype: str
    """
    if mmutil.feature_flag_is_enabled(mmutil.SAVE_HDF5):
        ext = HDF_EXT
    else:
        ext = PICKLE_EXT
    return f'{basename}{ext}'

def get_hdf_io(filename):
    """
    Gets the i/o class associated with the extension of the filename

    :param filename: The filename
    :type filename: str

    :returns: The i/o class
    :rtype: HDFIO or HDFProxyPickleIO
    """
    if filename.endswith(HDF_EXT):
        return HDFIO()
    elif filename.endswith(PICKLE_EXT):
        return HDFProxyPickleIO()

def get_hdf_keys(filename):
    """
    Gets the keys of data stored in the file

    :param filename: The filename
    :type filename: str

    :rtype: list
    :return: list of keys in the file
    """
    hdf_io = get_hdf_io(filename)
    return hdf_io.getKeys(filename)

def read_hdf(filename, key):
    """
    Reads the data in the file associated with the key

    :param filename: The filename
    :type filename: str

    :param key: The key to read
    :type key: str

    :rtype: pandas.core.frame.DataFrame
    :return: Pandas data frame stored in the passed key
    """
    hdf_io = get_hdf_io(filename)
    return hdf_io.readHDF(filename, key)

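# Example (illustrative sketch) of the module-level read path; the filename
# 'results.h5' is hypothetical:
#
#     for key in get_hdf_keys('results.h5'):
#         frame = read_hdf('results.h5', key)
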
@contextlib.contextmanager
def HDFStore(filename, mode='w'):
    """
    Context manager for writing an HDF file or its pickle proxy.

    :param filename: The filename
    :type filename: str

    :param mode: The mode for opening the file. Only writing is supported.
    :type mode: str
    """
    if mode != 'w':
        raise RuntimeError('Only write mode ("w") is supported.')
    data = {}
    yield data
    hdf_io = get_hdf_io(filename)
    hdf_io.setData(filename, data)
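
# Example (illustrative sketch) of the write path through the HDFStore
# context manager; the basename 'results' and the dataframe are hypothetical.
# get_filename() picks the '.h5' or '.pkl' extension depending on the
# SAVE_HDF5 feature flag, and get_hdf_io() selects the matching i/o class
# when the context manager exits.
#
#     filename = get_filename('results')
#     with HDFStore(filename) as data:
#         key = generate_hierarchial_key('stage1', 'energies')
#         data[key] = pandas.DataFrame({'energy': [-1.0, -2.0]})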