# Source code for schrodinger.application.matsci.hdfutils
"""
Contains functionality related to hdf files
Copyright Schrodinger, LLC. All rights reserved.
"""
import contextlib
import pickle
import pandas
from schrodinger.utils import mmutil
# File extensions for the two supported storage backends: pickle is used as a
# proxy format when the HDF5 feature flag is off (see get_filename below).
PICKLE_EXT = '.pkl'
HDF_EXT = '.h5'
class HDFIO:
    """
    Class describing HDF5 input and output methods
    """

    def readHDF(self, filename, key):
        """
        Read a single pandas data frame stored under ``key`` in an HDF file.

        :param filename: The filename
        :type filename: str

        :param key: The key to read
        :type key: str

        :rtype: pandas.core.frame.DataFrame
        :return: Pandas data frame stored in the passed key
        """
        return pandas.read_hdf(filename, key=key)

    def getKeys(self, filename):
        """
        Get the keys in an HDF file.

        :param filename: The filename
        :type filename: str

        :rtype: list
        :return: list of keys in the file
        """
        # Open read-only so listing keys cannot modify the store
        with pandas.HDFStore(filename, mode='r') as store:
            return list(store.keys())

    def setData(self, filename, data):
        """
        Write a dictionary of pandas data frames to an HDF file.

        :param filename: The filename
        :type filename: str

        :param data: The dictionary where the key is the key for the values of
            pandas dataframe
        :type data: dict
        """
        # Mode 'w' truncates any existing file before writing
        with pandas.HDFStore(filename, mode='w') as store:
            for name, frame in data.items():
                store.put(key=name, value=frame)
class HDFProxyPickleIO:
    """
    Class describing pickle input and output methods which act as a proxy for
    HDF5 file
    """

    def readHDF(self, filename, key):
        """
        Read the value stored under ``key`` in a pickle file.

        :param filename: The filename
        :type filename: str

        :param key: The key to read
        :type key: str

        :rtype: pandas.core.frame.DataFrame
        :return: Pandas data frame stored in the passed key
        """
        with open(filename, 'rb') as handle:
            contents = pickle.load(handle)
        return contents[key]

    def getKeys(self, filename):
        """
        Get the keys of data stored in the pickle file.

        :param filename: The filename
        :type filename: str

        :rtype: list
        :return: list of keys in the file
        """
        with open(filename, 'rb') as handle:
            contents = pickle.load(handle)
        # Iterating the stored dict yields its keys
        return list(contents)

    def setData(self, filename, data):
        """
        Write the whole dictionary of pandas data frames to a pickle file.

        :param filename: The filename
        :type filename: str

        :param data: The dictionary where the key is the key for the values of
            pandas dataframe
        :type data: dict
        """
        with open(filename, 'wb') as handle:
            pickle.dump(data, handle)
def generate_hierarchial_key(*args):
    """
    Generate a hierarchical key for a hdf file.

    :type args: list
    :param args: the list of strings which will form the key

    :rtype: str
    :return: the hierarchical key
    """
    # '/' is the separator because HDF stores organize data in a
    # filesystem-like hierarchy keyed by slash-delimited paths
    separator = '/'
    return separator.join(args)
def get_filename(basename):
    """
    Get the filename with the proper extension.

    :param basename: The basename
    :type basename: str

    :returns: The filename
    :rtype: str
    """
    # The HDF5 format is only used when its feature flag is switched on;
    # otherwise fall back to the pickle proxy format
    use_hdf = mmutil.feature_flag_is_enabled(mmutil.SAVE_HDF5)
    return basename + (HDF_EXT if use_hdf else PICKLE_EXT)
def get_hdf_io(filename):
    """
    Get the i/o class associated with the extension of the filename.

    :param filename: The filename
    :type filename: str

    :returns: The i/o class
    :rtype: HDFIO or HDFProxyPickleIO

    :raise ValueError: If the filename has an unrecognized extension
    """
    if filename.endswith(HDF_EXT):
        return HDFIO()
    if filename.endswith(PICKLE_EXT):
        return HDFProxyPickleIO()
    # Previously an unrecognized extension fell through and returned None,
    # producing a confusing AttributeError at the call site; fail fast with a
    # descriptive error instead.
    raise ValueError(f'Unknown extension for "{filename}": expected '
                     f'"{HDF_EXT}" or "{PICKLE_EXT}"')
def get_hdf_keys(filename):
    """
    Get the keys of data stored in the file.

    :param filename: The filename
    :type filename: str

    :rtype: list
    :return: list of keys in the file
    """
    # Delegate to whichever backend matches the file's extension
    return get_hdf_io(filename).getKeys(filename)
def read_hdf(filename, key):
    """
    Read the data in the file associated with the key.

    :param filename: The filename
    :type filename: str

    :param key: The key to read
    :type key: str

    :rtype: pandas.core.frame.DataFrame
    :return: Pandas data frame stored in the passed key
    """
    # Delegate to whichever backend matches the file's extension
    return get_hdf_io(filename).readHDF(filename, key)
@contextlib.contextmanager
def HDFStore(filename, mode='w'):
    """
    Context manager for hdf/hdf proxy file.

    Yields an empty dict; entries the caller adds to it are written to
    ``filename`` when the context exits without an exception.

    :param filename: The filename
    :type filename: str

    :param mode: The mode for opening the file. Only writing is supported.
    :type mode: str

    :raise RuntimeError: If mode is anything other than 'w'
    """
    if mode != 'w':
        # A bare RuntimeError gave no hint about what went wrong; keep the
        # exception type (callers may catch it) but add a message.
        raise RuntimeError(
            f'Unsupported mode "{mode}": only "w" (write) is supported')
    data = {}
    yield data
    # Persist only on a clean exit: an exception raised in the with-body
    # propagates before the data is written.
    hdf_io = get_hdf_io(filename)
    hdf_io.setData(filename, data)