# h5py update

## HDF5 European Workshop for Science and Industry
## ESRF, Grenoble, 2019-09-18

# History, Particulars & Usage

 - Started in 2008 by Andrew Collette
   - Now maintained by community
 - https://github.com/h5py/h5py
 - https://h5py.readthedocs.io/en/stable/
 - 129th most downloaded package on PyPI (mostly CI machines)
 - used by keras / tensorflow

# Basic Philosophy

 - Provides a "pythonic" wrapping of `libhdf5`
   - less opinionated about use cases than `pytables`
   - less tuned than `pytables`
 
## Core Analogies

- `dict` <-> {`h5py.File`, `h5py.Group`}
  -  `g['key']` access to children (groups or datasets)
- `np.array` <-> `h5py.Dataset`
  - `Dataset` objects support the array protocol and slicing
  - only pulls data from disk on demand

# Write some data

In [2]:
!pip install h5py

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting h5py
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/6b/31/b5965f76e0bb2b02f273d87ec9cb59c77b9864ac27a0078c4229baa45dfc/h5py-3.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[K     |████████████████████████████████| 4.8 MB 3.0 MB/s eta 0:00:01
Installing collected packages: h5py
Successfully installed h5py-3.10.0


In [3]:
import h5py
import numpy as np

# Open example.h5 in 'w' mode; the context manager closes the file on exit.
with h5py.File('example.h5', 'w') as fout:
    # do the right thing in simple cases
    fout['data'] = [0, 1, 2, 3, 4]
    # a path-like key works too: the 'nested' group shows up in the file
    # alongside the two top-level datasets
    fout['nested/twoD'] = np.array([[1, 2], [3, 4]])
    # method provides access to all of the dataset creation knobs
    # (here: a 2x5 array stored with one chunk per row)
    fout.create_dataset('data_B', 
                        data=np.arange(10).reshape(2, 5),
                        chunks=(1, 5))

# Read some data

In [4]:
fin = h5py.File('example.h5', 'r')
# the File object
fin

<HDF5 file "example.h5" (mode r)>

In [5]:
# root group
fin['/']

<HDF5 group "/" (3 members)>

In [6]:
list(fin['/'])

['data', 'data_B', 'nested']

In [7]:
# a Dataset; no data has been read yet
fin['data']

<HDF5 dataset "data": shape (5,), type "<i8">

### numpy-style slicing on datasets

In [8]:
# pull data from disk to an array
fin['data'][:]

array([0, 1, 2, 3, 4])

In [7]:
# pull part of the dataset
fin['data'][1:3]

array([1, 2])

In [8]:
# handles numpy-style strided ND slicing
fin['data_B'][:, 1::2]

array([[1, 3],
       [6, 8]])

In [9]:
# fancy slicing
fin['data'][[0, 3, 4]]

array([0, 3, 4])

### Accessing Nested Groups/Datasets

In [10]:
# access nested groups / datasets via repeated []
fin['nested']['twoD']

<HDF5 dataset "twoD": shape (2, 2), type "<i8">

In [11]:
# Or use file-path like access
fin['nested/twoD']

<HDF5 dataset "twoD": shape (2, 2), type "<i8">

### Close the file

In [12]:
# if not using a context manager, remember to clean up!
fin.close()
fin

<Closed HDF5 file>

# New in `h5py` 2.8

 - register new file drivers
 - track object creation order
 - lots of bug fixes!

# New in `h5py` 2.9

 - high-level API for creating virtual datasets
 - passing in python "file-like" objects to `h5py.File`
 - control chunk cache when creating `h5py.File`
 - `create_dataset_like` method
 - track creation order of attributes
 - bug fixes!

## High level API for Virtual Datasets 


- Work started by Aaron Parsons at DLS
- continued by Thomas Caswell at NSLS-II
- finished by Thomas Kluyver at EuXFEL


The low-level API has been available since h5py 2.6

### Create some data

In [9]:
# create some sample data: a (4, 100) array whose row n holds n+1 .. n+100
data = np.arange(0, 100).reshape(1, 100) + np.arange(1, 5).reshape(4, 1)

# Create source files (0.h5 to 3.h5), each holding one row of `data`
for n in range(4):
    with h5py.File(f"{n}.h5", "w") as f:
        # Keyword arguments make the role of each value explicit; the
        # returned Dataset handle is not needed, so it is not bound.
        f.create_dataset("data", shape=(100,), dtype="i4", data=data[n])

### Create the Virtual Dataset

In [10]:
# Assemble virtual dataset: a (4, 100) layout with one row per source file
layout = h5py.VirtualLayout(shape=(4, 100), dtype="i4")
for n in range(4):
    # row n of the layout is backed by the "data" dataset inside {n}.h5
    layout[n] = h5py.VirtualSource(f"{n}.h5", "data", shape=(100,))

# Add virtual dataset to output file
with h5py.File("VDS.h5", "w", libver="latest") as f:
    # the virtual dataset
    # NOTE(review): fillvalue is presumably what readers see where a source
    # file is unavailable - confirm against the h5py VDS documentation
    f.create_virtual_dataset("data_A", layout, fillvalue=-5)
    # normal dataset with identical values
    f.create_dataset("data_B", data=data, dtype='i4')

### Read it back

In [11]:
# read data back
# virtual dataset is transparent for reader!
with h5py.File("VDS.h5", "r") as f:
    print(f"Virtual dataset: {f['data_A']}")
    print(f["data_A"][:, ::10])
    print(f"Normal dataset : {f['data_B']}")
    print(f["data_B"][:, ::10])

Virtual dataset: <HDF5 dataset "data_A": shape (4, 100), type "<i4">
[[ 1 11 21 31 41 51 61 71 81 91]
 [ 2 12 22 32 42 52 62 72 82 92]
 [ 3 13 23 33 43 53 63 73 83 93]
 [ 4 14 24 34 44 54 64 74 84 94]]
Normal dataset : <HDF5 dataset "data_B": shape (4, 100), type "<i4">
[[ 1 11 21 31 41 51 61 71 81 91]
 [ 2 12 22 32 42 52 62 72 82 92]
 [ 3 13 23 33 43 53 63 73 83 93]
 [ 4 14 24 34 44 54 64 74 84 94]]


## Pass Python file-like objects to `h5py.File`

 - contributed by Andrey Paramonov (Андрей Парамонов)
 - can pass in object returned by `open` or a `BytesIO` object

### Create a `BytesIO` object and write data to it

In [16]:
from io import BytesIO

# In-memory binary buffer that h5py will write the file contents into
obj = BytesIO()
with h5py.File(obj, 'w') as fout:
    # ten evenly spaced floats between 0 and 30
    fout['data'] = np.linspace(0, 30, 10)

### Read the data back

In [17]:
obj.seek(0)
print(f"the frist 5 bytse: {obj.read(5)}")

the frist 5 bytse: b'\x89HDF\r'


In [18]:
obj.seek(0)
with h5py.File(obj, 'r') as fin:
    print(fin['data'])

<HDF5 dataset "data": shape (10,), type "<f8">


### Write buffer to disk

In [19]:
obj.seek(0)
with open('test_out.h5', 'wb') as fout:
    fout.write(obj.getbuffer())

### Read back with hdf5 opening the file

In [20]:
with h5py.File('test_out.h5', 'r') as fin:
    print(fin['data'])

<HDF5 dataset "data": shape (10,), type "<f8">


### Use `open` to read the file

In [21]:
with open('test_out.h5', 'rb') as raw_file:
    with h5py.File(raw_file, 'r') as fin:
        print(fin['data'])

<HDF5 dataset "data": shape (10,), type "<f8">


## Better KeysView repr

In [22]:
with h5py.File('example.h5', 'r') as fin:
    print(fin.keys())
    

<KeysViewHDF5 ['data', 'data_B', 'nested']>


# New in h5py 2.10

- Better support for reading bit fields
- deprecate implicit file mode
- better tab-completion out-of-the-box in IPython
- add `Dataset.make_scale` helper
- improve handling of special data types
- expose `H5PL` functions
- expose `H5Dread_chunk` and `h5d.read_direct_chunk`

## Require file mode (so we can change the default next release)

- the current default mode is "open append, or create if needed"
- this is dangerous as users may accidentally mutate files they did not want to!
- does not match the behavior of `open`
- for backward compatibility, did not want to change the default in one step

In [23]:
with h5py.File('blahblah.h5') as fout:
    pass

  """Entry point for launching an IPython kernel.


In [24]:
# Temporarily set the mode used when h5py.File is called without one.
h5py.get_config().default_file_mode = 'r'
try:
    with h5py.File('blahblah.h5') as fout:
        pass
finally:
    # put it back to default just to be tidy - in a finally block so the
    # global setting is restored even when opening the file raises
    h5py.get_config().default_file_mode = None

## `make_scale` helper

In [25]:
with h5py.File("with_scale.h5", 'w') as fout:
    fout['data'] = range(10)
    fout['pos'] = np.arange(10) + 5
    # turn the 'pos' dataset into a dimension scale named "pos" ...
    fout['pos'].make_scale("pos")
    # ... and attach it to dimension 0 of 'data'
    fout['data'].dims[0].attach_scale(fout['pos'])

In [26]:
!h5dump --dataset=data with_scale.h5

HDF5 "with_scale.h5" {
DATASET "data" {
   DATATYPE  H5T_STD_I64LE
   DATASPACE  SIMPLE { ( 10 ) / ( 10 ) }
   DATA {
   (0): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
   }
   ATTRIBUTE "DIMENSION_LIST" {
      DATATYPE  H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
      DATASPACE  SIMPLE { ( 1 ) / ( 1 ) }
      DATA {
      (0): (DATASET 1400 /pos )
      }
   }
}
}


# 行情数据存储例子

以下部分由大富翁量化课程提供，获取行情的代码需要在课程中运行

In [13]:
from coursea import *
await init()

2024-01-01 18:09:59,853 I 2537 cfg4py.core:update_config:280 | configuration is
alpha: {data_home: ~/zillionare/alpha/data, tts_server: 'http://127.0.0.1:5002/api/tts?'}
backtest: {url: 'http://192.168.100.114:7080/backtest/api/trade/v0.5/'}
influxdb: {bucket_name: zillionare, enable_compress: true, max_query_size: 5000, org: zillionare,
  token: hwxHycJfp_t6bCOYe2MhEDW4QBOO4FDtgeBWnPR6bGZJGEZ_41m_OHtTJFZKyD2HsbVqkZM8rJNkMvjyoXCG6Q==,
  url: 'http://192.168.100.101:58086'}
notify: {dingtalk_access_token: 58df072143b52368086736cb38236753073ccde6537650cad1d5567747803563,
  keyword: trader}
pluto: {store: ~/zillionare/pluto/store}
redis: {dsn: 'redis://192.168.100.101:56379'}
tasks: {pooling: false, wr: false}

2024-01-01 18:09:59,856 I 2537 /usr/local/lib/python3.8/dist-packages/omicron/dal/cache.py:init:94 | init redis cache...
2024-01-01 18:09:59,866 I 2537 /usr/local/lib/python3.8/dist-packages/omicron/dal/cache.py:init:124 | redis cache is inited
2024-01-01 18:09:59,976 I 2537 omicro

init securities done


In [139]:
# Security codes used by the cells below
codes = ["000001.XSHE", "600000.XSHG"]

h5file = "/tmp/bars.h5"

# Opened in append mode and left open on purpose: later cells use `h5`.
h5 = h5py.File(h5file, "a")

# Create the "1m" group on first use only.
if "1m" not in h5:
    h5.create_group("/1m")

In [140]:
def convert_frame(bars):
    """Return a copy of ``bars`` with its first field cast to ``('frame', 'i8')``.

    h5 cannot handle ``np.datetime64``, so the leading field of the
    structured array is re-declared as a 64-bit integer; every other field
    keeps its original declaration.
    """
    fields = bars.dtype.descr
    fields[0] = ('frame', 'i8')

    return bars.astype(fields)
   
def append_ds(name: str, bars):
    """Append ``bars`` to the dataset ``name`` in the open file ``h5``.

    If the dataset does not exist yet it is created from ``bars`` as a
    chunked dataset that can grow without limit along its first axis.
    Otherwise it is enlarged by ``len(bars)`` rows and the new rows are
    filled with ``bars``.

    Args:
        name: path of the dataset inside ``h5``.
        bars: array of records to store.

    Returns:
        The dataset that was created or extended.
    """
    ds = h5.get(name)
    if ds is None:
        return h5.create_dataset(name, data=bars, chunks=True, maxshape=(None,))

    nnew = len(bars)
    if nnew == 0:
        # Nothing to append. Without this guard the slice below would have
        # been ds[-0:], i.e. the *whole* dataset rather than an empty tail.
        return ds

    nold = ds.shape[0]
    ds.resize(nold + nnew, axis=0)
    # Address the new rows from the old end; unambiguous for any nnew.
    ds[nold:] = bars

    return ds

# Append market data (intended to be run daily)
async def save_bars(codes: List[str], ft: FrameType):
    """Fetch bars for every code and append them to the h5 file.

    For each entry of ``codes`` the result of ``Stock.get_bars(code, 240, ft)``
    is passed through ``convert_frame`` and appended to the dataset
    ``/<ft.value>/<code>`` via ``append_ds``.
    """
    for symbol in codes:
        fetched = await Stock.get_bars(symbol, 240, ft)
        target = f"/{ft.value}/{symbol}"
        append_ds(target, convert_frame(fetched))


    
# Display the structure of an h5 file

def h5_tree(val, pre=''):
    """Print the hierarchy below ``val`` as a tree drawn with box characters.

    Groups are printed by name and descended into recursively; every other
    member is printed as ``name (length)``.

    Args:
        val: a mapping-like h5py group (or file) providing ``len()`` and
            ``.items()``.
        pre: prefix printed before each line; grows with nesting depth.
    """
    last = len(val) - 1
    for pos, (key, child) in enumerate(val.items()):
        is_last = pos == last
        branch = '└── ' if is_last else '├── '
        # isinstance against the public class, rather than an exact-type
        # comparison that reaches into h5py's private module layout
        if isinstance(child, h5py.Group):
            print(pre + branch + key)
            # below the last sibling no vertical rule is continued
            h5_tree(child, pre + ('    ' if is_last else '│   '))
        else:
            print(pre + branch + key + ' (%d)' % len(child))
                
h5_tree(h5)

├── 1d
│   ├── 000001.XSHE (2400000)
│   └── 600000.XSHG (2400000)
└── 1m


In [141]:
await save_bars(codes, FrameType.DAY)
await save_bars(code

0


CancelledError: 

In [128]:
mbars = h5["1d"]["000001.XSHE"]

# NOTE(review): tm is a naive datetime, so .timestamp() interprets it in the
# local timezone - confirm this matches how "frame" was turned into integers.
tm = datetime.datetime(2023, 12, 29, 14, 58)
# Count the matches on the "frame" column directly instead of reading every
# matching row back from the file only to take its length.
int(np.count_nonzero(mbars["frame"] > tm.timestamp()))

2400000