# Source code for scope_rl.dataset.base

# Copyright (c) 2023, Haruka Kiyohara, Ren Kishimoto, HAKUHODO Technologies Inc., and Hanjuku-kaso Co., Ltd. All rights reserved.
# Licensed under the Apache 2.0 License.

"""Abstract base class for logged dataset."""
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass

from ..types import LoggedDataset


@dataclass
class BaseDataset(metaclass=ABCMeta):
    """Base class for logged dataset.

    Imported as: :class:`scope_rl.dataset.BaseDataset`

    This class only declares the interface. Concrete dataset classes must
    implement both :meth:`obtain_episodes` and :meth:`obtain_steps`; the
    class cannot be instantiated until they do.

    """

    @abstractmethod
    def obtain_episodes(self, n_trajectories: int) -> LoggedDataset:
        """Rollout behavior policy and obtain episodes.

        Parameters
        ----------
        n_trajectories: int (> 0)
            Number of trajectories to generate by rolling out the behavior policy.

        Returns
        -------
        logged_dataset(s): LoggedDataset or MultipleLoggedDataset
            :class:`MultipleLoggedDataset` is an instance containing (multiple) logged datasets.

            For API consistency, each logged dataset should contain the following.

            .. code-block:: python

                key: [
                    size,
                    n_trajectories,
                    step_per_trajectory,
                    action_type,
                    n_actions,
                    action_dim,
                    action_keys,
                    action_meaning,
                    state_dim,
                    state_keys,
                    state,
                    action,
                    reward,
                    done,
                    terminal,
                    info,
                    pscore,
                ]

            size: int (> 0)
                Number of steps the dataset records.

            n_trajectories: int (> 0)
                Number of trajectories the dataset records.

            step_per_trajectory: int (> 0)
                Number of timesteps in a trajectory.

            action_type: str
                Type of the action space.
                Either "discrete" or "continuous".

            n_actions: int (> 0)
                Number of actions.
                If action_type is "continuous", `None` is recorded.

            action_dim: int (> 0)
                Dimensions of the action space.
                If action_type is "discrete", `None` is recorded.

            action_keys: list of str
                Name of each dimension of the action space.
                If action_type is "discrete", `None` is recorded.

            action_meaning: dict
                Dictionary to map discrete action index to a specific action.
                If action_type is "continuous", `None` is recorded.

            state_dim: int (> 0)
                Dimensions of the state space.

            state_keys: list of str
                Name of each dimension of the state space.

            state: ndarray of shape (size, state_dim)
                State observed under the behavior policy.

            action: ndarray of shape (size, ) or (size, action_dim)
                Action chosen by the behavior policy.

            reward: ndarray of shape (size, )
                Reward observed for each (state, action) pair.

            done: ndarray of shape (size, )
                Whether an episode ends or not.

            terminal: ndarray of shape (size, )
                Whether an episode reaches the pre-defined maximum steps.

            info: dict
                Additional feedback from the environment.

            pscore: ndarray of shape (size, )
                Propensity of the observed action being chosen under the behavior policy (pscore stands for propensity score).

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is called directly.

        """
        # NOTE(review): the annotation is `LoggedDataset` while the docstring
        # also allows `MultipleLoggedDataset` -- confirm which one is intended.
        raise NotImplementedError

    @abstractmethod
    def obtain_steps(self, n_trajectories: int) -> LoggedDataset:
        """Rollout behavior policy and obtain steps.

        Parameters
        ----------
        n_trajectories: int (> 0)
            Number of trajectories to generate by rolling out the behavior policy.

        Returns
        -------
        logged_dataset(s): LoggedDataset or MultipleLoggedDataset
            :class:`MultipleLoggedDataset` is an instance containing (multiple) logged datasets.

            For API consistency, each logged dataset should contain the following.

            .. code-block:: python

                key: [
                    size,
                    n_trajectories,
                    step_per_trajectory,
                    action_type,
                    n_actions,
                    action_dim,
                    action_keys,
                    action_meaning,
                    state_dim,
                    state_keys,
                    state,
                    action,
                    reward,
                    done,
                    terminal,
                    info,
                    pscore,
                ]

            size: int (> 0)
                Number of steps the dataset records.

            n_trajectories: int (> 0)
                Number of trajectories the dataset records.

            step_per_trajectory: int (> 0)
                Number of timesteps in a trajectory.

            action_type: str
                Type of the action space.
                Either "discrete" or "continuous".

            n_actions: int (> 0)
                Number of actions.
                If action_type is "continuous", `None` is recorded.

            action_dim: int (> 0)
                Dimensions of the action space.
                If action_type is "discrete", `None` is recorded.

            action_keys: list of str
                Name of each dimension of the action space.
                If action_type is "discrete", `None` is recorded.

            action_meaning: dict
                Dictionary to map discrete action index to a specific action.
                If action_type is "continuous", `None` is recorded.

            state_dim: int (> 0)
                Dimensions of the state space.

            state_keys: list of str
                Name of each dimension of the state space.

            state: ndarray of shape (size, state_dim)
                State observed under the behavior policy.

            action: ndarray of shape (size, ) or (size, action_dim)
                Action chosen by the behavior policy.

            reward: ndarray of shape (size, )
                Reward observed for each (state, action) pair.

            done: ndarray of shape (size, )
                Whether an episode ends or not.

            terminal: ndarray of shape (size, )
                Whether an episode reaches the pre-defined maximum steps.

            info: dict
                Additional feedback from the environment.

            pscore: ndarray of shape (size, )
                Propensity of the observed action being chosen under the behavior policy (pscore stands for propensity score).

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is called directly.

        """
        # NOTE(review): the annotation is `LoggedDataset` while the docstring
        # also allows `MultipleLoggedDataset` -- confirm which one is intended.
        raise NotImplementedError