Source code for scope_rl.ope.ops

# Copyright (c) 2023, Haruka Kiyohara, Ren Kishimoto, HAKUHODO Technologies Inc., and Hanjuku-kaso Co., Ltd. All rights reserved.
# Licensed under the Apache 2.0 License.

"""Meta class to handle Off-Policy Selection (OPS) and evaluation of OPE/OPS."""
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional, Union, List, Dict
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.utils import check_scalar
import matplotlib.pyplot as plt

from .ope import (
    OffPolicyEvaluation,
    CumulativeDistributionOPE,
)
from ..utils import (
    MultipleInputDict,
    estimate_confidence_interval_by_bootstrap,
    estimate_confidence_interval_by_hoeffding,
    estimate_confidence_interval_by_empirical_bernstein,
    estimate_confidence_interval_by_t_test,
    defaultdict_to_dict,
)
from ..types import OPEInputDict

markers = ["o", "v", "^", "s", "p", "P", "*", "h", "X", "D", "d"]
dkred = "#A60628"


[docs]@dataclass
class OffPolicySelection:
    """Class to conduct OPS and evaluation of OPE/OPS with multiple estimators simultaneously.

    Imported as: :class:`scope_rl.ope.OffPolicySelection`

    Note
    -----------
    **Off-Policy Selection (OPS)**

    OPS selects the "best" policy among several candidates based on the policy value or other statistics estimated by OPE.

    .. math::

        \\hat{\\pi} := {\\arg \\max}_{\\pi \\in \\Pi} \\hat{J}(\\pi)

    where :math:`\\Pi` is a set of candidate policies and :math:`\\hat{J}(\\cdot)` is an OPE estimate of the policy performance. Below, we describe two types of OPE to estimate such policy performance.

    **Off-Policy Evaluation (OPE)**

    (Basic) OPE estimates the expected policy performance called the policy value.

    .. math::

        V(\\pi) := \\mathbb{E} \\left[ \\sum_{t=1}^T \\gamma^{t-1} r_t \\mid \\pi \\right]

    where :math:`r_t` is the reward observed at each timestep :math:`t`,
    :math:`T` is the total number of timesteps in an episode, and :math:`\\gamma` is the discount factor.

    .. seealso::

        :class:`OffPolicyEvaluation`

    **Cumulative Distribution OPE**

    In contrast, cumulative distribution OPE first estimates the following cumulative distribution function.

    .. math::

        F(t, \\pi) := \\mathbb{E} \\left[ \\mathbb{I} \\left \\{ \\sum_{t=1}^T \\gamma^{t-1} r_t \\leq t \\right \\} \\mid \\pi \\right]

    Then, cumulative distribution OPE also estimates some risk functions including variance, conditional value at risk, and interquartile range based on the CDF estimate.

    .. seealso::

        :class:`CumulativeDistributionOPE`

    Parameters
    -----------
    ope: OffPolicyEvaluation, default=None
        Instance of the (standard) OPE class.

    cumulative_distribution_ope: CumulativeDistributionOPE, default=None
        Instance of the cumulative distribution OPE class.

    Examples
    ----------

    Preparation:

    .. code-block:: python

        # import necessary module from SCOPE-RL
        from scope_rl.dataset import SyntheticDataset
        from scope_rl.policy import EpsilonGreedyHead
        from scope_rl.ope import CreateOPEInput
        from scope_rl.ope import OffPolicySelection
        from scope_rl.ope import OffPolicyEvaluation as OPE
        from scope_rl.ope.discrete import TrajectoryWiseImportanceSampling as TIS
        from scope_rl.ope.discrete import PerDecisionImportanceSampling as PDIS
        from scope_rl.ope import CumulativeDistributionOPE
        from scope_rl.ope.discrete import CumulativeDistributionTIS as CD_IS
        from scope_rl.ope.discrete import CumulativeDistributionSNTIS as CD_SNIS

        # import necessary module from other libraries
        import gym
        import rtbgym
        from d3rlpy.algos import DoubleDQNConfig
        from d3rlpy.dataset import create_fifo_replay_buffer
        from d3rlpy.algos import ConstantEpsilonGreedy

        # initialize environment
        env = gym.make("RTBEnv-discrete-v0")

        # define (RL) agent (i.e., policy) and train on the environment
        ddqn = DoubleDQNConfig().create()
        buffer = create_fifo_replay_buffer(
            limit=10000,
            env=env,
        )
        explorer = ConstantEpsilonGreedy(
            epsilon=0.3,
        )
        ddqn.fit_online(
            env=env,
            buffer=buffer,
            explorer=explorer,
            n_steps=10000,
            n_steps_per_epoch=1000,
        )

        # convert ddqn policy to stochastic data collection policy
        behavior_policy = EpsilonGreedyHead(
            ddqn,
            n_actions=env.action_space.n,
            epsilon=0.3,
            name="ddqn_epsilon_0.3",
            random_state=12345,
        )

        # initialize dataset class
        dataset = SyntheticDataset(
            env=env,
            max_episode_steps=env.step_per_episode,
        )

        # data collection
        logged_dataset = dataset.obtain_episodes(
            behavior_policies=behavior_policy,
            n_trajectories=100,
            random_state=12345,
        )

    Create Input for OPE:

    .. code-block:: python

        # evaluation policy
        ddqn_ = EpsilonGreedyHead(
            base_policy=ddqn,
            n_actions=env.action_space.n,
            name="ddqn",
            epsilon=0.0,
            random_state=12345
        )
        random_ = EpsilonGreedyHead(
            base_policy=ddqn,
            n_actions=env.action_space.n,
            name="random",
            epsilon=1.0,
            random_state=12345
        )

        # create input for off-policy evaluation (OPE)
        prep = CreateOPEInput(
            env=env,
        )
        input_dict = prep.obtain_whole_inputs(
            logged_dataset=logged_dataset,
            evaluation_policies=[ddqn_, random_],
            n_trajectories_on_policy_evaluation=100,
            random_state=12345,
        )

    **Off-Policy Evaluation and Selection**:

    .. code-block:: python

        # OPS
        ope = OPE(
            logged_dataset=logged_dataset,
            ope_estimators=[TIS(), PDIS()],
        )
        cd_ope = CumulativeDistributionOPE(
            logged_dataset=logged_dataset,
            ope_estimators=[
                CD_IS(estimator_name="cd_is"),
                CD_SNIS(estimator_name="cd_snis"),
            ],
        )
        ops = OffPolicySelection(
            ope=ope,
            cumulative_distribution_ope=cd_ope,
        )
        ops_dict = ops.select_by_policy_value(
            input_dict=input_dict,
            return_metrics=True,
        )

    **Output**:

    .. code-block:: python

        >>> ops_dict

        {'tis': {'estimated_ranking': ['ddqn', 'random'],
                'estimated_policy_value': array([21.3624954,  0.3827044]),
                'estimated_relative_policy_value': array([1.44732354, 0.02592848]),
                'mean_squared_error': 94.79587393975419,
                'rank_correlation': SpearmanrResult(correlation=0.9999999999999999, pvalue=nan),
                'regret': (0.0, 1),
                'type_i_error_rate': 0.0,
                'type_ii_error_rate': 0.0,
                'safety_threshold': 13.284},
        'pdis': {'estimated_ranking': ['ddqn', 'random'],
                'estimated_policy_value': array([18.02806424,  7.13847486]),
                'estimated_relative_policy_value': array([1.22141357, 0.48363651]),
                'mean_squared_error': 19.45349619733373,
                'rank_correlation': SpearmanrResult(correlation=0.9999999999999999, pvalue=nan),
                'regret': (0.0, 1),
                'type_i_error_rate': 0.0,
                'type_ii_error_rate': 0.0,
                'safety_threshold': 13.284}}

    .. seealso::

        * :doc:`Quickstart </documentation/quickstart>`
        * :doc:`Related tutorials (OPS) </documentation/examples/ops>` and :doc:`related tutorials (assessments) </documentation/examples/assessments>`

    References
    -------
    Vladislav Kurenkov and Sergey Kolesnikov.
    "Showing Your Offline Reinforcement Learning Work: Online Evaluation Budget Matters." 2022.

    Shengpu Tang and Jenna Wiens.
    "Model Selection for Offline Reinforcement Learning: Practical Considerations for Healthcare Settings." 2021.

    Justin Fu, Mohammad Norouzi, Ofir Nachum, George Tucker, Ziyu Wang, Alexander Novikov, Mengjiao Yang,
    Michael R. Zhang, Yutian Chen, Aviral Kumar, Cosmin Paduraru, Sergey Levine, and Tom Le Paine.
    "Benchmarks for Deep Off-Policy Evaluation." 2021.

    Tom Le Paine, Cosmin Paduraru, Andrea Michi, Caglar Gulcehre, Konrad Zolna, Alexander Novikov, Ziyu Wang, and Nando de Freitas.
    "Hyperparameter Selection for Offline Reinforcement Learning." 2020.

    """

    ope: Optional[OffPolicyEvaluation] = None
    cumulative_distribution_ope: Optional[CumulativeDistributionOPE] = None

    def __post_init__(self):
        """Validate the given OPE instances and cache behavior-policy statistics.

        The following attributes are set:

        - ``step_per_trajectory``: value of ``logged_dataset["step_per_trajectory"]``.
        - ``behavior_policy_reward``: dictionary mapping each behavior policy name to
          its logged reward reshaped to ``(-1, step_per_trajectory)``. When
          ``disable_reward_after_done`` is set, the reward is multiplied by the
          cumulative product of ``1 - done`` along each trajectory.
        - ``_estimate_confidence_interval``: dictionary mapping the name of a confidence
          interval method to the function that implements it.

        Raises
        -------
        RuntimeError
            If neither `ope` nor `cumulative_distribution_ope` is given, or if one of
            them is not an instance of the expected class.

        """
        if self.ope is None and self.cumulative_distribution_ope is None:
            raise RuntimeError(
                "one of `ope` or `cumulative_distribution_ope` must be given"
            )

        if self.ope is not None and not isinstance(self.ope, OffPolicyEvaluation):
            raise RuntimeError("ope must be the instance of OffPolicyEvaluation")
        if self.cumulative_distribution_ope is not None and not isinstance(
            self.cumulative_distribution_ope, CumulativeDistributionOPE
        ):
            raise RuntimeError(
                "cumulative_distribution_ope must be the instance of CumulativeDistributionOPE"
            )

        # `ope` is optional, so it cannot be dereferenced unconditionally: fall back to
        # `cumulative_distribution_ope` when only the latter is given.
        # NOTE(review): assumes CumulativeDistributionOPE exposes the same
        # `logged_dataset`, `use_multiple_logged_dataset`, `multiple_logged_dataset`,
        # and `disable_reward_after_done` attributes as OffPolicyEvaluation -- confirm.
        base_ope = (
            self.ope if self.ope is not None else self.cumulative_distribution_ope
        )

        self.step_per_trajectory = base_ope.logged_dataset["step_per_trajectory"]
        check_scalar(
            self.step_per_trajectory,
            name="ope.logged_dataset['step_per_trajectory']",
            target_type=int,
            min_val=1,
        )

        def _reward_per_trajectory(logged_dataset):
            # one row per trajectory, one column per timestep
            reward = logged_dataset["reward"].reshape((-1, self.step_per_trajectory))
            if base_ope.disable_reward_after_done:
                done = logged_dataset["done"].reshape((-1, self.step_per_trajectory))
                # the cumulative product of (1 - done) is zero from the first step
                # flagged as done onward, which masks the reward of those steps
                reward = reward * (1 - done).cumprod(axis=1)
            return reward

        self.behavior_policy_reward = {}
        if base_ope.use_multiple_logged_dataset:
            multiple_logged_dataset = base_ope.multiple_logged_dataset
            for behavior_policy in multiple_logged_dataset.behavior_policy_names:
                # only the dataset with id 0 is used for each behavior policy
                logged_dataset_ = multiple_logged_dataset.get(
                    behavior_policy_name=behavior_policy, dataset_id=0
                )
                self.behavior_policy_reward[behavior_policy] = _reward_per_trajectory(
                    logged_dataset_
                )
        else:
            behavior_policy = base_ope.logged_dataset["behavior_policy"]
            self.behavior_policy_reward[behavior_policy] = _reward_per_trajectory(
                base_ope.logged_dataset
            )

        self._estimate_confidence_interval = {
            "bootstrap": estimate_confidence_interval_by_bootstrap,
            "hoeffding": estimate_confidence_interval_by_hoeffding,
            "bernstein": estimate_confidence_interval_by_empirical_bernstein,
            "ttest": estimate_confidence_interval_by_t_test,
        }

    def _check_compared_estimators(
        self,
        compared_estimators: Optional[List[str]] = None,
        ope_type: str = "standard_ope",
    ):
        if ope_type == "standard_ope":
            if self.ope is None:
                raise RuntimeError(
                    "ope is not given. Please initialize the class with ope attribute"
                )
        else:
            if self.cumulative_distribution_ope is None:
                raise RuntimeError(
                    "cumulative_distribution_ope is not given. Please initialize the class with cumulative_distribution_ope attribute"
                )

        if compared_estimators is None:
            compared_estimators = self.estimators_name[ope_type]
        elif not set(compared_estimators).issubset(self.estimators_name[ope_type]):
            raise ValueError(
                f"compared_estimators must be a subset of self.estimators_name['{ope_type}'], but found False."
            )
        return compared_estimators

    def _check_basic_visualization_inputs(
        self,
        n_cols: Optional[int] = None,
        fig_dir: Optional[Path] = None,
        fig_name: Optional[str] = None,
    ):
        if n_cols is not None:
            check_scalar(n_cols, name="n_cols", target_type=int, min_val=1)
        if fig_dir is not None and not isinstance(fig_dir, Path):
            raise ValueError(f"fig_dir must be a Path, but {type(fig_dir)} is given")
        if fig_name is not None and not isinstance(fig_name, str):
            raise ValueError(f"fig_dir must be a string, but {type(fig_dir)} is given")

    def _check_topk_inputs(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        max_topk: Optional[int] = None,
        metrics: Optional[List[str]] = None,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        gamma: Optional[float] = None,
    ):
        """Validate the inputs of the top-k evaluation and resolve their defaults.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

        behavior_policy_name: str, default=None
            Name of the behavior policy. Ignored (overwritten by the value stored in
            `input_dict`) when `input_dict` is not a MultipleInputDict.

        dataset_id: int, default=None
            Id of the logged dataset. Used only when `input_dict` is a MultipleInputDict.

        max_topk: int, default=None (> 0)
            Maximum value of `k`. Clipped to the number of available candidate policies.
            When `None` is given, the number of available candidate policies is used.

        metrics: list of str, default=None
            Each element must be one of 'k-th', 'best', 'worst', 'mean', 'std',
            'safety_violation_rate', or 'sharpe_ratio'.

        safety_threshold: float, default=None
            Safety threshold. When given, it is validated and returned as is.

        relative_safety_criteria: float, default=None (>= 0)
            Used only when `safety_threshold` is `None`. The threshold becomes
            `relative_safety_criteria` times the discounted value of the behavior policy.
            When several behavior policies are registered and `behavior_policy_name`
            is `None`, the threshold falls back to 0.0.

        gamma: float, default=None
            Discount factor used to compute the behavior policy value.

        Return
        -------
        max_topk: int
            Validated maximum value of `k` (always a built-in int).

        safety_threshold: float
            Resolved safety threshold.

        Raises
        -------
        ValueError
            If an element of `metrics` is unknown, or if `gamma` is `None` while it is
            needed to derive the threshold from `relative_safety_criteria`.

        """
        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None:
                counts_per_behavior_policy = list(input_dict.n_eval_policies.values())
            else:
                counts_per_behavior_policy = [
                    input_dict.n_eval_policies[behavior_policy_name]
                ]

            if dataset_id is None:
                available = [counts.min() for counts in counts_per_behavior_policy]
            else:
                available = [
                    counts[dataset_id] for counts in counts_per_behavior_policy
                ]

            if behavior_policy_name is not None and dataset_id is not None:
                # a single, fully specified dataset: no cap at 100 (kept as in the
                # previous implementation)
                max_topk_ = available[0]
            else:
                max_topk_ = min([100, *available])
        else:
            behavior_policy_name = input_dict[list(input_dict.keys())[0]][
                "behavior_policy"
            ]
            max_topk_ = len(input_dict)

        # n_eval_policies may hold numpy integers; always return a built-in int
        max_topk_ = int(max_topk_)
        if max_topk is None:
            max_topk = max_topk_
        else:
            check_scalar(max_topk, name="max_topk", target_type=int, min_val=1)
            max_topk = min(max_topk, max_topk_)

        if metrics is not None:
            valid_metrics = (
                "k-th",
                "best",
                "worst",
                "mean",
                "std",
                "safety_violation_rate",
                "sharpe_ratio",
            )
            for metric in metrics:
                if metric not in valid_metrics:
                    raise ValueError(
                        f"The elements of metrics must be one of 'k-th', 'best', 'worst', 'mean', 'std', 'safety_violation_rate', or 'sharpe_ratio', but {metric} is given."
                    )

        if safety_threshold is None:
            safety_threshold = 0.0

            if relative_safety_criteria is not None:
                check_scalar(
                    relative_safety_criteria,
                    name="relative_safety_criteria",
                    target_type=float,
                    min_val=0.0,
                )

                if behavior_policy_name is not None:
                    behavior_policy_reward = self.behavior_policy_reward[
                        behavior_policy_name
                    ]
                elif len(self.behavior_policy_reward) == 1:
                    behavior_policy_reward = next(
                        iter(self.behavior_policy_reward.values())
                    )
                else:
                    # the reference behavior policy is ambiguous; keep 0.0
                    behavior_policy_reward = None

                if behavior_policy_reward is not None:
                    if gamma is None:
                        raise ValueError(
                            "gamma must be given when relative_safety_criteria is specified"
                        )

                    # discount[t] = gamma ** t for t = 0, ..., step_per_trajectory - 1
                    discount = (
                        np.full(self.step_per_trajectory, gamma).cumprod() / gamma
                    )
                    behavior_policy_value = (
                        discount[np.newaxis, :] * behavior_policy_reward
                    ).sum(
                        axis=1
                    ).mean() + 1e-10  # to avoid zero division

                    safety_threshold = float(
                        relative_safety_criteria * behavior_policy_value
                    )

        check_scalar(
            safety_threshold,
            name="safety_threshold",
            target_type=float,
        )

        return max_topk, safety_threshold

    def _obtain_true_selection_result(
        self,
        input_dict: OPEInputDict,
        return_variance: bool = False,
        return_lower_quartile: bool = False,
        return_conditional_value_at_risk: bool = False,
        return_by_dataframe: bool = False,
        quartile_alpha: float = 0.05,
        cvar_alpha: float = 0.05,
    ):
        """Obtain the oracle selection result based on the ground-truth policy value.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        return_variance: bool, default=False
            Whether to return the variance or not.

        return_lower_quartile: bool. default=False
            Whether to return the lower interquartile or not.

        return_conditional_value_at_risk: bool, default=False
            Whether to return the conditional value at risk or not.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        quartile_alpha: float, default=0.05
            Proportion of the shaded region of the interquartile range.

        cvar_alpha: float, default=0.05
            Proportion of the shaded region of the conditional value at risk.

        Return
        -------
        ground_truth_dict/ground_truth_df: dict or dataframe
            Dictionary/dataframe containing the following ground-truth (on-policy) metrics.

            .. code-block:: python

                key: [
                    ranking,
                    policy_value,
                    relative_policy_value,
                    variance,
                    ranking_by_lower_quartile,
                    lower_quartile,
                    ranking_by_conditional_value_at_risk,
                    conditional_value_at_risk,
                    parameters,  # only when return_by_dataframe == False
                ]

            ranking: list of str
                Name of the candidate policies sorted by the ground-truth policy value.

            policy_value: list of float
                Ground-truth policy value of the candidate policies (sorted by ranking).

            relative_policy_value: list of float
                Ground-truth relative policy value of the candidate policies compared to the behavior policy (sorted by ranking).

            variance: list of float
                Ground-truth variance of the trajectory-wise reward of the candidate policies (sorted by ranking).
                If return_variance is `False`, `None` is recorded.

            ranking_by_lower_quartile: list of str
                Name of the candidate policies sorted by the ground-truth lower quartile of the trajectory-wise reward.
                If return_lower_quartile is `False`, `None` is recorded.

            lower_quartile: list of float
                Ground-truth lower quartile of the candidate policies (sorted by ranking_by_lower_quartile).
                If return_lower_quartile is `False`, `None` is recorded.

            ranking_by_conditional_value_at_risk: list of str
                Name of the candidate policies sorted by the ground-truth conditional value at risk.
                If return_conditional_value_at_risk is `False`, `None` is recorded.

            conditional_value_at_risk: list of float
                Ground-truth conditional value at risk of the candidate policies (sorted by ranking_by_conditional_value_at_risk).
                If return_conditional_value_at_risk is `False`, `None` is recorded.

            parameters: dict
                Dictionary containing quartile_alpha, and cvar_alpha.
                If return_by_dataframe is `True`, parameters will not be returned.

        """
        candidate_policy_names = list(input_dict.keys())
        for eval_policy in candidate_policy_names:
            if input_dict[eval_policy]["on_policy_policy_value"] is None:
                raise ValueError(
                    f"one of the candidate policies, {eval_policy}, does not contain on-policy policy value in input_dict"
                )
        behavior_policy = input_dict[eval_policy]["behavior_policy"]

        n_policies = len(candidate_policy_names)
        n_samples = len(input_dict[eval_policy]["on_policy_policy_value"])

        policy_value = np.zeros(n_policies)
        for i, eval_policy in enumerate(candidate_policy_names):
            policy_value[i] = input_dict[eval_policy]["on_policy_policy_value"].mean()

        ranking_index = np.argsort(policy_value)[::-1]
        ranking = [candidate_policy_names[ranking_index[i]] for i in range(n_policies)]

        gamma = input_dict[eval_policy]["gamma"]
        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma

        behavior_policy_reward = self.behavior_policy_reward[behavior_policy]
        behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum(
            axis=1
        ).mean() + 1e-10  # to avoid zero division

        policy_value = np.sort(policy_value)[::-1]
        relative_policy_value = policy_value / behavior_policy_value

        if return_variance:
            variance = np.zeros(n_policies)
            for i, eval_policy in enumerate(candidate_policy_names):
                variance[i] = input_dict[eval_policy]["on_policy_policy_value"].var(
                    ddof=1
                )
            variance = variance[ranking_index]

        if return_lower_quartile:
            lower_quartile = np.zeros(n_policies)
            for i, eval_policy in enumerate(candidate_policy_names):
                lower_quartile[i] = np.quantile(
                    input_dict[eval_policy]["on_policy_policy_value"], q=quartile_alpha
                )

            quartile_ranking_index = np.argsort(policy_value)[::-1]
            ranking_by_lower_quartile = [
                candidate_policy_names[quartile_ranking_index[i]]
                for i in range(n_policies)
            ]
            lower_quartile = np.sort(lower_quartile)[::-1]

        if return_conditional_value_at_risk:
            cvar = np.zeros(n_policies)
            for i, eval_policy in enumerate(candidate_policy_names):
                cvar[i] = np.sort(input_dict[eval_policy]["on_policy_policy_value"])[
                    : int(n_samples * cvar_alpha)
                ].mean()

            cvar_ranking_index = np.argsort(cvar)[::-1]
            ranking_by_cvar = [
                candidate_policy_names[cvar_ranking_index[i]] for i in range(n_policies)
            ]
            cvar = np.sort(cvar)[::-1]

        ground_truth_dict = {
            "ranking": ranking,
            "policy_value": policy_value,
            "relative_policy_value": relative_policy_value,
            "variance": variance if return_variance else None,
            "ranking_by_lower_quartile": ranking_by_lower_quartile
            if return_lower_quartile
            else None,
            "lower_quartile": lower_quartile if return_lower_quartile else None,
            "ranking_by_conditional_value_at_risk": ranking_by_cvar
            if return_conditional_value_at_risk
            else None,
            "conditional_value_at_risk": cvar
            if return_conditional_value_at_risk
            else None,
            "parameters": {
                "quartile_alpha": quartile_alpha if return_lower_quartile else None,
                "cvar_alpha": cvar_alpha if return_conditional_value_at_risk else None,
            },
        }

        if return_by_dataframe:
            ground_truth_df = pd.DataFrame()
            for key in ground_truth_dict.keys():
                if ground_truth_dict[key] is None or key == "parameters":
                    continue

                ground_truth_df[key] = ground_truth_dict[key]

        return ground_truth_df if return_by_dataframe else ground_truth_dict

    def _select_by_policy_value(
        self,
        input_dict: OPEInputDict,
        compared_estimators: Optional[List[str]] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
    ):
        """Rank the candidate policies by their estimated policy values.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_policy_value,
                    estimated_relative_policy_value,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value: list of float
                Estimated policy value of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value: list of float
                Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict when return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimators calculated across candidate evaluation policies.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        behavior_policy_name = list(input_dict.values())[0]["behavior_policy"]
        dataset_id = list(input_dict.values())[0]["dataset_id"]
        gamma = list(input_dict.values())[0]["gamma"]

        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma
        behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name]

        behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum(
            axis=1
        ).mean() + 1e-10  # to avoid zero division

        if safety_threshold is None:
            if relative_safety_criteria is None:
                safety_threshold = 0.0
            else:
                safety_threshold = relative_safety_criteria * behavior_policy_value

        estimated_policy_value_dict = self.ope.estimate_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        ground_truth_dict = self.obtain_true_selection_result(input_dict)
        true_ranking = ground_truth_dict["ranking"]
        true_policy_value = ground_truth_dict["policy_value"]

        candidate_policy_names = (
            true_ranking if return_metrics else list(input_dict.keys())
        )
        n_policies = len(candidate_policy_names)

        ops_dict = {}
        for i, estimator in enumerate(compared_estimators):
            estimated_policy_value_ = np.zeros(n_policies)
            true_policy_value_ = np.zeros(n_policies)
            for j, eval_policy in enumerate(candidate_policy_names):
                estimated_policy_value_[j] = estimated_policy_value_dict[eval_policy][
                    estimator
                ]
                true_policy_value_[j] = true_policy_value[j]

            estimated_ranking_index_ = np.argsort(estimated_policy_value_)[::-1]
            true_ranking_index_ = np.argsort(true_policy_value_)[::-1]

            estimated_ranking = [
                candidate_policy_names[estimated_ranking_index_[i]]
                for i in range(n_policies)
            ]
            estimated_policy_value = np.sort(estimated_policy_value_)[::-1]
            estimated_relative_policy_value = (
                estimated_policy_value / behavior_policy_value
            )

            if return_metrics:
                mse = mean_squared_error(true_policy_value, estimated_policy_value_)
                rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_)
                regret = (
                    true_policy_value[:top_k_in_eval_metrics].sum()
                    - true_policy_value[estimated_ranking_index_][
                        :top_k_in_eval_metrics
                    ].sum()
                )

                true_safety = true_policy_value >= safety_threshold
                estimated_safety = estimated_policy_value_ >= safety_threshold

                if true_safety.sum() > 0:
                    type_i_error_rate = (
                        true_safety > estimated_safety
                    ).sum() / true_safety.sum()
                else:
                    type_i_error_rate = 0.0

                if (1 - true_safety).sum() > 0:
                    type_ii_error_rate = (true_safety < estimated_safety).sum() / (
                        1 - true_safety
                    ).sum()
                else:
                    type_ii_error_rate = 0.0

            ops_dict[estimator] = {
                "estimated_ranking": estimated_ranking,
                "estimated_policy_value": estimated_policy_value,
                "estimated_relative_policy_value": estimated_relative_policy_value,
            }
            if return_true_values:
                ops_dict[estimator]["true_ranking"] = true_ranking_index_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_policy_value"] = true_policy_value_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_relative_policy_value"] = (
                    true_policy_value_[estimated_ranking_index_] / behavior_policy_value
                )
            if return_metrics:
                ops_dict[estimator]["mean_squared_error"] = mse
                ops_dict[estimator]["rank_correlation"] = rankcorr
                ops_dict[estimator]["regret"] = (regret, top_k_in_eval_metrics)
                ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate
                ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate
                ops_dict[estimator]["safety_threshold"] = safety_threshold

        if return_by_dataframe:
            ranking_df_dict = defaultdict(pd.DataFrame)

            for i, estimator in enumerate(compared_estimators):
                ranking_df_ = pd.DataFrame()
                ranking_df_["estimated_ranking"] = ops_dict[estimator][
                    "estimated_ranking"
                ]
                ranking_df_["estimated_policy_value"] = ops_dict[estimator][
                    "estimated_policy_value"
                ]
                ranking_df_["estimated_relative_policy_value"] = ops_dict[estimator][
                    "estimated_relative_policy_value"
                ]

                if return_true_values:
                    ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"]
                    ranking_df_["true_policy_value"] = ops_dict[estimator][
                        "true_policy_value"
                    ]
                    ranking_df_["true_relative_policy_value"] = ops_dict[estimator][
                        "true_relative_policy_value"
                    ]

                ranking_df_dict[estimator] = ranking_df_

            ranking_df_dict = defaultdict_to_dict(ranking_df_dict)

            if return_metrics:
                (
                    mse,
                    rankcorr,
                    pvalue,
                    regret,
                    type_i,
                    type_ii,
                ) = (
                    [],
                    [],
                    [],
                    [],
                    [],
                    [],
                )
                for i, estimator in enumerate(compared_estimators):
                    mse.append(ops_dict[estimator]["mean_squared_error"])
                    rankcorr.append(ops_dict[estimator]["rank_correlation"][0])
                    pvalue.append(ops_dict[estimator]["rank_correlation"][1])
                    regret.append(ops_dict[estimator]["regret"][0])
                    type_i.append(ops_dict[estimator]["type_i_error_rate"])
                    type_ii.append(ops_dict[estimator]["type_ii_error_rate"])

                metric_df = pd.DataFrame()
                metric_df["estimator"] = compared_estimators
                metric_df["mean_squared_error"] = mse
                metric_df["rank_correlation"] = rankcorr
                metric_df["pvalue"] = pvalue
                metric_df[f"regret@{top_k_in_eval_metrics}"] = regret
                metric_df["type_i_error_rate"] = type_i
                metric_df["type_ii_error_rate"] = type_ii

            dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict

        return dfs if return_by_dataframe else ops_dict

    def _select_by_policy_value_via_cumulative_distribution_ope(
        self,
        input_dict: OPEInputDict,
        compared_estimators: Optional[List[str]] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
    ):
        """Rank the candidate policies by their estimated policy value via cumulative distribution OPE methods.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_policy_value,
                    estimated_relative_policy_value,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value: list of float
                Estimated policy value of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value: list of float
                Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimators calculated across candidate evaluation policies.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        behavior_policy_name = list(input_dict.values())[0]["behavior_policy"]
        dataset_id = list(input_dict.values())[0]["dataset_id"]
        gamma = list(input_dict.values())[0]["gamma"]

        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma
        behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name]

        behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum(
            axis=1
        ).mean() + 1e-10  # to avoid zero division

        if safety_threshold is None:
            if relative_safety_criteria is None:
                safety_threshold = 0.0
            else:
                safety_threshold = relative_safety_criteria * behavior_policy_value

        estimated_policy_value_dict = self.cumulative_distribution_ope.estimate_mean(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        ground_truth_dict = self.obtain_true_selection_result(input_dict)
        true_ranking = ground_truth_dict["ranking"]
        true_policy_value = ground_truth_dict["policy_value"]

        candidate_policy_names = (
            true_ranking if return_metrics else list(input_dict.keys())
        )
        n_policies = len(candidate_policy_names)

        ops_dict = {}
        for i, estimator in enumerate(compared_estimators):
            estimated_policy_value_ = np.zeros(n_policies)
            true_policy_value_ = np.zeros(n_policies)
            for j, eval_policy in enumerate(candidate_policy_names):
                estimated_policy_value_[j] = estimated_policy_value_dict[eval_policy][
                    estimator
                ]
                true_policy_value_[j] = true_policy_value[j]

            estimated_ranking_index_ = np.argsort(estimated_policy_value_)[::-1]
            true_ranking_index_ = np.argsort(true_policy_value_)[::-1]

            estimated_ranking = [
                candidate_policy_names[estimated_ranking_index_[i]]
                for i in range(n_policies)
            ]
            estimated_policy_value = np.sort(estimated_policy_value_)[::-1]
            estimated_relative_policy_value = (
                estimated_policy_value / behavior_policy_value
            )

            if return_metrics:
                mse = mean_squared_error(
                    true_policy_value, np.nan_to_num(estimated_policy_value_)
                )
                rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_)
                regret = (
                    true_policy_value[:top_k_in_eval_metrics].sum()
                    - true_policy_value[estimated_ranking_index_][
                        :top_k_in_eval_metrics
                    ].sum()
                )

                true_safety = true_policy_value >= safety_threshold
                estimated_safety = estimated_policy_value_ >= safety_threshold

                if true_safety.sum() > 0:
                    type_i_error_rate = (
                        true_safety > estimated_safety
                    ).sum() / true_safety.sum()
                else:
                    type_i_error_rate = 0.0

                if (1 - true_safety).sum() > 0:
                    type_ii_error_rate = (true_safety < estimated_safety).sum() / (
                        1 - true_safety
                    ).sum()
                else:
                    type_ii_error_rate = 0.0

            ops_dict[estimator] = {
                "estimated_ranking": estimated_ranking,
                "estimated_policy_value": estimated_policy_value,
                "estimated_relative_policy_value": estimated_relative_policy_value,
            }
            if return_true_values:
                ops_dict[estimator]["true_ranking"] = true_ranking_index_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_policy_value"] = true_policy_value_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_relative_policy_value"] = (
                    true_policy_value_[estimated_ranking_index_] / behavior_policy_value
                )
            if return_metrics:
                ops_dict[estimator]["mean_squared_error"] = mse
                ops_dict[estimator]["rank_correlation"] = rankcorr
                ops_dict[estimator]["regret"] = (regret, top_k_in_eval_metrics)
                ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate
                ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate
                ops_dict[estimator]["safety_threshold"] = safety_threshold

        if return_by_dataframe:
            ranking_df_dict = defaultdict(pd.DataFrame)

            for i, estimator in enumerate(compared_estimators):
                ranking_df_ = pd.DataFrame()
                ranking_df_["estimated_ranking"] = ops_dict[estimator][
                    "estimated_ranking"
                ]
                ranking_df_["estimated_policy_value"] = ops_dict[estimator][
                    "estimated_policy_value"
                ]
                ranking_df_["estimated_relative_policy_value"] = ops_dict[estimator][
                    "estimated_relative_policy_value"
                ]

                if return_true_values:
                    ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"]
                    ranking_df_["true_policy_value"] = ops_dict[estimator][
                        "true_policy_value"
                    ]
                    ranking_df_["true_relative_policy_value"] = ops_dict[estimator][
                        "true_relative_policy_value"
                    ]

                ranking_df_dict[estimator] = ranking_df_

            ranking_df_dict = defaultdict_to_dict(ranking_df_dict)

            if return_metrics:
                (
                    mse,
                    rankcorr,
                    pvalue,
                    regret,
                    type_i,
                    type_ii,
                ) = (
                    [],
                    [],
                    [],
                    [],
                    [],
                    [],
                )
                for i, estimator in enumerate(compared_estimators):
                    mse.append(ops_dict[estimator]["mean_squared_error"])
                    rankcorr.append(ops_dict[estimator]["rank_correlation"][0])
                    pvalue.append(ops_dict[estimator]["rank_correlation"][1])
                    regret.append(ops_dict[estimator]["regret"][0])
                    type_i.append(ops_dict[estimator]["type_i_error_rate"])
                    type_ii.append(ops_dict[estimator]["type_ii_error_rate"])

                metric_df = pd.DataFrame()
                metric_df["estimator"] = compared_estimators
                metric_df["mean_squared_error"] = mse
                metric_df["rank_correlation"] = rankcorr
                metric_df["pvalue"] = pvalue
                metric_df[f"regret@{top_k_in_eval_metrics}"] = regret
                metric_df["type_i_error_rate"] = type_i
                metric_df["type_ii_error_rate"] = type_ii

            dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict

        return dfs if return_by_dataframe else ops_dict

    def _select_by_policy_value_lower_bound(
        self,
        input_dict: OPEInputDict,
        compared_estimators: Optional[List[str]] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        cis: List[str] = ["bootstrap"],
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
    ):
        """Rank the candidate policies by their estimated policy value lower bound.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"]
            Estimation methods for confidence intervals.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [ci][estimator_name][
                    estimated_ranking,
                    estimated_policy_value_lower_bound,
                    estimated_relative_policy_value_lower_bound,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value lower bound.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value_lower_bound: list of float
                Estimated policy value lower bound of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value_lower_bound: list of float
                Estimated relative policy value lower bound of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: None
                This is for API consistency.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        ground_truth_dict = self.obtain_true_selection_result(input_dict)
        true_ranking = ground_truth_dict["ranking"]
        true_policy_value = ground_truth_dict["policy_value"]

        behavior_policy_name = list(input_dict.values())[0]["behavior_policy"]
        dataset_id = list(input_dict.values())[0]["dataset_id"]
        gamma = list(input_dict.values())[0]["gamma"]

        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma
        behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name]

        behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum(
            axis=1
        ).mean() + 1e-10  # to avoid zero division

        if safety_threshold is None:
            if relative_safety_criteria is None:
                safety_threshold = 0.0
            else:
                safety_threshold = relative_safety_criteria * behavior_policy_value

        candidate_policy_names = (
            true_ranking if return_metrics else list(input_dict.keys())
        )
        n_policies = len(candidate_policy_names)

        ops_dict = defaultdict(dict)
        for ci in cis:
            estimated_policy_value_interval_dict = self.ope.estimate_intervals(
                input_dict,
                compared_estimators=compared_estimators,
                behavior_policy_name=behavior_policy_name,
                dataset_id=dataset_id,
                alpha=alpha,
                ci=ci,
                n_bootstrap_samples=n_bootstrap_samples,
                random_state=random_state,
            )

            for i, estimator in enumerate(compared_estimators):
                estimated_policy_value_lower_bound_ = np.zeros(n_policies)
                true_policy_value_ = np.zeros(n_policies)
                for j, eval_policy in enumerate(candidate_policy_names):
                    estimated_policy_value_lower_bound_[
                        j
                    ] = estimated_policy_value_interval_dict[eval_policy][estimator][
                        f"{100 * (1. - alpha)}% CI (lower)"
                    ]
                    true_policy_value_[j] = true_policy_value[j]

                estimated_ranking_index_ = np.argsort(
                    estimated_policy_value_lower_bound_
                )[::-1]
                true_ranking_index_ = np.argsort(true_policy_value_)[::-1]

                estimated_ranking = [
                    candidate_policy_names[estimated_ranking_index_[i]]
                    for i in range(n_policies)
                ]
                estimated_policy_value_lower_bound = np.sort(
                    estimated_policy_value_lower_bound_
                )[::-1]
                estimated_relative_policy_value_lower_bound = (
                    estimated_policy_value_lower_bound / behavior_policy_value
                )

                if return_metrics:
                    rankcorr = spearmanr(
                        np.arange(n_policies), estimated_ranking_index_
                    )
                    regret = (
                        true_policy_value[:top_k_in_eval_metrics].sum()
                        - true_policy_value[estimated_ranking_index_][
                            :top_k_in_eval_metrics
                        ].sum()
                    )

                    true_safety = true_policy_value >= safety_threshold
                    estimated_safety = (
                        estimated_policy_value_lower_bound_ >= safety_threshold
                    )

                    if true_safety.sum() > 0:
                        type_i_error_rate = (
                            true_safety > estimated_safety
                        ).sum() / true_safety.sum()
                    else:
                        type_i_error_rate = 0.0

                    if (1 - true_safety).sum() > 0:
                        type_ii_error_rate = (true_safety < estimated_safety).sum() / (
                            1 - true_safety
                        ).sum()
                    else:
                        type_ii_error_rate = 0.0

                ops_dict[ci][estimator] = {
                    "estimated_ranking": estimated_ranking,
                    "estimated_policy_value_lower_bound": estimated_policy_value_lower_bound,
                    "estimated_relative_policy_value_lower_bound": estimated_relative_policy_value_lower_bound,
                }
                if return_true_values:
                    ops_dict[ci][estimator]["true_ranking"] = true_ranking_index_[
                        estimated_ranking_index_
                    ]
                    ops_dict[ci][estimator]["true_policy_value"] = true_policy_value_[
                        estimated_ranking_index_
                    ]
                    ops_dict[ci][estimator]["true_relative_policy_value"] = (
                        true_policy_value_[estimated_ranking_index_]
                        / behavior_policy_value
                    )
                if return_metrics:
                    ops_dict[ci][estimator]["mean_squared_error"] = None
                    ops_dict[ci][estimator]["rank_correlation"] = rankcorr
                    ops_dict[ci][estimator]["regret"] = (regret, top_k_in_eval_metrics)
                    ops_dict[ci][estimator]["type_i_error_rate"] = type_i_error_rate
                    ops_dict[ci][estimator]["type_ii_error_rate"] = type_ii_error_rate
                    ops_dict[ci][estimator]["safety_threshold"] = safety_threshold

        ops_dict = defaultdict_to_dict(ops_dict)

        if return_by_dataframe:
            ranking_df_dict = defaultdict(lambda: defaultdict(pd.DataFrame))

            for ci in cis:
                for i, estimator in enumerate(compared_estimators):
                    ranking_df_ = pd.DataFrame()
                    ranking_df_["estimated_ranking"] = ops_dict[ci][estimator][
                        "estimated_ranking"
                    ]
                    ranking_df_["estimated_policy_value_lower_bound"] = ops_dict[ci][
                        estimator
                    ]["estimated_policy_value_lower_bound"]
                    ranking_df_[
                        "estimated_relative_policy_value_lower_bound"
                    ] = ops_dict[ci][estimator][
                        "estimated_relative_policy_value_lower_bound"
                    ]

                    if return_true_values:
                        ranking_df_["true_ranking"] = ops_dict[ci][estimator][
                            "true_ranking"
                        ]
                        ranking_df_["true_policy_value"] = ops_dict[ci][estimator][
                            "true_policy_value"
                        ]
                        ranking_df_["true_relative_policy_value"] = ops_dict[ci][
                            estimator
                        ]["true_relative_policy_value"]

                    ranking_df_dict[ci][estimator] = ranking_df_

            ranking_df_dict = defaultdict_to_dict(ranking_df_dict)

        if return_metrics:
            (
                ci_,
                estimator_,
                rankcorr,
                pvalue,
                regret,
                type_i,
                type_ii,
            ) = (
                [],
                [],
                [],
                [],
                [],
                [],
                [],
            )
            for ci in cis:
                for i, estimator in enumerate(compared_estimators):
                    ci_.append(ci)
                    estimator_.append(estimator)
                    rankcorr.append(ops_dict[ci][estimator]["rank_correlation"][0])
                    pvalue.append(ops_dict[ci][estimator]["rank_correlation"][1])
                    regret.append(ops_dict[ci][estimator]["regret"][0])
                    type_i.append(ops_dict[ci][estimator]["type_i_error_rate"])
                    type_ii.append(ops_dict[ci][estimator]["type_ii_error_rate"])

            metric_df = pd.DataFrame()
            metric_df["ci"] = ci_
            metric_df["estimator"] = estimator_
            metric_df["mean_squared_error"] = np.nan
            metric_df["rank_correlation"] = rankcorr
            metric_df["pvalue"] = pvalue
            metric_df[f"regret@{top_k_in_eval_metrics}"] = regret
            metric_df["type_i_error_rate"] = type_i
            metric_df["type_ii_error_rate"] = type_ii

            dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict

        return dfs if return_by_dataframe else ops_dict

    def _select_by_lower_quartile(
        self,
        input_dict: OPEInputDict,
        compared_estimators: Optional[List[str]] = None,
        alpha: float = 0.05,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        safety_threshold: float = 0.0,
    ):
        """Rank the candidate policies by their estimated lower quartile of the trajectory-wise reward.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            This method iterates over the given list directly, so the caller must pass
            a concrete list of estimator names (`None` is not resolved here).

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        return_true_values: bool, default=False
            Whether to return the true lower quartile of the trajectory-wise reward
            and corresponding ranking of the candidate evaluation policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        safety_threshold: float, default=0.0 (>= 0)
            The lower quartile required to be considered a safe policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.
            When return_by_dataframe is `True` and return_metrics is `False`, only ranking_df_dict is returned.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_lower_quartile,
                    true_ranking,
                    true_lower_quartile,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                    safety_threshold,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated lower quartile of the trajectory-wise reward.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_lower_quartile: list of float
                Estimated lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_lower_quartile: list of float
                True lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimated lower quartile of the trajectory-wise reward.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Spearman rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: None
                This is for API consistency.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df (as NaN) if return_by_dataframe is `True`.

            type_i_error_rate: float
                Proportion of the truly safe policies (true lower quartile >= safety_threshold)
                whose estimated lower quartile is below safety_threshold.
                `0.0` is recorded when no policy is truly safe.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Proportion of the truly unsafe policies (true lower quartile < safety_threshold)
                whose estimated lower quartile is at or above safety_threshold.
                `0.0` is recorded when no policy is truly unsafe.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            safety_threshold: float
                The lower quartile required to be considered a safe policy.
                Recorded only when return_metrics is `True`.

        """
        first_input = list(input_dict.values())[0]
        behavior_policy_name = first_input["behavior_policy"]
        dataset_id = first_input["dataset_id"]

        estimated_interquartile_range_dict = (
            self.cumulative_distribution_ope.estimate_interquartile_range(
                input_dict,
                compared_estimators=compared_estimators,
                behavior_policy_name=behavior_policy_name,
                dataset_id=dataset_id,
                alpha=alpha,
            )
        )

        ground_truth_dict = self.obtain_true_selection_result(
            input_dict,
            return_lower_quartile=True,
            quartile_alpha=alpha,
        )
        true_ranking = ground_truth_dict["ranking_by_lower_quartile"]
        true_lower_quartile = ground_truth_dict["lower_quartile"]

        # The ground-truth values are sorted by `true_ranking`, so the candidates must be
        # enumerated in that very order whenever the true values are reported or used.
        # Otherwise the j-th true value would be attached to the j-th key of `input_dict`,
        # which is generally a different policy.
        needs_true_order = return_metrics or return_true_values
        candidate_policy_names = (
            true_ranking if needs_true_order else list(input_dict.keys())
        )
        n_policies = len(candidate_policy_names)

        # key under which the estimated lower quartile is looked up (loop-invariant)
        lower_quartile_key = f"{100 * (1. - alpha)}% quartile (lower)"

        ops_dict = {}
        for estimator in compared_estimators:
            estimated_lower_quartile_ = np.zeros(n_policies)
            true_lower_quartile_ = np.zeros(n_policies)
            for j, eval_policy in enumerate(candidate_policy_names):
                estimated_lower_quartile_[j] = estimated_interquartile_range_dict[
                    eval_policy
                ][estimator][lower_quartile_key]
                true_lower_quartile_[j] = true_lower_quartile[j]

            # indices of the candidates in the descending order of the estimated/true value
            estimated_ranking_index_ = np.argsort(estimated_lower_quartile_)[::-1]
            true_ranking_index_ = np.argsort(true_lower_quartile_)[::-1]

            estimated_ranking = [
                candidate_policy_names[index] for index in estimated_ranking_index_
            ]
            estimated_lower_quartile = np.sort(estimated_lower_quartile_)[::-1]

            ops_dict[estimator] = {
                "estimated_ranking": estimated_ranking,
                "estimated_lower_quartile": estimated_lower_quartile,
            }

            if return_true_values:
                ops_dict[estimator]["true_ranking"] = true_ranking_index_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_lower_quartile"] = true_lower_quartile_[
                    estimated_ranking_index_
                ]

            if return_metrics:
                mse = mean_squared_error(
                    true_lower_quartile_, estimated_lower_quartile_
                )
                # candidates are in the true ranking order here, so `arange` is the true ranking
                rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_)

                true_safety = true_lower_quartile_ >= safety_threshold
                estimated_safety = estimated_lower_quartile_ >= safety_threshold
                n_safe = true_safety.sum()
                n_unsafe = (1 - true_safety).sum()

                # safe in truth, but estimated as unsafe
                if n_safe > 0:
                    type_i_error_rate = (true_safety > estimated_safety).sum() / n_safe
                else:
                    type_i_error_rate = 0.0

                # unsafe in truth, but estimated as safe
                if n_unsafe > 0:
                    type_ii_error_rate = (
                        true_safety < estimated_safety
                    ).sum() / n_unsafe
                else:
                    type_ii_error_rate = 0.0

                ops_dict[estimator]["mean_squared_error"] = mse
                ops_dict[estimator]["rank_correlation"] = rankcorr
                ops_dict[estimator]["regret"] = None
                ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate
                ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate
                ops_dict[estimator]["safety_threshold"] = safety_threshold

        if not return_by_dataframe:
            return ops_dict

        ranking_df_dict = defaultdict(pd.DataFrame)
        for estimator in compared_estimators:
            estimator_result = ops_dict[estimator]

            ranking_df_ = pd.DataFrame()
            ranking_df_["estimated_ranking"] = estimator_result["estimated_ranking"]
            ranking_df_["estimated_lower_quartile"] = estimator_result[
                "estimated_lower_quartile"
            ]

            if return_true_values:
                ranking_df_["true_ranking"] = estimator_result["true_ranking"]
                ranking_df_["true_lower_quartile"] = estimator_result[
                    "true_lower_quartile"
                ]

            ranking_df_dict[estimator] = ranking_df_

        ranking_df_dict = defaultdict_to_dict(ranking_df_dict)

        if not return_metrics:
            return ranking_df_dict

        # one row per estimator
        metric_df = pd.DataFrame()
        metric_df["estimator"] = compared_estimators
        metric_df["mean_squared_error"] = [
            ops_dict[estimator]["mean_squared_error"]
            for estimator in compared_estimators
        ]
        metric_df["rank_correlation"] = [
            ops_dict[estimator]["rank_correlation"][0]
            for estimator in compared_estimators
        ]
        metric_df["pvalue"] = [
            ops_dict[estimator]["rank_correlation"][1]
            for estimator in compared_estimators
        ]
        metric_df["regret"] = np.nan
        metric_df["type_i_error_rate"] = [
            ops_dict[estimator]["type_i_error_rate"]
            for estimator in compared_estimators
        ]
        metric_df["type_ii_error_rate"] = [
            ops_dict[estimator]["type_ii_error_rate"]
            for estimator in compared_estimators
        ]

        return ranking_df_dict, metric_df

    def _select_by_conditional_value_at_risk(
        self,
        input_dict: OPEInputDict,
        compared_estimators: Optional[List[str]] = None,
        alpha: float = 0.05,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        safety_threshold: float = 0.0,
    ):
        """Rank the candidate policies by their estimated conditional value at risk.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            This method iterates over the given list directly, so the caller must pass
            a concrete list of estimator names (`None` is not resolved here).

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        return_true_values: bool, default=False
            Whether to return the true conditional value at risk
            and corresponding ranking of the candidate evaluation policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.
            When return_by_dataframe is `True` and return_metrics is `False`, only ranking_df_dict is returned.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_conditional_value_at_risk,
                    true_ranking,
                    true_conditional_value_at_risk,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                    safety_threshold,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated conditional value at risk.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_conditional_value_at_risk: list of float
                Estimated conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_conditional_value_at_risk: list of float
                True conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimated conditional value at risk
                (NaN estimates are replaced via `np.nan_to_num` before the comparison).
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Spearman rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: None
                This is for API consistency.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df (as NaN) if return_by_dataframe is `True`.

            type_i_error_rate: float
                Proportion of the truly safe policies (true CVaR >= safety_threshold)
                whose estimated CVaR is below safety_threshold.
                `0.0` is recorded when no policy is truly safe.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Proportion of the truly unsafe policies (true CVaR < safety_threshold)
                whose estimated CVaR is at or above safety_threshold.
                `0.0` is recorded when no policy is truly unsafe.
                Recorded only when return_metrics is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            safety_threshold: float
                The conditional value at risk required to be considered a safe policy.
                Recorded only when return_metrics is `True`.

        """
        first_input = list(input_dict.values())[0]
        behavior_policy_name = first_input["behavior_policy"]
        dataset_id = first_input["dataset_id"]

        estimated_cvar_dict = (
            self.cumulative_distribution_ope.estimate_conditional_value_at_risk(
                input_dict,
                compared_estimators=compared_estimators,
                behavior_policy_name=behavior_policy_name,
                dataset_id=dataset_id,
                alphas=alpha,
            )
        )

        ground_truth_dict = self.obtain_true_selection_result(
            input_dict,
            return_conditional_value_at_risk=True,
            cvar_alpha=alpha,
        )
        true_ranking = ground_truth_dict["ranking_by_conditional_value_at_risk"]
        true_cvar = ground_truth_dict["conditional_value_at_risk"]

        # The ground-truth values are sorted by `true_ranking`, so the candidates must be
        # enumerated in that very order whenever the true values are reported or used.
        # Otherwise the j-th true value would be attached to the j-th key of `input_dict`,
        # which is generally a different policy.
        needs_true_order = return_metrics or return_true_values
        candidate_policy_names = (
            true_ranking if needs_true_order else list(input_dict.keys())
        )
        n_policies = len(candidate_policy_names)

        ops_dict = {}
        for estimator in compared_estimators:
            estimated_cvar_ = np.zeros(n_policies)
            true_cvar_ = np.zeros(n_policies)
            for j, eval_policy in enumerate(candidate_policy_names):
                estimated_cvar_[j] = estimated_cvar_dict[eval_policy][estimator]
                true_cvar_[j] = true_cvar[j]

            # indices of the candidates in the descending order of the estimated/true value
            # NOTE(review): np.argsort places NaN last, so after the reversal a NaN estimate
            # is ranked first -- confirm whether NaN estimates can reach this point.
            estimated_ranking_index_ = np.argsort(estimated_cvar_)[::-1]
            true_ranking_index_ = np.argsort(true_cvar_)[::-1]

            estimated_ranking = [
                candidate_policy_names[index] for index in estimated_ranking_index_
            ]
            estimated_cvar = np.sort(estimated_cvar_)[::-1]

            ops_dict[estimator] = {
                "estimated_ranking": estimated_ranking,
                "estimated_conditional_value_at_risk": estimated_cvar,
            }

            if return_true_values:
                ops_dict[estimator]["true_ranking"] = true_ranking_index_[
                    estimated_ranking_index_
                ]
                ops_dict[estimator]["true_conditional_value_at_risk"] = true_cvar_[
                    estimated_ranking_index_
                ]

            if return_metrics:
                mse = mean_squared_error(true_cvar_, np.nan_to_num(estimated_cvar_))
                # candidates are in the true ranking order here, so `arange` is the true ranking
                rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_)

                true_safety = true_cvar_ >= safety_threshold
                estimated_safety = estimated_cvar_ >= safety_threshold
                n_safe = true_safety.sum()
                n_unsafe = (1 - true_safety).sum()

                # safe in truth, but estimated as unsafe
                if n_safe > 0:
                    type_i_error_rate = (true_safety > estimated_safety).sum() / n_safe
                else:
                    type_i_error_rate = 0.0

                # unsafe in truth, but estimated as safe
                if n_unsafe > 0:
                    type_ii_error_rate = (
                        true_safety < estimated_safety
                    ).sum() / n_unsafe
                else:
                    type_ii_error_rate = 0.0

                ops_dict[estimator]["mean_squared_error"] = mse
                ops_dict[estimator]["rank_correlation"] = rankcorr
                ops_dict[estimator]["regret"] = None
                ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate
                ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate
                ops_dict[estimator]["safety_threshold"] = safety_threshold

        if not return_by_dataframe:
            return ops_dict

        ranking_df_dict = defaultdict(pd.DataFrame)
        for estimator in compared_estimators:
            estimator_result = ops_dict[estimator]

            ranking_df_ = pd.DataFrame()
            ranking_df_["estimated_ranking"] = estimator_result["estimated_ranking"]
            ranking_df_["estimated_conditional_value_at_risk"] = estimator_result[
                "estimated_conditional_value_at_risk"
            ]

            if return_true_values:
                ranking_df_["true_ranking"] = estimator_result["true_ranking"]
                ranking_df_["true_conditional_value_at_risk"] = estimator_result[
                    "true_conditional_value_at_risk"
                ]

            ranking_df_dict[estimator] = ranking_df_

        ranking_df_dict = defaultdict_to_dict(ranking_df_dict)

        if not return_metrics:
            return ranking_df_dict

        # one row per estimator
        metric_df = pd.DataFrame()
        metric_df["estimator"] = compared_estimators
        metric_df["mean_squared_error"] = [
            ops_dict[estimator]["mean_squared_error"]
            for estimator in compared_estimators
        ]
        metric_df["rank_correlation"] = [
            ops_dict[estimator]["rank_correlation"][0]
            for estimator in compared_estimators
        ]
        metric_df["pvalue"] = [
            ops_dict[estimator]["rank_correlation"][1]
            for estimator in compared_estimators
        ]
        metric_df["regret"] = np.nan
        metric_df["type_i_error_rate"] = [
            ops_dict[estimator]["type_i_error_rate"]
            for estimator in compared_estimators
        ]
        metric_df["type_ii_error_rate"] = [
            ops_dict[estimator]["type_ii_error_rate"]
            for estimator in compared_estimators
        ]

        return ranking_df_dict, metric_df

[docs]    def obtain_true_selection_result(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        return_variance: bool = False,
        return_lower_quartile: bool = False,
        return_conditional_value_at_risk: bool = False,
        return_by_dataframe: bool = False,
        quartile_alpha: float = 0.05,
        cvar_alpha: float = 0.05,
    ):
        """Obtain the oracle selection result based on the ground-truth policy value.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        return_variance: bool, default=False
            Whether to return the variance or not.

        return_lower_quartile: bool. default=False
            Whether to return the lower interquartile or not.

        return_conditional_value_at_risk: bool, default=False
            Whether to return the conditional value at risk or not.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        quartile_alpha: float, default=0.05
            Proportion of the shaded region of the interquartile range.

        cvar_alpha: float, default=0.05
            Proportion of the shaded region of the conditional value at risk.

        Return
        -------
        ground_truth_dict/ground_truth_df: dict or dataframe (, list of dict or dataframe)
            Dictionary/dataframe containing the following ground-truth (on-policy) metrics.

            .. code-block:: python

                key: [
                    ranking,
                    policy_value,
                    relative_policy_value,
                    variance,
                    ranking_by_lower_quartile,
                    lower_quartile,
                    ranking_by_conditional_value_at_risk,
                    conditional_value_at_risk,
                    parameters,  # only when return_by_dataframe == False
                ]

            ranking: list of str
                Name of the candidate policies sorted by the ground-truth policy value.

            policy_value: list of float
                Ground-truth policy value of the candidate policies (sorted by ranking).

            relative_policy_value: list of float
                Ground-truth relative policy value of the candidate policies compared to the behavior policy (sorted by ranking).

            variance: list of float
                Ground-truth variance of the trajectory-wise reward of the candidate policies (sorted by ranking).
                If return_variance is `False`, `None` is recorded.

            ranking_by_lower_quartile: list of str
                Name of the candidate policies sorted by the ground-truth lower quartile of the trajectory-wise reward.
                If return_lower_quartile is `False`, `None` is recorded.

            lower_quartile: list of float
                Ground-truth lower quartile of the candidate policies (sorted by ranking_by_lower_quartile).
                If return_lower_quartile is `False`, `None` is recorded.

            ranking_by_conditional_value_at_risk: list of str
                Name of the candidate policies sorted by the ground-truth conditional value at risk.
                If return_conditional_value_at_risk is `False`, `None` is recorded.

            conditional_value_at_risk: list of float
                Ground-truth conditional value at risk of the candidate policies (sorted by ranking_by_conditional_value_at_risk).
                If return_conditional_value_at_risk is `False`, `None` is recorded.

            parameters: dict
                Dictionary containing quartile_alpha, and cvar_alpha.
                If return_by_dataframe is `True`, parameters will not be returned.

        """
        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                ground_truth = defaultdict(list)

                for (
                    behavior_policy,
                    n_datasets,
                ) in input_dict.n_datasets.items():
                    for dataset_id_ in range(n_datasets):
                        input_dict_ = input_dict.get(
                            behavior_policy_name=behavior_policy,
                            dataset_id=dataset_id_,
                        )
                        ground_truth_ = self._obtain_true_selection_result(
                            input_dict_,
                            return_variance=return_variance,
                            return_lower_quartile=return_lower_quartile,
                            return_conditional_value_at_risk=return_conditional_value_at_risk,
                            return_by_dataframe=return_by_dataframe,
                            quartile_alpha=quartile_alpha,
                            cvar_alpha=cvar_alpha,
                        )
                        ground_truth[behavior_policy].append(ground_truth_)

                ground_truth = defaultdict_to_dict(ground_truth)

            elif behavior_policy_name is None and dataset_id is not None:
                ground_truth = {}
                for behavior_policy in input_dict.behavior_policy_names:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy, dataset_id=dataset_id
                    )
                    ground_truth_ = self._obtain_true_selection_result(
                        input_dict_,
                        return_variance=return_variance,
                        return_lower_quartile=return_lower_quartile,
                        return_conditional_value_at_risk=return_conditional_value_at_risk,
                        return_by_dataframe=return_by_dataframe,
                        quartile_alpha=quartile_alpha,
                        cvar_alpha=cvar_alpha,
                    )
                    ground_truth[behavior_policy] = ground_truth_

            elif behavior_policy_name is not None and dataset_id is None:
                ground_truth = []
                for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]):
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name,
                        dataset_id=dataset_id_,
                    )
                    ground_truth_ = self._obtain_true_selection_result(
                        input_dict_,
                        return_variance=return_variance,
                        return_lower_quartile=return_lower_quartile,
                        return_conditional_value_at_risk=return_conditional_value_at_risk,
                        return_by_dataframe=return_by_dataframe,
                        quartile_alpha=quartile_alpha,
                        cvar_alpha=cvar_alpha,
                    )
                    ground_truth.append(ground_truth_)

            else:
                input_dict_ = input_dict.get(
                    behavior_policy_name=behavior_policy_name, dataset_id=dataset_id
                )
                ground_truth = self._obtain_true_selection_result(
                    input_dict_,
                    return_variance=return_variance,
                    return_lower_quartile=return_lower_quartile,
                    return_conditional_value_at_risk=return_conditional_value_at_risk,
                    return_by_dataframe=return_by_dataframe,
                    quartile_alpha=quartile_alpha,
                    cvar_alpha=cvar_alpha,
                )
        else:
            ground_truth = self._obtain_true_selection_result(
                input_dict,
                return_variance=return_variance,
                return_lower_quartile=return_lower_quartile,
                return_conditional_value_at_risk=return_conditional_value_at_risk,
                return_by_dataframe=return_by_dataframe,
                quartile_alpha=quartile_alpha,
                cvar_alpha=cvar_alpha,
            )

        return ground_truth

[docs]    def select_by_policy_value(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
    ):
        """Rank the candidate policies by their estimated policy values.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe)
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_policy_value,
                    estimated_relative_policy_value,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value: list of float
                Estimated policy value of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value: list of float
                Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict when return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimators calculated across candidate evaluation policies.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )

        if self.ope.use_multiple_logged_dataset:
            if isinstance(input_dict, MultipleInputDict):
                if behavior_policy_name is None and dataset_id is None:
                    if (
                        self.ope.multiple_logged_dataset.n_datasets
                        != input_dict.n_datasets
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = defaultdict(list)
                        metric_df = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                )
                                ranking_df[behavior_policy].append(ops_result_[0])
                                metric_df[behavior_policy].append(ops_result_[1])

                            ops_result = (
                                defaultdict_to_dict(ranking_df),
                                defaultdict_to_dict(metric_df),
                            )

                    else:
                        ops_result = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                )
                                ops_result[behavior_policy].append(ops_result_)

                        ops_result = defaultdict_to_dict(ops_result)

                elif behavior_policy_name is None and dataset_id is not None:
                    if (
                        self.ope.multiple_logged_dataset.behavior_policy_names
                        != input_dict.behavior_policy_names
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = {}
                        metric_df = {}

                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ranking_df[behavior_policy] = ops_result_[0]
                            metric_df[behavior_policy] = ops_result_[1]

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = {}
                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ops_result[behavior_policy] = ops_result_

                elif behavior_policy_name is not None and dataset_id is None:
                    if (
                        self.ope.multiple_logged_dataset.n_datasets[
                            behavior_policy_name
                        ]
                        != input_dict.n_datasets[behavior_policy_name]
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = []
                        metric_df = []

                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ranking_df.append(ops_result_[0])
                            metric_df.append(ops_result_[1])

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = []
                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ops_result.append(ops_result_)

                else:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name, dataset_id=dataset_id
                    )
                    ops_result = self._select_by_policy_value(
                        input_dict_,
                        compared_estimators=compared_estimators,
                        return_true_values=return_true_values,
                        return_metrics=return_metrics,
                        return_by_dataframe=return_by_dataframe,
                        top_k_in_eval_metrics=top_k_in_eval_metrics,
                        safety_threshold=safety_threshold,
                        relative_safety_criteria=relative_safety_criteria,
                    )

            else:
                ops_result = self._select_by_policy_value(
                    input_dict,
                    compared_estimators=compared_estimators,
                    return_true_values=return_true_values,
                    return_metrics=return_metrics,
                    return_by_dataframe=return_by_dataframe,
                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                    safety_threshold=safety_threshold,
                    relative_safety_criteria=relative_safety_criteria,
                )

        else:
            if isinstance(input_dict, MultipleInputDict):
                raise ValueError(
                    "when using LoggedDataset, please use InputDict instead of MultipleInputDict"
                )

            ops_result = self._select_by_policy_value(
                input_dict,
                compared_estimators=compared_estimators,
                return_true_values=return_true_values,
                return_metrics=return_metrics,
                return_by_dataframe=return_by_dataframe,
                top_k_in_eval_metrics=top_k_in_eval_metrics,
                safety_threshold=safety_threshold,
                relative_safety_criteria=relative_safety_criteria,
            )

        return ops_result

[docs]    def select_by_policy_value_via_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
    ):
        """Rank the candidate policies by their estimated policy value via cumulative distribution OPE methods.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe)
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_policy_value,
                    estimated_relative_policy_value,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value: list of float
                Estimated policy value of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value: list of float
                Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimators calculated across candidate evaluation policies.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df when return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )

        if self.cumulative_distribution_ope.use_multiple_logged_dataset:
            if isinstance(input_dict, MultipleInputDict):
                if behavior_policy_name is None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets
                        != input_dict.n_datasets
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = defaultdict(list)
                        metric_df = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                )
                                ranking_df[behavior_policy].append(ops_result_[0])
                                metric_df[behavior_policy].append(ops_result_[1])

                            ops_result = (
                                defaultdict_to_dict(ranking_df),
                                defaultdict_to_dict(metric_df),
                            )

                    else:
                        ops_result = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                )
                                ops_result[behavior_policy].append(ops_result_)

                        ops_result = defaultdict_to_dict(ops_result)

                elif behavior_policy_name is None and dataset_id is not None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names
                        != input_dict.behavior_policy_names
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = {}
                        metric_df = {}

                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ranking_df[behavior_policy] = ops_result_[0]
                            metric_df[behavior_policy] = ops_result_[1]

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = {}
                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ops_result[behavior_policy] = ops_result_

                elif behavior_policy_name is not None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[
                            behavior_policy_name
                        ]
                        != input_dict.n_datasets[behavior_policy_name]
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = []
                        metric_df = []

                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ranking_df.append(ops_result_[0])
                            metric_df.append(ops_result_[1])

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = []
                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                            )
                            ops_result.append(ops_result_)

                else:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name, dataset_id=dataset_id
                    )
                    ops_result = (
                        self._select_by_policy_value_via_cumulative_distribution_ope(
                            input_dict_,
                            compared_estimators=compared_estimators,
                            return_true_values=return_true_values,
                            return_metrics=return_metrics,
                            return_by_dataframe=return_by_dataframe,
                            top_k_in_eval_metrics=top_k_in_eval_metrics,
                            safety_threshold=safety_threshold,
                            relative_safety_criteria=relative_safety_criteria,
                        )
                    )

            else:
                ops_result = (
                    self._select_by_policy_value_via_cumulative_distribution_ope(
                        input_dict,
                        compared_estimators=compared_estimators,
                        return_true_values=return_true_values,
                        return_metrics=return_metrics,
                        return_by_dataframe=return_by_dataframe,
                        top_k_in_eval_metrics=top_k_in_eval_metrics,
                        safety_threshold=safety_threshold,
                        relative_safety_criteria=relative_safety_criteria,
                    )
                )

        else:
            if isinstance(input_dict, MultipleInputDict):
                raise ValueError(
                    "when using LoggedDataset, please use InputDict instead of MultipleInputDict"
                )

            ops_result = self._select_by_policy_value_via_cumulative_distribution_ope(
                input_dict,
                compared_estimators=compared_estimators,
                return_true_values=return_true_values,
                return_metrics=return_metrics,
                return_by_dataframe=return_by_dataframe,
                top_k_in_eval_metrics=top_k_in_eval_metrics,
                safety_threshold=safety_threshold,
                relative_safety_criteria=relative_safety_criteria,
            )

        return ops_result

[docs]    def select_by_policy_value_lower_bound(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        top_k_in_eval_metrics: int = 1,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        cis: List[str] = ["bootstrap"],
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
    ):
        """Rank the candidate policies by their estimated policy value lower bound.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        return_true_values: bool, default=False
            Whether to return the true policy value and corresponding ranking of the candidate policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            rank-correlation, regret@k, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        top_k_in_eval_metrics: int, default=1
            How many candidate policies are included in regret@k.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None (>= 0)
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"]
            Estimation methods for confidence intervals.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe)
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [ci][estimator_name][
                    estimated_ranking,
                    estimated_policy_value_lower_bound,
                    estimated_relative_policy_value_lower_bound,
                    true_ranking,
                    true_policy_value,
                    true_relative_policy_value,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated policy value lower bound.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_policy_value_lower_bound: list of float
                Estimated policy value lower bound of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_relative_policy_value_lower_bound: list of float
                Estimated relative policy value lower bound of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_policy_value: list of float
                True policy value of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_relative_policy_value: list of float
                True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: None
                This is for API consistency.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: tuple of float and int
                Regret@k and k.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            safety_threshold: float
                A policy whose policy value is below the given threshold is to be considered unsafe.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )

        if self.ope.use_multiple_logged_dataset:
            if isinstance(input_dict, MultipleInputDict):
                if behavior_policy_name is None and dataset_id is None:
                    if (
                        self.ope.multiple_logged_dataset.n_datasets
                        != input_dict.n_datasets
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = defaultdict(list)
                        metric_df = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value_lower_bound(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                    cis=cis,
                                    alpha=alpha,
                                    n_bootstrap_samples=n_bootstrap_samples,
                                    random_state=random_state,
                                )
                                ranking_df[behavior_policy].append(ops_result_[0])
                                metric_df[behavior_policy].append(ops_result_[1])

                            ops_result = (
                                defaultdict_to_dict(ranking_df),
                                defaultdict_to_dict(metric_df),
                            )

                    else:
                        ops_result = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_policy_value_lower_bound(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                                    safety_threshold=safety_threshold,
                                    relative_safety_criteria=relative_safety_criteria,
                                    cis=cis,
                                    alpha=alpha,
                                    n_bootstrap_samples=n_bootstrap_samples,
                                    random_state=random_state,
                                )
                                ops_result[behavior_policy].append(ops_result_)

                        ops_result = defaultdict_to_dict(ops_result)

                elif behavior_policy_name is None and dataset_id is not None:
                    if (
                        self.ope.multiple_logged_dataset.behavior_policy_names
                        != input_dict.behavior_policy_names
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = {}
                        metric_df = {}

                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value_lower_bound(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                                cis=cis,
                                alpha=alpha,
                                n_bootstrap_samples=n_bootstrap_samples,
                                random_state=random_state,
                            )
                            ranking_df[behavior_policy] = ops_result_[0]
                            metric_df[behavior_policy] = ops_result_[1]

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = {}
                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_policy_value_lower_bound(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                                cis=cis,
                                alpha=alpha,
                                n_bootstrap_samples=n_bootstrap_samples,
                                random_state=random_state,
                            )
                            ops_result[behavior_policy] = ops_result_

                elif behavior_policy_name is not None and dataset_id is None:
                    if (
                        self.ope.multiple_logged_dataset.n_datasets[
                            behavior_policy_name
                        ]
                        != input_dict.n_datasets[behavior_policy_name]
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = []
                        metric_df = []

                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value_lower_bound(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                                cis=cis,
                                alpha=alpha,
                                n_bootstrap_samples=n_bootstrap_samples,
                                random_state=random_state,
                            )
                            ranking_df.append(ops_result_[0])
                            metric_df.append(ops_result_[1])

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = []
                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_policy_value_lower_bound(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                top_k_in_eval_metrics=top_k_in_eval_metrics,
                                safety_threshold=safety_threshold,
                                relative_safety_criteria=relative_safety_criteria,
                                cis=cis,
                                alpha=alpha,
                                n_bootstrap_samples=n_bootstrap_samples,
                                random_state=random_state,
                            )
                            ops_result.append(ops_result_)

                else:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name,
                        dataset_id=dataset_id,
                    )
                    ops_result = self._select_by_policy_value_lower_bound(
                        input_dict_,
                        compared_estimators=compared_estimators,
                        return_true_values=return_true_values,
                        return_metrics=return_metrics,
                        return_by_dataframe=return_by_dataframe,
                        top_k_in_eval_metrics=top_k_in_eval_metrics,
                        safety_threshold=safety_threshold,
                        relative_safety_criteria=relative_safety_criteria,
                        cis=cis,
                        alpha=alpha,
                        n_bootstrap_samples=n_bootstrap_samples,
                        random_state=random_state,
                    )

            else:
                ops_result = self._select_by_policy_value_lower_bound(
                    input_dict,
                    compared_estimators=compared_estimators,
                    return_true_values=return_true_values,
                    return_metrics=return_metrics,
                    return_by_dataframe=return_by_dataframe,
                    top_k_in_eval_metrics=top_k_in_eval_metrics,
                    safety_threshold=safety_threshold,
                    relative_safety_criteria=relative_safety_criteria,
                    cis=cis,
                    alpha=alpha,
                    n_bootstrap_samples=n_bootstrap_samples,
                    random_state=random_state,
                )

        else:
            if isinstance(input_dict, MultipleInputDict):
                raise ValueError(
                    "when using LoggedDataset, please use InputDict instead of MultipleInputDict"
                )

            ops_result = self._select_by_policy_value_lower_bound(
                input_dict,
                compared_estimators=compared_estimators,
                return_true_values=return_true_values,
                return_metrics=return_metrics,
                return_by_dataframe=return_by_dataframe,
                top_k_in_eval_metrics=top_k_in_eval_metrics,
                safety_threshold=safety_threshold,
                relative_safety_criteria=relative_safety_criteria,
                cis=cis,
                alpha=alpha,
                n_bootstrap_samples=n_bootstrap_samples,
                random_state=random_state,
            )

        return ops_result

[docs]    def select_by_lower_quartile(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        safety_threshold: float = 0.0,
    ):
        """Rank the candidate policies by their estimated lower quartile of the trajectory-wise reward.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        return_true_values: bool, default=False
            Whether to return the true lower quartile of the trajectory-wise reward
            and corresponding ranking of the candidate evaluation policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        safety_threshold: float, default=0.0 (>= 0)
            The lower quartile required to be considered a safe policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_lower_quartile,
                    true_ranking,
                    true_lower_quartile,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated lower quartile of the trajectory-wise reward.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_lower_quartile: list of float
                Estimated lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_lower_quartile: list of float
                True lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimated lower quartile of the trajectory-wise reward.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple of float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: None
                This is for API consistency.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            safety_threshold: float
                The lower quartile required to be considered a safe policy.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )

        if self.cumulative_distribution_ope.use_multiple_logged_dataset:
            if isinstance(input_dict, MultipleInputDict):
                if behavior_policy_name is None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets
                        != input_dict.n_datasets
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = defaultdict(list)
                        metric_df = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_lower_quartile(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    alpha=alpha,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    safety_threshold=safety_threshold,
                                )
                                ranking_df[behavior_policy].append(ops_result_[0])
                                metric_df[behavior_policy].append(ops_result_[1])

                            ops_result = (
                                defaultdict_to_dict(ranking_df),
                                defaultdict_to_dict(metric_df),
                            )

                    else:
                        ops_result = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = (
                                    ops_result_
                                ) = self._select_by_lower_quartile(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    alpha=alpha,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    safety_threshold=safety_threshold,
                                )
                                ops_result[behavior_policy].append(ops_result_)

                        ops_result = defaultdict_to_dict(ops_result)

                elif behavior_policy_name is None and dataset_id is not None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names
                        != input_dict.behavior_policy_names
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = {}
                        metric_df = {}

                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_lower_quartile(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ranking_df[behavior_policy] = ops_result_[0]
                            metric_df[behavior_policy] = ops_result_[1]

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = {}
                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_lower_quartile(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ops_result[behavior_policy] = ops_result_

                elif behavior_policy_name is not None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[
                            behavior_policy_name
                        ]
                        != input_dict.n_datasets[behavior_policy_name]
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = []
                        metric_df = []

                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_lower_quartile(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ranking_df.append(ops_result_[0])
                            metric_df.append(ops_result_[1])

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = []
                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_lower_quartile(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ops_result.append(ops_result_)

                else:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name, dataset_id=dataset_id
                    )
                    ops_result = self._select_by_lower_quartile(
                        input_dict_,
                        compared_estimators=compared_estimators,
                        alpha=alpha,
                        return_true_values=return_true_values,
                        return_metrics=return_metrics,
                        return_by_dataframe=return_by_dataframe,
                        safety_threshold=safety_threshold,
                    )

            else:
                ops_result = self._select_by_lower_quartile(
                    input_dict,
                    compared_estimators=compared_estimators,
                    alpha=alpha,
                    return_true_values=return_true_values,
                    return_metrics=return_metrics,
                    return_by_dataframe=return_by_dataframe,
                    safety_threshold=safety_threshold,
                )

        else:
            if isinstance(input_dict, MultipleInputDict):
                raise ValueError(
                    "when using LoggedDataset, please use InputDict instead of MultipleInputDict"
                )

            ops_result = self._select_by_lower_quartile(
                input_dict,
                compared_estimators=compared_estimators,
                alpha=alpha,
                return_true_values=return_true_values,
                return_metrics=return_metrics,
                return_by_dataframe=return_by_dataframe,
                safety_threshold=safety_threshold,
            )

        return ops_result

[docs]    def select_by_conditional_value_at_risk(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        return_true_values: bool = False,
        return_metrics: bool = False,
        return_by_dataframe: bool = False,
        safety_threshold: float = 0.0,
    ):
        """Rank the candidate policies by their estimated conditional value at risk.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleLoggedDataset
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        return_true_values: bool, default=False
            Whether to return the true conditional value at risk
            and corresponding ranking of the candidate evaluation policies.

        return_metrics: bool, default=False
            Whether to return the following evaluation metrics in terms of OPE and OPS:
            mean-squared-error, rank-correlation, and Type I and Type II error rate.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        Return
        -------
        ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe)
            Dictionary/dataframe containing the result of OPS conducted by OPE estimators.

            .. code-block:: python

                key: [estimator_name][
                    estimated_ranking,
                    estimated_conditional_value_at_risk,
                    true_ranking,
                    true_conditional_value_at_risk,
                    mean_squared_error,
                    rank_correlation,
                    regret,
                    type_i_error_rate,
                    type_ii_error_rate,
                ]

            estimated_ranking: list of str
                Name of the candidate policies sorted by the estimated conditional value at risk.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            estimated_conditional_value_at_risk: list of float
                Estimated conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_ranking: list of int
                Ranking index of the (true) conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            true_conditional_value_at_risk: list of float
                True conditional value at risk of the candidate policies (sorted by estimated_ranking).
                Recorded only when return_true_values is `True`.
                Recorded in ranking_df_dict if return_by_dataframe is `True`.

            mean_squared_error: float
                Mean-squared-error of the estimated conditional value at risk.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            rank_correlation: tuple or float
                Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            regret: None
                This is for API consistency.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_i_error_rate: float
                Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is `True`.

            type_ii_error_rate: float
                Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected.
                Recorded only when return_metric is `True`.
                Recorded in metric_df if return_by_dataframe is True`.

            safety_threshold: float
                The conditional value at risk required to be considered a safe policy.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )

        if self.cumulative_distribution_ope.use_multiple_logged_dataset:
            if isinstance(input_dict, MultipleInputDict):
                if behavior_policy_name is None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets
                        != input_dict.n_datasets
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = defaultdict(list)
                        metric_df = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_conditional_value_at_risk(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    alpha=alpha,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    safety_threshold=safety_threshold,
                                )
                                ranking_df[behavior_policy].append(ops_result_[0])
                                metric_df[behavior_policy].append(ops_result_[1])

                            ops_result = (
                                defaultdict_to_dict(ranking_df),
                                defaultdict_to_dict(metric_df),
                            )

                    else:
                        ops_result = defaultdict(list)

                        for (
                            behavior_policy,
                            n_datasets,
                        ) in input_dict.n_datasets.items():
                            for dataset_id_ in range(n_datasets):
                                input_dict_ = input_dict.get(
                                    behavior_policy_name=behavior_policy,
                                    dataset_id=dataset_id_,
                                )
                                ops_result_ = self._select_by_conditional_value_at_risk(
                                    input_dict_,
                                    compared_estimators=compared_estimators,
                                    alpha=alpha,
                                    return_true_values=return_true_values,
                                    return_metrics=return_metrics,
                                    return_by_dataframe=return_by_dataframe,
                                    safety_threshold=safety_threshold,
                                )
                                ops_result[behavior_policy].append(ops_result_)

                        ops_result = defaultdict_to_dict(ops_result)

                elif behavior_policy_name is None and dataset_id is not None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names
                        != input_dict.behavior_policy_names
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same behavior policies, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = {}
                        metric_df = {}

                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_conditional_value_at_risk(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ranking_df[behavior_policy] = ops_result_[0]
                            metric_df[behavior_policy] = ops_result_[1]

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = {}
                        for behavior_policy in input_dict.behavior_policy_names:
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy,
                                dataset_id=dataset_id,
                            )
                            ops_result_ = self._select_by_conditional_value_at_risk(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ops_result[behavior_policy] = ops_result_

                elif behavior_policy_name is not None and dataset_id is None:
                    if (
                        self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[
                            behavior_policy_name
                        ]
                        != input_dict.n_datasets[behavior_policy_name]
                    ):
                        raise ValueError(
                            "Expected that logged datasets and input dicts consists of the same dataset ids, but found False."
                        )

                    if return_metrics and return_by_dataframe:
                        ranking_df = []
                        metric_df = []

                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_conditional_value_at_risk(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ranking_df.append(ops_result_[0])
                            metric_df.append(ops_result_[1])

                        ops_result = (ranking_df, metric_df)

                    else:
                        ops_result = []
                        for dataset_id_ in range(
                            input_dict.n_datasets[behavior_policy_name]
                        ):
                            input_dict_ = input_dict.get(
                                behavior_policy_name=behavior_policy_name,
                                dataset_id=dataset_id_,
                            )
                            ops_result_ = self._select_by_conditional_value_at_risk(
                                input_dict_,
                                compared_estimators=compared_estimators,
                                alpha=alpha,
                                return_true_values=return_true_values,
                                return_metrics=return_metrics,
                                return_by_dataframe=return_by_dataframe,
                                safety_threshold=safety_threshold,
                            )
                            ops_result.append(ops_result_)

                else:
                    input_dict_ = input_dict.get(
                        behavior_policy_name=behavior_policy_name, dataset_id=dataset_id
                    )
                    ops_result = self._select_by_conditional_value_at_risk(
                        input_dict_,
                        compared_estimators=compared_estimators,
                        alpha=alpha,
                        return_true_values=return_true_values,
                        return_metrics=return_metrics,
                        return_by_dataframe=return_by_dataframe,
                        safety_threshold=safety_threshold,
                    )

            else:
                ops_result = self._select_by_conditional_value_at_risk(
                    input_dict,
                    compared_estimators=compared_estimators,
                    alpha=alpha,
                    return_true_values=return_true_values,
                    return_metrics=return_metrics,
                    return_by_dataframe=return_by_dataframe,
                    safety_threshold=safety_threshold,
                )

        else:
            if isinstance(input_dict, MultipleInputDict):
                raise ValueError(
                    "when using LoggedDataset, please use InputDict instead of MultipleInputDict"
                )

            ops_result = self._select_by_conditional_value_at_risk(
                input_dict,
                compared_estimators=compared_estimators,
                alpha=alpha,
                return_true_values=return_true_values,
                return_metrics=return_metrics,
                return_by_dataframe=return_by_dataframe,
                safety_threshold=safety_threshold,
            )

        return ops_result

[docs]    def visualize_policy_value_for_selection(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        ci: str = "bootstrap",
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        is_relative: bool = False,
        hue: str = "estimator",
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value_standard_ope.png",
    ):
        """Visualize the policy value estimated by OPE estimators (box plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        is_relative: bool, default=False
            If `True`, the method visualizes the estimated policy value of the evaluation policy
            relative to the on-policy policy value of the behavior policy.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_policy_value_standard_ope.png"
            Name of the bar figure.

        """
        self.ope.visualize_off_policy_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=alpha,
            ci=ci,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
            is_relative=is_relative,
            hue=hue,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_cumulative_distribution_function_for_selection(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        hue: str = "estimator",
        legend: bool = True,
        n_cols: Optional[int] = None,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_cumulative_distribution_function.png",
    ) -> None:
        """Visualize the cumulative distribution function (cdf plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleLoggedDataset
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the figure.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_cumulative_distribution_function.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_cumulative_distribution_function(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            hue=hue,
            legend=legend,
            n_cols=n_cols,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_policy_value_of_cumulative_distribution_ope_for_selection(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        is_relative: bool = False,
        hue: str = "estimator",
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value_cumulative_distribution_ope.png",
    ) -> None:
        """Visualize the policy value estimated by cumulative distribution OPE estimators (box plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Significance level. The value should bw within `[0, 1)`.

        is_relative: bool, default=False
            If `True`, the method visualizes the estimated policy value of the evaluation policy
            relative to the ground-truth policy value of the behavior policy.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_policy_value_cumulative_distribution_ope.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_policy_value(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=alpha,
            is_relative=is_relative,
            hue=hue,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_conditional_value_at_risk_for_selection(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alphas: Optional[np.ndarray] = None,
        hue: str = "estimator",
        legend: bool = True,
        n_cols: Optional[int] = None,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_conditional_value_at_risk.png",
    ) -> None:
        """Visualize the conditional value at risk estimated by cumulative distribution OPE estimators (cdf plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alphas: array-like of shape (n_alpha, ), default=None
            Set of proportions of the shaded region. The values should be within `[0, 1)`.
            If `None` is given, :class:`np.linspace(0, 1, 21)` will be used.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the figure.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_conditional_value_at_risk.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_conditional_value_at_risk(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alphas=alphas,
            hue=hue,
            legend=legend,
            n_cols=n_cols,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_interquartile_range_for_selection(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        hue: str = "estimator",
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_interquartile_range.png",
    ) -> None:
        """Visualize the interquartile range estimated by cumulative distribution OPE estimators (box plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_interquartile_range.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_interquartile_range(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=alpha,
            hue=hue,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_policy_value_with_multiple_estimates_standard_ope(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        plot_type: str = "ci",
        hue: str = "estimator",
        legend: bool = True,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value_multiple_standard_ope.png",
    ) -> None:
        """Visualize the policy value estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        plot_type: {"ci", "scatter", "violin"}, default="ci"
            Type of plot.
            If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals.
            If "scatter" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_policy_value_multiple.png"
            Name of the bar figure.

        """
        self.ope.visualize_policy_value_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_cumulative_distribution_function_with_multiple_estimates(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        scale_min: Optional[float] = None,
        scale_max: Optional[float] = None,
        n_partition: Optional[int] = None,
        plot_type: str = "ci_hue",
        hue: str = "estimator",
        legend: bool = True,
        n_cols: Optional[int] = None,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value_multiple.png",
    ) -> None:
        """Visualize the policy value estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        This function is not applicable when the data-driven reward scaler is used.
        Please set ``scale_min``, ``scale_max``, and ``n_partition`` to use.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        scale_min: float, default=None
            Minimum value of the reward scale in the CDF.

        scale_max: float, default=None
            Maximum value of the reward scale in the CDF.

        n_partition: int, default=None
            Number of partitions in the reward scale (x-axis of the CDF).

        plot_type: {"ci_hue", "ci_behavior_policy", "enumerate"}, default="ci_hue"
            Type of plot.
            If "ci" is given, the method visualizes the average policy value and its 95% confidence intervals based on the multiple estimate.
            If "enumerate" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_policy_value_multiple.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_cumulative_distribution_function_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            scale_min=scale_min,
            scale_max=scale_max,
            n_partition=n_partition,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            n_cols=n_cols,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_policy_value_with_multiple_estimates_cumulative_distribution_ope(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        plot_type: str = "ci",
        hue: str = "estimator",
        legend: bool = True,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value_multiple_cumulative_distribution_ope.png",
    ) -> None:
        """Visualize the policy value estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        plot_type: {"ci", "scatter", "violin"}, default="ci"
            Type of plot.
            If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals.
            If "scatter" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_policy_value_multiple.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_policy_value_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_variance_with_multiple_estimates(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        plot_type: str = "ci",
        hue: str = "estimator",
        legend: bool = True,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_variance_multiple.png",
    ) -> None:
        """Visualize the variance of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        plot_type: {"ci", "scatter", "violin"}, default="ci"
            Type of plot.
            If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals.
            If "scatter" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_variance_multiple.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_variance_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_conditional_value_at_risk_with_multiple_estimates(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        alpha: float = 0.05,
        plot_type: str = "ci",
        hue: str = "estimator",
        legend: bool = True,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_conditional_value_at_risk_multiple.png",
    ) -> None:
        """Visualize the conditional value at risk of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        alpha: float = 0.05.
            Proportion of the shaded region in CVaR estimate. The value should be within `[0, 1)`.

        plot_type: {"ci", "scatter", "violin"}, default="ci"
            Type of plot.
            If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals.
            If "scatter" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_conditional_value_at_risk_multiple.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_conditional_value_at_risk_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            alpha=alpha,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_lower_quartile_with_multiple_estimates(
        self,
        input_dict: MultipleInputDict,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        alpha: float = 0.05,
        plot_type: str = "ci",
        hue: str = "estimator",
        legend: bool = True,
        sharey: bool = False,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_conditional_value_at_risk_multiple.png",
    ) -> None:
        """Visualize the lower quartile of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset.

        Note
        -------
        This function is applicable only when MultipleLoggedDataset is used and
        MultipleInputDict is collected by the same evaluation policy across logged datasets.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        alpha: float = 0.05.
            Proportion of the shaded region in CVaR estimate. The value should be within `[0, 1)`.

        plot_type: {"ci", "scatter", "violin"}, default="ci"
            Type of plot.
            If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals.
            If "scatter" is given, we get a scatter plot of estimated values.

        hue: {"estimator", "policy"}, default="estimator"
            Hue of the plot.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        sharey: bool, default=False
            If `True`, the y-axis will be shared among different estimators or evaluation policies.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="estimated_conditional_value_at_risk_multiple.png"
            Name of the bar figure.

        """
        self.cumulative_distribution_ope.visualize_lower_quartile_with_multiple_estimates(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            alpha=alpha,
            plot_type=plot_type,
            hue=hue,
            legend=legend,
            sharey=sharey,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

    def _obtain_topk_policy_performance(
        self,
        true_dict: Dict,
        estimation_dict: Dict,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        true_dict_ranking_arg: str,
        true_dict_value_arg: str,
        estimation_dict_ranking_arg: str,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        max_topk: Optional[int] = None,
        ope_alpha: Optional[float] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        return_by_dataframe: bool = False,
    ):
        """Calculate top-k policy deployment performances.

        Parameters
        -------
        true_dict: dict
            Dictionary containing the true policy performance.

        estimation_dict: dict
            Dictionary containing the estimated policy performance.

        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        true_dict_ranking_arg: str
            Name of the key indicating the ranked list of the candidate policies in true_dict.

        true_dict_value_arg: str
            Name of the key indicating the true policy performance of the candidate policies in true_dict.

        estimation_dict_ranking_arg: str
            Name of the key indicating the estimated ranking of the candidate policies in estimation_dict.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            If `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        max_topk: int, default=None
            Maximum number of policies to be deployed.

        ope_alpha: float, default=None
            Significance level. The value should be within `[0, 1)`.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violation rate.

        safety_threshold: float, default=None (>= 0)
            The conditional value at risk required to be considered a safe policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that when returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Worst policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if return_safety_violation_rate:
            metrics = ["k-th", "best", "worst", "mean", "std", "safety_violation_rate"]
        else:
            metrics = ["k-th", "best", "worst", "mean", "std"]

        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                ranking_dict = defaultdict(list)
                for behavior_policy, n_datasets in input_dict.n_datasets.items():
                    for dataset_id_ in range(n_datasets):
                        true_dict[behavior_policy][dataset_id_] = dict(
                            zip(
                                true_dict[behavior_policy][dataset_id_][
                                    true_dict_ranking_arg
                                ],
                                true_dict[behavior_policy][dataset_id_][
                                    true_dict_value_arg
                                ],
                            )
                        )

                        tmp_ranking_dict = dict()
                        for i, estimator in enumerate(compared_estimators):
                            policy_performance = np.zeros(
                                input_dict.n_eval_policies[behavior_policy][dataset_id_]
                            )
                            estimated_ranking = estimation_dict[behavior_policy][
                                dataset_id_
                            ][estimator]["estimated_ranking"]

                            for i, eval_policy in enumerate(estimated_ranking):
                                policy_performance[i] = true_dict[behavior_policy][
                                    dataset_id_
                                ][eval_policy]

                            tmp_ranking_dict[estimator] = policy_performance

                        ranking_dict[behavior_policy].append(tmp_ranking_dict)

            elif behavior_policy_name is None and dataset_id is not None:
                ranking_dict = {}
                for behavior_policy in input_dict.behavior_policy_names:
                    true_dict[behavior_policy] = dict(
                        zip(
                            true_dict[behavior_policy][true_dict_ranking_arg],
                            true_dict[behavior_policy][true_dict_value_arg],
                        )
                    )

                    tmp_ranking_dict = dict()
                    for i, estimator in enumerate(compared_estimators):
                        policy_performance = np.zeros(
                            input_dict.n_eval_policies[behavior_policy][dataset_id]
                        )
                        estimated_ranking = estimation_dict[behavior_policy][estimator][
                            estimation_dict_ranking_arg
                        ]

                        for i, eval_policy in enumerate(estimated_ranking):
                            policy_performance[i] = true_dict[behavior_policy][
                                eval_policy
                            ]

                        tmp_ranking_dict[estimator] = policy_performance

                    ranking_dict[behavior_policy] = tmp_ranking_dict

            elif behavior_policy_name is not None and dataset_id is None:
                ranking_dict = []
                for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]):
                    true_dict[dataset_id_] = dict(
                        zip(
                            true_dict[dataset_id_][true_dict_ranking_arg],
                            true_dict[dataset_id_][true_dict_value_arg],
                        )
                    )

                    tmp_ranking_dict = dict()
                    for i, estimator in enumerate(compared_estimators):
                        policy_performance = np.zeros(
                            input_dict.n_eval_policies[behavior_policy_name][
                                dataset_id_
                            ]
                        )
                        estimated_ranking = estimation_dict[dataset_id_][estimator][
                            estimation_dict_ranking_arg
                        ]

                        for i, eval_policy in enumerate(estimated_ranking):
                            policy_performance[i] = true_dict[dataset_id_][eval_policy]

                        tmp_ranking_dict[estimator] = policy_performance

                    ranking_dict.append(tmp_ranking_dict)

            else:
                true_dict = dict(
                    zip(
                        true_dict[true_dict_ranking_arg],
                        true_dict[true_dict_value_arg],
                    )
                )

                ranking_dict = dict()
                for i, estimator in enumerate(compared_estimators):
                    policy_performance = np.zeros(
                        input_dict.n_eval_policies[behavior_policy_name][dataset_id]
                    )
                    estimated_ranking = estimation_dict[estimator][
                        estimation_dict_ranking_arg
                    ]

                    for i, eval_policy in enumerate(estimated_ranking):
                        policy_performance[i] = true_dict[eval_policy]

                    ranking_dict[estimator] = policy_performance

        else:
            true_dict = dict(
                zip(
                    true_dict[true_dict_ranking_arg],
                    true_dict[true_dict_value_arg],
                )
            )

            ranking_dict = dict()
            for i, estimator in enumerate(compared_estimators):
                policy_performance = np.zeros((len(input_dict),))
                estimated_ranking = estimation_dict[estimator][
                    estimation_dict_ranking_arg
                ]

                for i, eval_policy in enumerate(estimated_ranking):
                    policy_performance[i] = true_dict[eval_policy]

                ranking_dict[estimator] = policy_performance

        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma

        behavior_policy_cum_reward = {}
        behavior_policy_value = {}
        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None:
                for behavior_policy in input_dict.behavior_policy_names:
                    behavior_policy_reward = self.behavior_policy_reward[
                        behavior_policy
                    ]
                    behavior_policy_cum_reward[behavior_policy] = (
                        discount[np.newaxis, :] * behavior_policy_reward
                    ).sum(
                        axis=1
                    ) + 1e-10  # to avoid zero division
                    behavior_policy_value[behavior_policy] = (
                        discount[np.newaxis, :] * behavior_policy_reward
                    ).sum(
                        axis=1
                    ).mean() + 1e-10  # to avoid zero division
            else:
                behavior_policy_reward = self.behavior_policy_reward[
                    behavior_policy_name
                ]
                behavior_policy_cum_reward[behavior_policy_name] = (
                    discount[np.newaxis, :] * behavior_policy_reward
                ).sum(
                    axis=1
                ) + 1e-10  # to avoid zero division
                behavior_policy_value[behavior_policy_name] = (
                    discount[np.newaxis, :] * behavior_policy_reward
                ).sum(
                    axis=1
                ).mean() + 1e-10  # to avoid zero division
        else:
            behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"]
            behavior_policy_reward = self.behavior_policy_reward[behavior_policy]
            behavior_policy_cum_reward[behavior_policy] = (
                discount[np.newaxis, :] * behavior_policy_reward
            ).sum(
                axis=1
            ) + 1e-10  # to avoid zero division
            behavior_policy_value[behavior_policy] = (
                discount[np.newaxis, :] * behavior_policy_reward
            ).sum(
                axis=1
            ).mean() + 1e-10  # to avoid zero division

        metric_dict = defaultdict(dict)
        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                n_datasets = input_dict.n_datasets
                total_n_datasets = np.array(list(n_datasets.values())).sum()
                baseline = np.zeros(total_n_datasets)

                for i, estimator in enumerate(compared_estimators):
                    for j, metric in enumerate(metrics):
                        topk_metric = np.zeros((max_topk, total_n_datasets))

                        for topk in range(max_topk):
                            l = 0
                            for behavior_policy in input_dict.behavior_policy_names:
                                for dataset_id_ in range(n_datasets[behavior_policy]):
                                    if i == 0:
                                        if true_dict_value_arg == "policy_value":
                                            baseline[l] = behavior_policy_value[
                                                behavior_policy
                                            ]
                                        elif (
                                            true_dict_value_arg
                                            == "conditional_value_at_risk"
                                        ):
                                            baseline_reward = (
                                                behavior_policy_cum_reward[
                                                    behavior_policy
                                                ]
                                            )
                                            baseline[l] = np.sort(baseline_reward)[
                                                : int(len(baseline_reward) * ope_alpha)
                                            ].mean()
                                        elif true_dict_value_arg == "lower_quartile":
                                            baseline_reward = (
                                                behavior_policy_cum_reward[
                                                    behavior_policy
                                                ]
                                            )
                                            baseline[l] = np.quantile(
                                                baseline_reward,
                                                q=ope_alpha,
                                            )

                                    topk_values = ranking_dict[behavior_policy][
                                        dataset_id_
                                    ][estimator][: topk + 1]

                                    if metric == "k-th":
                                        topk_metric[topk, l] = topk_values[-1]
                                    elif metric == "best":
                                        topk_metric[topk, l] = topk_values.max()
                                    elif metric == "worst":
                                        topk_metric[topk, l] = topk_values.min()
                                    elif metric == "mean":
                                        topk_metric[topk, l] = topk_values.mean()
                                    elif metric == "std":
                                        topk_metric[topk, l] = topk_values.std(ddof=1)
                                    else:
                                        topk_metric[topk, l] = (
                                            topk_values < safety_threshold
                                        ).sum() / (topk + 1)

                                    l += 1

                        metric_dict[estimator][metric] = topk_metric

                    if i == 0:
                        baseline = np.tile(baseline, (max_topk, 1))

                    sharpe_ratio = (
                        np.clip(metric_dict[estimator]["best"] - baseline, 0, None)
                        / metric_dict[estimator]["std"]
                    )

                    if clip_sharpe_ratio:
                        sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                        sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                    metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio

            elif behavior_policy_name is None and dataset_id is not None:
                total_n_datasets = len(input_dict.behavior_policy_names)
                baseline = np.zeros(total_n_datasets)

                for i, estimator in enumerate(compared_estimators):
                    for j, metric in enumerate(metrics):
                        topk_metric = np.zeros((max_topk, total_n_datasets))

                        for topk in range(max_topk):
                            for l, behavior_policy in enumerate(
                                input_dict.behavior_policy_names
                            ):
                                if i == 0:
                                    if true_dict_value_arg == "policy_value":
                                        baseline[l] = behavior_policy_value[
                                            behavior_policy
                                        ]
                                    elif (
                                        true_dict_value_arg
                                        == "conditional_value_at_risk"
                                    ):
                                        baseline_reward = behavior_policy_cum_reward[
                                            behavior_policy
                                        ]
                                        baseline[l] = np.sort(baseline_reward)[
                                            : int(len(baseline_reward) * ope_alpha)
                                        ].mean()
                                    elif true_dict_value_arg == "lower_quartile":
                                        baseline_reward = behavior_policy_cum_reward[
                                            behavior_policy
                                        ]
                                        baseline[l] = np.quantile(
                                            baseline_reward,
                                            q=ope_alpha,
                                        )

                                topk_values = ranking_dict[behavior_policy][estimator][
                                    : topk + 1
                                ]

                                if metric == "k-th":
                                    topk_metric[topk, l] = topk_values[-1]
                                elif metric == "best":
                                    topk_metric[topk, l] = topk_values.max()
                                elif metric == "worst":
                                    topk_metric[topk, l] = topk_values.min()
                                elif metric == "mean":
                                    topk_metric[topk, l] = topk_values.mean()
                                elif metric == "std":
                                    topk_metric[topk, l] = topk_values.std(ddof=1)
                                else:
                                    topk_metric[topk, l] = (
                                        topk_values < safety_threshold
                                    ).sum() / (topk + 1)

                        metric_dict[estimator][metric] = topk_metric

                    if i == 0:
                        baseline = np.tile(baseline, (max_topk, 1))

                    sharpe_ratio = (
                        np.clip(metric_dict[estimator]["best"] - baseline, 0, None)
                        / metric_dict[estimator]["std"]
                    )

                    if clip_sharpe_ratio:
                        sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                        sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                    metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio

            elif behavior_policy_name is not None and dataset_id is None:
                total_n_datasets = input_dict.n_datasets[behavior_policy_name]
                if true_dict_value_arg == "policy_value":
                    baseline = behavior_policy_value[behavior_policy_name]
                elif true_dict_value_arg == "conditional_value_at_risk":
                    baseline_reward = behavior_policy_cum_reward[behavior_policy_name]
                    baseline = np.sort(baseline_reward)[
                        : int(len(baseline_reward) * ope_alpha)
                    ].mean()
                elif true_dict_value_arg == "lower_quartile":
                    baseline_reward = behavior_policy_cum_reward[behavior_policy_name]
                    baseline = np.quantile(
                        baseline_reward,
                        q=ope_alpha,
                    )

                for i, estimator in enumerate(compared_estimators):
                    for j, metric in enumerate(metrics):
                        topk_metric = np.zeros((max_topk, total_n_datasets))

                        for topk in range(max_topk):
                            for l in range(total_n_datasets):
                                topk_values = ranking_dict[l][estimator][: topk + 1]

                                if metric == "k-th":
                                    topk_metric[topk, l] = topk_values[-1]
                                elif metric == "best":
                                    topk_metric[topk, l] = topk_values.max()
                                elif metric == "worst":
                                    topk_metric[topk, l] = topk_values.min()
                                elif metric == "mean":
                                    topk_metric[topk, l] = topk_values.mean()
                                elif metric == "std":
                                    topk_metric[topk, l] = topk_values.std(ddof=1)
                                else:
                                    topk_metric[topk, l] = (
                                        topk_values < safety_threshold
                                    ).sum() / (topk + 1)

                        metric_dict[estimator][metric] = topk_metric

                    sharpe_ratio = (
                        np.clip(metric_dict[estimator]["best"] - baseline, 0, None)
                        / metric_dict[estimator]["std"]
                    )

                    if clip_sharpe_ratio:
                        sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                        sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                    metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio

            else:
                total_n_datasets = 1
                if true_dict_value_arg == "policy_value":
                    baseline = behavior_policy_value[behavior_policy_name]
                elif true_dict_value_arg == "conditional_value_at_risk":
                    baseline_reward = behavior_policy_cum_reward[behavior_policy_name]
                    baseline = np.sort(baseline_reward)[
                        : int(len(baseline_reward) * ope_alpha)
                    ].mean()
                elif true_dict_value_arg == "lower_quartile":
                    baseline_reward = behavior_policy_cum_reward[behavior_policy_name]
                    baseline = np.quantile(
                        baseline_reward,
                        q=ope_alpha,
                    )

                for i, estimator in enumerate(compared_estimators):
                    for j, metric in enumerate(metrics):
                        topk_metric = np.zeros((max_topk, total_n_datasets))

                        for topk in range(max_topk):
                            topk_values = ranking_dict[estimator][: topk + 1]

                            if metric == "k-th":
                                topk_metric[topk, 0] = topk_values[-1]
                            elif metric == "best":
                                topk_metric[topk, 0] = topk_values.max()
                            elif metric == "worst":
                                topk_metric[topk, 0] = topk_values.min()
                            elif metric == "mean":
                                topk_metric[topk, 0] = topk_values.mean()
                            elif metric == "std":
                                topk_metric[topk, 0] = topk_values.std(ddof=1)
                            else:
                                topk_metric[topk, 0] = (
                                    topk_values < safety_threshold
                                ).sum() / (topk + 1)

                        metric_dict[estimator][metric] = topk_metric

                    sharpe_ratio = (
                        np.clip(metric_dict[estimator]["best"] - baseline, 0, None)
                        / metric_dict[estimator]["std"]
                    )

                    if clip_sharpe_ratio:
                        sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                        sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                    metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio

        else:
            behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"]
            if true_dict_value_arg == "policy_value":
                baseline = behavior_policy_value[behavior_policy]
            elif true_dict_value_arg == "conditional_value_at_risk":
                baseline_reward = behavior_policy_cum_reward[behavior_policy]
                baseline = np.sort(baseline_reward)[
                    : int(len(baseline_reward) * ope_alpha)
                ].mean()
            elif true_dict_value_arg == "lower_quartile":
                baseline_reward = behavior_policy_cum_reward[behavior_policy]
                baseline = np.quantile(
                    baseline_reward,
                    q=ope_alpha,
                )

            for i, estimator in enumerate(compared_estimators):
                for j, metric in enumerate(metrics):
                    topk_metric = np.zeros((max_topk, 1))

                    for topk in range(max_topk):
                        topk_values = ranking_dict[estimator][: topk + 1]

                        if metric == "k-th":
                            topk_metric[topk, 0] = topk_values[-1]
                        elif metric == "best":
                            topk_metric[topk, 0] = topk_values.max()
                        elif metric == "worst":
                            topk_metric[topk, 0] = topk_values.min()
                        elif metric == "mean":
                            topk_metric[topk, 0] = topk_values.mean()
                        elif metric == "std":
                            topk_metric[topk, 0] = topk_values.std(ddof=1)
                        else:
                            topk_metric[topk, 0] = (
                                topk_values < safety_threshold
                            ).sum() / (topk + 1)

                    metric_dict[estimator][metric] = topk_metric

                sharpe_ratio = (
                    np.clip(metric_dict[estimator]["best"] - baseline, 0, None)
                    / metric_dict[estimator]["std"]
                )

                if clip_sharpe_ratio:
                    sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                    sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio

        metric_dict = defaultdict_to_dict(metric_dict)

        if return_by_dataframe:
            metrics.extend(["sharpe_ratio"])
            metric_df = []

            for i, estimator in enumerate(compared_estimators):
                metric_df_ = pd.DataFrame()
                metric_df_["topk"] = np.arange(max_topk)
                metric_df_["estimator"] = estimator
                metric_df_ = metric_df_[["estimator", "topk"]]

                for metric in metrics:
                    metric_df_[metric] = metric_dict[estimator][metric].mean(axis=1)

                metric_df.append(metric_df_)

            metric = pd.concat(metric_df, axis=0)

        else:
            metric = metric_dict

        return metric

[docs]    def obtain_topk_policy_value_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (policy value) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking",
            true_dict_value_arg="policy_value",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            return_by_dataframe=return_by_dataframe,
        )

[docs]    def obtain_topk_policy_value_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (policy value) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        estimation_dict = self.select_by_policy_value_via_cumulative_distribution_ope(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking",
            true_dict_value_arg="policy_value",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            return_by_dataframe=return_by_dataframe,
        )

[docs]    def obtain_topk_policy_value_selected_by_lower_bound(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        cis: List[str] = ["bootstrap"],
        ope_alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment (policy value) result selected by its estimated lower bound.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"]
            Estimation methods for confidence intervals.

        ope_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )
        if return_safety_violation_rate:
            metrics = ["k-th", "best", "worst", "mean", "std", "safety_violation_rate"]
        else:
            metrics = ["k-th", "best", "worst", "mean", "std"]

        policy_value_dict = self.select_by_policy_value_lower_bound(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            return_true_values=True,
            cis=cis,
            alpha=ope_alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )

        discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma

        behavior_policy_cum_reward = {}
        behavior_policy_value = {}
        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None:
                for behavior_policy in input_dict.behavior_policy_names:
                    behavior_policy_reward = self.behavior_policy_reward[
                        behavior_policy
                    ]
                    behavior_policy_cum_reward[behavior_policy] = (
                        discount[np.newaxis, :] * behavior_policy_reward
                    ).sum(
                        axis=1
                    ) + 1e-10  # to avoid zero division
                    behavior_policy_value[behavior_policy] = (
                        discount[np.newaxis, :] * behavior_policy_reward
                    ).sum(
                        axis=1
                    ).mean() + 1e-10  # to avoid zero division
            else:
                behavior_policy_reward = self.behavior_policy_reward[
                    behavior_policy_name
                ]
                behavior_policy_cum_reward[behavior_policy_name] = (
                    discount[np.newaxis, :] * behavior_policy_reward
                ).sum(
                    axis=1
                ) + 1e-10  # to avoid zero division
                behavior_policy_value[behavior_policy_name] = (
                    discount[np.newaxis, :] * behavior_policy_reward
                ).sum(
                    axis=1
                ).mean() + 1e-10  # to avoid zero division
        else:
            behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"]
            behavior_policy_reward = self.behavior_policy_reward[behavior_policy]
            behavior_policy_cum_reward[behavior_policy] = (
                discount[np.newaxis, :] * behavior_policy_reward
            ).sum(
                axis=1
            ) + 1e-10  # to avoid zero division
            behavior_policy_value[behavior_policy] = (
                discount[np.newaxis, :] * behavior_policy_reward
            ).sum(
                axis=1
            ).mean() + 1e-10  # to avoid zero division

        metric_dict = defaultdict(lambda: defaultdict(dict))

        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                n_datasets = input_dict.n_datasets
                total_n_datasets = np.array(list(n_datasets.values())).sum()
                baseline = np.zeros(total_n_datasets)

                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        for j, metric in enumerate(metrics):
                            topk_metric = np.zeros((max_topk, total_n_datasets))

                            for topk in range(max_topk):
                                l = 0
                                for behavior_policy in input_dict.behavior_policy_names:
                                    for dataset_id_ in range(
                                        n_datasets[behavior_policy]
                                    ):
                                        if i == 0 and ci == cis[0]:
                                            baseline[l] = behavior_policy_value[
                                                behavior_policy
                                            ]

                                        topk_values = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "true_policy_value"
                                        ][
                                            : topk + 1
                                        ]

                                        if metric == "k-th":
                                            topk_metric[topk, l] = topk_values[-1]
                                        elif metric == "best":
                                            topk_metric[topk, l] = topk_values.max()
                                        elif metric == "worst":
                                            topk_metric[topk, l] = topk_values.min()
                                        elif metric == "mean":
                                            topk_metric[topk, l] = topk_values.mean()
                                        elif metric == "std":
                                            topk_metric[topk, l] = topk_values.std(
                                                ddof=1
                                            )
                                        else:
                                            topk_metric[topk, l] = (
                                                topk_values < safety_threshold
                                            ).sum() / (topk + 1)

                                        l += 1

                            metric_dict[ci][estimator][metric] = topk_metric

                        if i == 0 and ci == cis[0]:
                            baseline = np.tile(baseline, (max_topk, 1))

                        sharpe_ratio = (
                            np.clip(
                                metric_dict[ci][estimator]["best"] - baseline, 0, None
                            )
                            / metric_dict[ci][estimator]["std"]
                        )

                        if clip_sharpe_ratio:
                            sharpe_ratio[1:] = np.nan_to_num(
                                sharpe_ratio[1:], posinf=1e2
                            )
                            sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                        metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio

            elif behavior_policy_name is None and dataset_id is not None:
                total_n_datasets = len(input_dict.behavior_policy_names)
                baseline = np.zeros(total_n_datasets)

                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        for j, metric in enumerate(metrics):
                            topk_metric = np.zeros((max_topk, total_n_datasets))

                            for topk in range(max_topk):
                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    if i == 0 and ci == cis[0]:
                                        baseline[l] = behavior_policy_value[
                                            behavior_policy
                                        ]

                                    topk_values = policy_value_dict[behavior_policy][
                                        ci
                                    ][estimator]["true_policy_value"][: topk + 1]

                                    if metric == "k-th":
                                        topk_metric[topk, l] = topk_values[-1]
                                    elif metric == "best":
                                        topk_metric[topk, l] = topk_values.max()
                                    elif metric == "worst":
                                        topk_metric[topk, l] = topk_values.min()
                                    elif metric == "mean":
                                        topk_metric[topk, l] = topk_values.mean()
                                    elif metric == "std":
                                        topk_metric[topk, l] = topk_values.std(ddof=1)
                                    else:
                                        topk_metric[topk, l] = (
                                            topk_values < safety_threshold
                                        ).sum() / (topk + 1)

                            metric_dict[ci][estimator][metric] = topk_metric

                        if i == 0 and ci == cis[0]:
                            baseline = np.tile(baseline, (max_topk, 1))

                        sharpe_ratio = (
                            np.clip(
                                metric_dict[ci][estimator]["best"] - baseline, 0, None
                            )
                            / metric_dict[ci][estimator]["std"]
                        )

                        if clip_sharpe_ratio:
                            sharpe_ratio[1:] = np.nan_to_num(
                                sharpe_ratio[1:], posinf=1e2
                            )
                            sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                        metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio

            elif behavior_policy_name is not None and dataset_id is None:
                total_n_datasets = input_dict.n_datasets[behavior_policy_name]
                baseline = behavior_policy_value[behavior_policy_name]

                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        for j, metric in enumerate(metrics):
                            topk_metric = np.zeros((max_topk, total_n_datasets))

                            for topk in range(max_topk):
                                for l in range(total_n_datasets):
                                    topk_values = policy_value_dict[l][ci][estimator][
                                        "true_policy_value"
                                    ][: topk + 1]

                                    if metric == "k-th":
                                        topk_metric[topk, l] = topk_values[-1]
                                    elif metric == "best":
                                        topk_metric[topk, l] = topk_values.max()
                                    elif metric == "worst":
                                        topk_metric[topk, l] = topk_values.min()
                                    elif metric == "mean":
                                        topk_metric[topk, l] = topk_values.mean()
                                    elif metric == "std":
                                        topk_metric[topk, l] = topk_values.std(ddof=1)
                                    else:
                                        topk_metric[topk, l] = (
                                            topk_values < safety_threshold
                                        ).sum() / (topk + 1)

                            metric_dict[ci][estimator][metric] = topk_metric

                        sharpe_ratio = (
                            np.clip(
                                metric_dict[ci][estimator]["best"] - baseline, 0, None
                            )
                            / metric_dict[ci][estimator]["std"]
                        )

                        if clip_sharpe_ratio:
                            sharpe_ratio[1:] = np.nan_to_num(
                                sharpe_ratio[1:], posinf=1e2
                            )
                            sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                        metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio

            else:
                total_n_datasets = 1
                baseline = behavior_policy_value[behavior_policy_name]

                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        for j, metric in enumerate(metrics):
                            topk_metric = np.zeros((max_topk, total_n_datasets))

                            for topk in range(max_topk):
                                topk_values = policy_value_dict[ci][estimator][
                                    "true_policy_value"
                                ][: topk + 1]

                                if metric == "k-th":
                                    topk_metric[topk, 0] = topk_values[-1]
                                elif metric == "best":
                                    topk_metric[topk, 0] = topk_values.max()
                                elif metric == "worst":
                                    topk_metric[topk, 0] = topk_values.min()
                                elif metric == "mean":
                                    topk_metric[topk, 0] = topk_values.mean()
                                elif metric == "std":
                                    topk_metric[topk, 0] = topk_values.std(ddof=1)
                                else:
                                    topk_metric[topk, 0] = (
                                        topk_values < safety_threshold
                                    ).sum() / (topk + 1)

                            metric_dict[ci][estimator][metric] = topk_metric

                        sharpe_ratio = (
                            np.clip(
                                metric_dict[ci][estimator]["best"] - baseline, 0, None
                            )
                            / metric_dict[ci][estimator]["std"]
                        )

                        if clip_sharpe_ratio:
                            sharpe_ratio[1:] = np.nan_to_num(
                                sharpe_ratio[1:], posinf=1e2
                            )
                            sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                        metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio

        else:
            behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"]
            baseline = behavior_policy_value[behavior_policy]

            for ci in cis:
                for i, estimator in enumerate(compared_estimators):
                    for j, metric in enumerate(metrics):
                        topk_metric = np.zeros((max_topk, 1))

                        for topk in range(max_topk):
                            topk_values = policy_value_dict[ci][estimator][
                                "true_policy_value"
                            ][: topk + 1]

                            if metric == "k-th":
                                topk_metric[topk, 0] = topk_values[-1]
                            elif metric == "best":
                                topk_metric[topk, 0] = topk_values.max()
                            elif metric == "worst":
                                topk_metric[topk, 0] = topk_values.min()
                            elif metric == "mean":
                                topk_metric[topk, 0] = topk_values.mean()
                            elif metric == "std":
                                topk_metric[topk, 0] = topk_values.std(ddof=1)
                            else:
                                topk_metric[topk, 0] = (
                                    topk_values < safety_threshold
                                ).sum() / (topk + 1)

                        metric_dict[ci][estimator][metric] = topk_metric

                    sharpe_ratio = (
                        np.clip(metric_dict[ci][estimator]["best"] - baseline, 0, None)
                        / metric_dict[ci][estimator]["std"]
                    )

                    if clip_sharpe_ratio:
                        sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2)
                        sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2)

                    metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio

        metric_dict = defaultdict_to_dict(metric_dict)

        if return_by_dataframe:
            metrics.extend(["sharpe_ratio"])
            metric_df = []

            for ci in cis:
                for estimator in compared_estimators:
                    metric_df_ = pd.DataFrame()
                    metric_df_["topk"] = np.arange(max_topk)
                    metric_df_["estimator"] = estimator
                    metric_df_["ci"] = ci
                    metric_df_ = metric_df_[["ci", "estimator", "topk"]]

                    for metric in metrics:
                        metric_df_[metric] = metric_dict[ci][estimator][metric].mean(
                            axis=1
                        )

                    metric_df.append(metric_df_)

            metric = pd.concat(metric_df, axis=0)

        else:
            metric = metric_dict

        return metric

[docs]    def obtain_topk_conditional_value_at_risk_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (CVaR) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        ope_alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to CVaR here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_conditional_value_at_risk=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            cvar_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_conditional_value_at_risk",
            true_dict_value_arg="conditional_value_at_risk",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
            return_by_dataframe=return_by_dataframe,
        )

[docs]    def obtain_topk_conditional_value_at_risk_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (CVaR) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        ope_alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to CVaR here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_conditional_value_at_risk=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            cvar_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_conditional_value_at_risk(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=ope_alpha,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_conditional_value_at_risk",
            true_dict_value_arg="conditional_value_at_risk",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
            return_by_dataframe=return_by_dataframe,
        )

[docs]    def obtain_topk_lower_quartile_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (lower quartile) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to the lower quartile here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Wosrt policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_lower_quartile=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            quartile_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_lower_quartile",
            true_dict_value_arg="lower_quartile",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
            return_by_dataframe=return_by_dataframe,
        )

[docs]    def obtain_topk_lower_quartile_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        max_topk: Optional[int] = None,
        return_safety_violation_rate: bool = False,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        return_by_dataframe: bool = False,
    ):
        """Obtain the topk deployment result (lower quartile) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        ope_alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        return_safety_violation_rate: bool, default=False.
            Whether to calculate and return the safety violation rate.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        return_by_dataframe: bool, default=False
            Whether to return the result in a dataframe format.

        Return
        -------
        topk_metric_dict/topk_metric_df: dict or dataframe
            Dictionary/dataframe containing the following top-k risk return tradeoff metrics.
            Note that policy performance refers to the lower quartile here. When returning dataframe, the average value will be returned.

            .. code-block:: python

                key: [estimator][
                    k-th,
                    best,  # return
                    worst,  # risk
                    mean,   # risk
                    std,    # risk
                    safety_violation_rate,  # risk
                    sharpe_ratio,  # risk-return tradeoff
                ]

            k-th: ndarray of shape (max_topk, total_n_datasets)
                Policy performance of the k-th deployment policy.

            best: ndarray of shape (max_topk, total_n_datasets)
                Best policy performance among the top-k deployment policies.

            worst: ndarray of shape (max_topk, total_n_datasets)
                Worst policy performance among the top-k deployment policies.

            mean: ndarray of shape (max_topk, total_n_datasets)
                Mean policy performance of the top-k deployment policies.

            std: ndarray of shape (max_topk, total_n_datasets)
                Standard deviation of the policy performance among the top-k deployment policies.

            safety_violation_rate: ndarray of shape (max_topk, total_n_datasets)
                Safety violation rate regarding the policy performance of the top-k deployment policies.

            sharpe_ratio: ndarray of shape (max_topk, total_n_datasets)
                Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_lower_quartile=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            quartile_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_lower_quartile(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=ope_alpha,
        )

        return self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_lower_quartile",
            true_dict_value_arg="lower_quartile",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=return_safety_violation_rate,
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
            return_by_dataframe=return_by_dataframe,
        )

    def _obtain_min_max_val_for_topk_visualization(
        self,
        true_dict: Dict,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
    ):
        """Obtain minimum and maximum policy performance for topk visualization.

        Parameters
        -------
        true_dict: dict
            Dictionary containing the true deployment result.
            The innermost level is a dictionary whose values are the policy performances.
            When :class:`MultipleInputDict` is given, the nesting read by this function is as follows.

            .. code-block:: python

                behavior_policy_name is None and dataset_id is None:
                    true_dict[behavior_policy][dataset_id]

                behavior_policy_name is None and dataset_id is given:
                    true_dict[behavior_policy]

                behavior_policy_name is given and dataset_id is None:
                    true_dict[dataset_id]

                otherwise:
                    true_dict

        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            Only its type and, for :class:`MultipleInputDict`, its `n_datasets` and
            `behavior_policy_names` attributes are read here.

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        Return
        -------
        min_val: float
            Minimum policy performance found in :class:`true_dict`.
            When :class:`MultipleInputDict` is given, the per-dataset minimums are averaged.

        max_val: float
            Maximum policy performance found in :class:`true_dict`.
            When :class:`MultipleInputDict` is given, the per-dataset maximums are averaged.

        """

        def value_range(value_dict):
            # build the array once and read both extremes from it
            values = np.array(list(value_dict.values()))
            return values.min(), values.max()

        if not isinstance(input_dict, MultipleInputDict):
            return value_range(true_dict)

        no_policy = behavior_policy_name is None
        no_dataset = dataset_id is None

        # number of (behavior policy, dataset) results that will be averaged
        if no_policy and no_dataset:
            n_results = np.array(list(input_dict.n_datasets.values())).sum()
        elif no_policy:
            n_results = len(input_dict.behavior_policy_names)
        elif no_dataset:
            n_results = input_dict.n_datasets[behavior_policy_name]
        else:
            n_results = 1

        lowest = np.zeros(n_results)
        highest = np.zeros(n_results)

        # lazily walk the innermost dictionaries in the same order as the result slots
        if no_policy and no_dataset:
            per_result = (
                true_dict[policy_name][idx]
                for policy_name, n_datasets in input_dict.n_datasets.items()
                for idx in range(n_datasets)
            )
        elif no_policy:
            per_result = (
                true_dict[policy_name]
                for policy_name in input_dict.behavior_policy_names
            )
        elif no_dataset:
            per_result = (true_dict[idx] for idx in range(n_results))
        else:
            per_result = (true_dict,)

        for slot, value_dict in enumerate(per_result):
            lowest[slot], highest[slot] = value_range(value_dict)

        return lowest.mean(), highest.mean()

    def _visualize_topk_policy_performance(
        self,
        metric_dict: Dict,
        min_val: float,
        max_val: float,
        compared_estimators: Optional[List[str]] = None,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        visualize_ci: bool = False,
        ci: str = "bootstrap",
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        ylabel: str = "policy performance",
        ymax_sharpe_ratio: Optional[float] = None,
        fig_dir: Optional[Path] = None,
        fig_name: Optional[str] = None,
    ):
        """Visualize top-k policy deployment performances.

        One subplot is drawn per entry of :class:`metrics`, with one line per estimator.

        Parameters
        -------
        metric_dict: dict
            Dictionary containing the top-k risk return tradeoff metrics.
            Accessed as `metric_dict[estimator][metric]`, which is averaged over `axis=1`
            and indexed by the top-k position along the first axis.

        min_val: float
            Minimum value in the plot.

        max_val: float
            Maximum value in the plot.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            This function iterates over the list directly, so the caller must pass a list.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            This function uses the value in arithmetic, so the caller must pass an integer.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.
            When given, the threshold and the `min_val`/`max_val` reference lines are drawn
            on the "k-th", "best", "worst", and "mean" plots.

        visualize_ci: bool, default=False
            Whether to visualize ci.

        ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        ylabel: str, default="policy performance"
            Label of the y-axis.

        ymax_sharpe_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        fig_dir: Path, default=None
            Path to store the figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default=None
            Name of the figure. Required when :class:`fig_dir` is given.

        """
        yaxis_min_val = (
            min_val if safety_threshold is None else min(min_val, safety_threshold)
        )
        yaxis_max_val = (
            max_val if safety_threshold is None else max(max_val, safety_threshold)
        )
        margin = (yaxis_max_val - yaxis_min_val) * 0.05

        plt.style.use("ggplot")
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        n_colors = len(color)
        # cycle markers like colors so that many estimators do not raise IndexError
        n_markers = len(markers)

        n_figs = len(metrics)
        topk_range = np.arange(1, max_topk + 1)

        # squeeze=False always returns a 2-D array of axes, so a single metric and
        # multiple metrics share one code path (the former single-metric branch
        # referenced an undefined `metric` variable).
        fig, axes = plt.subplots(
            nrows=1,
            ncols=n_figs,
            figsize=(6 * n_figs, 4),
            squeeze=False,
        )

        for ax, metric in zip(axes[0], metrics):
            for i, estimator in enumerate(compared_estimators):
                performance = metric_dict[estimator][metric]

                ax.plot(
                    topk_range,
                    performance.mean(axis=1),
                    color=color[i % n_colors],
                    marker=markers[i % n_markers],
                    label=estimator,
                )

                if visualize_ci:
                    # keys follow the naming used by the confidence interval estimators
                    lower_key = f"{100 * (1.0 - alpha)}% CI (lower)"
                    upper_key = f"{100 * (1.0 - alpha)}% CI (upper)"

                    lower = np.zeros(max_topk)
                    upper = np.zeros(max_topk)

                    for topk in range(max_topk):
                        ci_ = self._estimate_confidence_interval[ci](
                            performance[topk],
                            alpha=alpha,
                            n_bootstrap_samples=n_bootstrap_samples,
                            random_state=random_state,
                        )
                        lower[topk] = ci_[lower_key]
                        upper[topk] = ci_[upper_key]

                    ax.fill_between(
                        topk_range,
                        lower,
                        upper,
                        color=color[i % n_colors],
                        alpha=0.3,
                    )

            if metric in ["k-th", "best", "worst", "mean"]:
                if safety_threshold is not None:
                    ax.plot(
                        topk_range,
                        np.full(max_topk, safety_threshold),
                        color=dkred,
                        label="safety threshold",
                    )
                    # thin reference lines at the true maximum and minimum
                    ax.plot(
                        topk_range,
                        np.full(max_topk, max_val),
                        color="black",
                        linewidth=0.5,
                    )
                    ax.plot(
                        topk_range,
                        np.full(max_topk, min_val),
                        color="black",
                        linewidth=0.5,
                    )

                ax.set_title(f"{metric}")
                ax.set_ylabel(f"{metric} {ylabel}")
                ax.set_ylim(yaxis_min_val - margin, yaxis_max_val + margin)

            elif metric == "std":
                ax.set_title("std")
                ax.set_ylabel("standard deviation")

            elif metric == "sharpe_ratio":
                # zero baseline, drawn from k=2
                ax.plot(
                    np.arange(2, max_topk + 1),
                    np.zeros(max_topk - 1),
                    color="black",
                    linewidth=0.5,
                )

                ax.set_title("sharpe ratio")
                ax.set_ylabel("sharpe ratio")
                ax.set_ylim(0.0, ymax_sharpe_ratio)

            else:
                ax.set_title("safety violation")
                ax.set_ylabel("safety violation rate")
                ax.set_ylim(-0.05, 1.05)

            ax.set_xlabel("# of policies deployed")

            if legend:
                ax.legend(loc="upper right")

        fig.subplots_adjust(hspace=0.35, wspace=0.2)
        plt.show()

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")

[docs]    def visualize_topk_policy_value_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_policy_value_standard_ope.png",
    ):
        """Visualize the topk deployment result (policy value) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.
            Only applicable when using a single behavior policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_policy_value_standard_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking",
            true_dict_value_arg="policy_value",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking"],
                    true_dict["policy_value"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            legend=legend,
            ylabel="policy value",
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_topk_policy_value_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_policy_value_cumulative_distribution_ope.png",
    ):
        """Visualize the topk deployment result (policy value) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_policy_value_cumulative_distribution_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        estimation_dict = self.select_by_policy_value_via_cumulative_distribution_ope(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking",
            true_dict_value_arg="policy_value",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking"],
                    true_dict["policy_value"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            legend=legend,
            ylabel="policy value",
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_topk_policy_value_selected_by_lower_bound(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        relative_safety_criteria: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        ope_cis: List[str] = ["bootstrap"],
        ope_alpha: float = 0.05,
        ope_n_bootstrap_samples: int = 100,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_policy_value_standard_ope_lower_bound.png",
    ):
        """Visualize the topk deployment result (policy value) selected by its estimated lower bound.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=None.
            A policy whose policy value is below the given threshold is to be considered unsafe.

        relative_safety_criteria: float, default=None
            The relative policy value required to be considered a safe policy.
            For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        ope_cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"]
            Estimation methods for confidence intervals.

        ope_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        ope_n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_policy_value_standard_ope_lower_bound.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        metric_dict = self.obtain_topk_policy_value_selected_by_lower_bound(
            input_dict=input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            relative_safety_criteria=relative_safety_criteria,
            clip_sharpe_ratio=clip_sharpe_ratio,
            cis=ope_cis,
            ope_alpha=ope_alpha,
            n_bootstrap_samples=ope_n_bootstrap_samples,
            random_state=random_state,
        )

        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                for behavior_policy, n_datasets in input_dict.n_datasets.items():
                    for dataset_id_ in range(n_datasets):
                        true_dict[behavior_policy][dataset_id_] = dict(
                            zip(
                                true_dict[behavior_policy][dataset_id_]["policy_value"],
                                true_dict[behavior_policy][dataset_id_]["policy_value"],
                            )
                        )
            elif behavior_policy_name is None and dataset_id is not None:
                for behavior_policy in input_dict.behavior_policy_names:
                    true_dict[behavior_policy] = dict(
                        zip(
                            true_dict[behavior_policy]["policy_value"],
                            true_dict[behavior_policy]["policy_value"],
                        )
                    )
            elif behavior_policy_name is not None and dataset_id is None:
                for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]):
                    true_dict[dataset_id_] = dict(
                        zip(
                            true_dict[dataset_id_]["policy_value"],
                            true_dict[dataset_id_]["policy_value"],
                        )
                    )
            else:
                true_dict = dict(
                    zip(
                        true_dict["policy_value"],
                        true_dict["policy_value"],
                    )
                )
        else:
            true_dict = dict(
                zip(
                    true_dict["policy_value"],
                    true_dict["policy_value"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        yaxis_min_val = (
            min_val if safety_threshold is None else min(min_val, safety_threshold)
        )
        yaxis_max_val = (
            max_val if safety_threshold is None else max(max_val, safety_threshold)
        )
        margin = (yaxis_max_val - yaxis_min_val) * 0.05

        plt.style.use("ggplot")
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        n_colors = len(color)

        n_rows = len(ope_cis)
        n_cols = len(metrics)

        fig, axes = plt.subplots(
            nrows=n_rows,
            ncols=n_cols,
            figsize=(6 * n_cols, 4 * n_rows),
        )

        if n_rows == 1:
            ope_ci = ope_cis[0]

            if len(metrics) == 1:
                for i, estimator in enumerate(compared_estimators):
                    axes.plot(
                        np.arange(1, max_topk + 1),
                        metric_dict[ope_ci][estimator][metric].mean(axis=1),
                        color=color[i % n_colors],
                        marker=markers[i],
                        label=estimator,
                    )

                    if visualize_ci:
                        lower = np.zeros(max_topk)
                        upper = np.zeros(max_topk)

                        for topk in range(max_topk):
                            ci = self._estimate_confidence_interval[plot_ci](
                                metric_dict[ope_ci][estimator][metric][topk],
                                alpha=plot_alpha,
                                n_bootstrap_samples=plot_n_bootstrap_samples,
                                random_state=random_state,
                            )
                            lower[topk] = ci[f"{100 * (1. - plot_alpha)}% CI (lower)"]
                            upper[topk] = ci[f"{100 * (1. - plot_alpha)}% CI (upper)"]

                        axes.fill_between(
                            np.arange(1, max_topk + 1),
                            lower,
                            upper,
                            color=color[i % n_colors],
                            alpha=0.3,
                        )

                if metric in ["k-th", "best", "worst", "mean"]:
                    if safety_threshold is not None:
                        axes.plot(
                            np.arange(1, max_topk + 1),
                            np.full(max_topk, safety_threshold),
                            color=dkred,
                            label="safety threshold",
                        )
                        axes.plot(
                            np.arange(1, max_topk + 1),
                            np.full(max_topk, max_val),
                            color="black",
                            linewidth=0.5,
                        )
                        axes.plot(
                            np.arange(1, max_topk + 1),
                            np.full(max_topk, min_val),
                            color="black",
                            linewidth=0.5,
                        )

                    axes.set_title(f"{metric}")
                    axes.set_ylabel(f"{metric} policy value")
                    axes.set_ylim(yaxis_min_val - margin, yaxis_max_val + margin)

                elif metric == "std":
                    axes.set_title("std")
                    axes.set_ylabel("standard deviation")

                elif metric == "sharpe_ratio":
                    axes.plot(
                        np.arange(2, max_topk + 1),
                        np.zeros(max_topk - 1),
                        color="black",
                        linewidth=0.5,
                    )

                    axes.set_title("sharpe ratio")
                    axes.set_ylabel("sharpe ratio")
                    axes.set_ylim(0.0, ymax_sharpe_ratio)

                else:
                    axes.set_title("safety violation")
                    axes.set_ylabel("safety violation rate")
                    axes.set_ylim(-0.05, 1.05)

                axes.set_xlabel("# of policies deployed")

                if legend:
                    axes.legend(loc="upper right")

            if legend:
                handles, labels = axes.get_legend_handles_labels()
                # n_cols shows err
                # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6))

            else:
                for j, metric in enumerate(metrics):
                    for i, estimator in enumerate(compared_estimators):
                        axes[j].plot(
                            np.arange(1, max_topk + 1),
                            metric_dict[ope_ci][estimator][metric].mean(axis=1),
                            color=color[i % n_colors],
                            marker=markers[i],
                            label=estimator,
                        )

                        if visualize_ci:
                            lower = np.zeros(max_topk)
                            upper = np.zeros(max_topk)

                            for topk in range(max_topk):
                                ci = self._estimate_confidence_interval[plot_ci](
                                    metric_dict[ope_ci][estimator][metric][topk],
                                    alpha=plot_alpha,
                                    n_bootstrap_samples=plot_n_bootstrap_samples,
                                    random_state=random_state,
                                )
                                lower[topk] = ci[
                                    f"{100 * (1. - plot_alpha)}% CI (lower)"
                                ]
                                upper[topk] = ci[
                                    f"{100 * (1. - plot_alpha)}% CI (upper)"
                                ]

                            axes[j].fill_between(
                                np.arange(1, max_topk + 1),
                                lower,
                                upper,
                                color=color[i % n_colors],
                                alpha=0.3,
                            )

                    if metric in ["k-th", "best", "worst", "mean"]:
                        if safety_threshold is not None:
                            axes[j].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, safety_threshold),
                                color=dkred,
                                label="safety threshold",
                            )
                            axes[j].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, max_val),
                                color="black",
                                linewidth=0.5,
                            )
                            axes[j].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, min_val),
                                color="black",
                                linewidth=0.5,
                            )

                        axes[j].set_title(f"{metric}")
                        axes[j].set_ylabel(f"{metric} policy value")
                        axes[j].set_ylim(yaxis_min_val - margin, yaxis_max_val + margin)

                    elif metric == "std":
                        axes[j].set_title("std")
                        axes[j].set_ylabel("standard deviation")

                    elif metric == "sharpe_ratio":
                        axes.plot(
                            np.arange(2, max_topk + 1),
                            np.zeros(max_topk - 1),
                            color="black",
                            linewidth=0.5,
                        )

                        axes[j].set_title("sharpe ratio")
                        axes[j].set_ylabel("sharpe ratio")
                        axes[j].set_ylim(0.0, ymax_sharpe_ratio)

                    else:
                        axes[j].set_title("safety violation")
                        axes[j].set_ylabel("safety violation rate")
                        axes[j].set_ylim(-0.05, 1.05)

                    axes[j].set_xlabel("# of policies deployed")

                    if legend:
                        axes[j].legend(loc="upper right")

                if legend:
                    handles, labels = axes[0].get_legend_handles_labels()
                    # n_cols shows err
                    # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6))

        else:
            if len(metrics) == 1:
                for l, ope_ci in enumerate(ope_cis):
                    for i, estimator in enumerate(compared_estimators):
                        axes[l].plot(
                            np.arange(1, max_topk + 1),
                            metric_dict[ope_ci][estimator][metric].mean(axis=1),
                            color=color[i % n_colors],
                            marker=markers[i],
                            label=estimator,
                        )

                        if visualize_ci:
                            lower = np.zeros(max_topk)
                            upper = np.zeros(max_topk)

                            for topk in range(max_topk):
                                ci = self._estimate_confidence_interval[plot_ci](
                                    metric_dict[ope_ci][estimator][metric][topk],
                                    alpha=plot_alpha,
                                    n_bootstrap_samples=plot_n_bootstrap_samples,
                                    random_state=random_state,
                                )
                                lower[topk] = ci[
                                    f"{100 * (1. - plot_alpha)}% CI (lower)"
                                ]
                                upper[topk] = ci[
                                    f"{100 * (1. - plot_alpha)}% CI (upper)"
                                ]

                            axes[l].fill_between(
                                np.arange(1, max_topk + 1),
                                lower,
                                upper,
                                color=color[i % n_colors],
                                alpha=0.3,
                            )

                    if metric in ["k-th", "best", "worst", "mean"]:
                        if safety_threshold is not None:
                            axes[l].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, safety_threshold),
                                color=dkred,
                                label="safety threshold",
                            )
                            axes[l].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, max_val),
                                color="black",
                                linewidth=0.5,
                            )
                            axes[l].plot(
                                np.arange(1, max_topk + 1),
                                np.full(max_topk, min_val),
                                color="black",
                                linewidth=0.5,
                            )

                        axes[l].set_title(f"{metric}")
                        axes[l].set_ylabel(f"{metric} policy value")
                        axes[l].set_ylim(yaxis_min_val - margin, yaxis_max_val + margin)

                    elif metric == "std":
                        axes[l].set_title("std")
                        axes[l].set_ylabel("standard deviation")

                    elif metric == "sharpe_ratio":
                        axes[l].plot(
                            np.arange(2, max_topk + 1),
                            np.zeros(max_topk - 1),
                            color="black",
                            linewidth=0.5,
                        )

                        axes[l].set_title("sharpe ratio")
                        axes[l].set_ylabel("sharpe ratio")
                        axes[l].set_ylim(0.0, ymax_sharpe_ratio)

                    else:
                        axes[l].set_title("safety violation")
                        axes[l].set_ylabel("safety violation rate")
                        axes[l].set_ylim(-0.05, 1.05)

                    axes[l].set_xlabel("# of policies deployed")

                    if legend:
                        axes[l].legend(loc="upper right")

                if legend:
                    handles, labels = axes[0].get_legend_handles_labels()
                    # n_cols shows err
                    # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6))

            else:
                for l, ope_ci in enumerate(ope_cis):
                    for j, metric in enumerate(metrics):
                        for i, estimator in enumerate(compared_estimators):
                            axes[l, j].plot(
                                np.arange(1, max_topk + 1),
                                metric_dict[ope_ci][estimator][metric].mean(axis=1),
                                color=color[i % n_colors],
                                marker=markers[i],
                                label=estimator,
                            )

                            if visualize_ci:
                                lower = np.zeros(max_topk)
                                upper = np.zeros(max_topk)

                                for topk in range(max_topk):
                                    ci = self._estimate_confidence_interval[plot_ci](
                                        metric_dict[ope_ci][estimator][metric][topk],
                                        alpha=plot_alpha,
                                        n_bootstrap_samples=plot_n_bootstrap_samples,
                                        random_state=random_state,
                                    )
                                    lower[topk] = ci[
                                        f"{100 * (1. - plot_alpha)}% CI (lower)"
                                    ]
                                    upper[topk] = ci[
                                        f"{100 * (1. - plot_alpha)}% CI (upper)"
                                    ]

                                axes[l, j].fill_between(
                                    np.arange(1, max_topk + 1),
                                    lower,
                                    upper,
                                    color=color[i % n_colors],
                                    alpha=0.3,
                                )

                        if metric in ["k-th", "best", "worst", "mean"]:
                            if safety_threshold is not None:
                                axes[l, j].plot(
                                    np.arange(1, max_topk + 1),
                                    np.full(max_topk, safety_threshold),
                                    color=dkred,
                                    label="safety threshold",
                                )
                                axes[l, j].plot(
                                    np.arange(1, max_topk + 1),
                                    np.full(max_topk, max_val),
                                    color="black",
                                    linewidth=0.5,
                                )
                                axes[l, j].plot(
                                    np.arange(1, max_topk + 1),
                                    np.full(max_topk, min_val),
                                    color="black",
                                    linewidth=0.5,
                                )

                            axes[l, j].set_title(f"{metric}")
                            axes[l, j].set_ylabel(f"{metric} policy value")
                            axes[l, j].set_ylim(
                                yaxis_min_val - margin, yaxis_max_val + margin
                            )

                        elif metric == "std":
                            axes[l, j].set_title("std")
                            axes[l, j].set_ylabel("standard deviation")

                        elif metric == "sharpe_ratio":
                            axes[l, j].plot(
                                np.arange(2, max_topk + 1),
                                np.zeros(max_topk - 1),
                                color="black",
                                linewidth=0.5,
                            )

                            axes[l, j].set_title("sharpe ratio")
                            axes[l, j].set_ylabel("sharpe ratio")
                            axes[l, j].set_ylim(0.0, ymax_sharpe_ratio)

                        else:
                            axes[l, j].set_title("safety violation")
                            axes[l, j].set_ylabel("safety violation rate")
                            axes[l, j].set_ylim(-0.05, 1.05)

                        axes[l, j].set_xlabel("# of policies deployed")

                        if legend:
                            axes[l, j].legend(loc="upper right")

                if legend:
                    handles, labels = axes[0, 0].get_legend_handles_labels()
                    # n_cols shows err
                    # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6))

        fig.subplots_adjust(hspace=0.35, wspace=0.2)
        plt.show()

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")

[docs]    def visualize_topk_conditional_value_at_risk_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_cvar_standard_ope.png",
    ):
        """Visualize the topk deployment result (CVaR) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        ope_alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_cvar_standard_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_conditional_value_at_risk=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            cvar_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_conditional_value_at_risk",
            true_dict_value_arg="conditional_value_at_risk",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking_by_conditional_value_at_risk"],
                    true_dict["conditional_value_at_risk"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            legend=legend,
            ylabel=f"CVaR ({ope_alpha})",
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_topk_conditional_value_at_risk_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_cvar_cumulative_distribution_ope.png",
    ):
        """Visualize the topk deployment result (CVaR) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        ope_alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_cvar_cumulative_distribution_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_conditional_value_at_risk=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            cvar_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_conditional_value_at_risk(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=ope_alpha,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_conditional_value_at_risk",
            true_dict_value_arg="conditional_value_at_risk",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking_by_conditional_value_at_risk"],
                    true_dict["conditional_value_at_risk"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            legend=legend,
            ylabel=f"CVaR ({ope_alpha})",
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_topk_lower_quartile_selected_by_standard_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_lower_quartile_standard_ope.png",
    ):
        """Visualize the topk deployment result (lower quartile) selected by standard OPE.

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_lower_quartile_standard_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_lower_quartile=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            quartile_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_lower_quartile",
            true_dict_value_arg="lower_quartile",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking_by_lower_quartile"],
                    true_dict["lower_quartile"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            legend=legend,
            ylabel=f"lower quartile ({ope_alpha})",
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_topk_lower_quartile_selected_by_cumulative_distribution_ope(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        ope_alpha: float = 0.05,
        metrics: List[str] = [
            "k-th",
            "best",
            "worst",
            "mean",
            "std",
            "safety_violation_rate",
            "sharpe_ratio",
        ],
        max_topk: Optional[int] = None,
        safety_threshold: Optional[float] = None,
        clip_sharpe_ratio: bool = False,
        ymax_sharpe_ratio: Optional[float] = None,
        visualize_ci: bool = False,
        plot_ci: str = "bootstrap",
        plot_alpha: float = 0.05,
        plot_n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "topk_lower_quartile_cumulative_distribution_ope.png",
    ):
        """Visualize the topk deployment result (lower quartile) selected by cumulative distribution OPE.

        Parameters
        -------
        input_dict: OPEInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.
            If `None`, the average of the result will be shown.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"]
            Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report.
            For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized.

            We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`.

        max_topk: int, default=None
            Maximum number of policies to be deployed.
            If `None` is given, all the policies will be deployed.

        safety_threshold: float, default=0.0 (>= 0)
            The conditional value at risk required to be considered a safe policy.

        clip_sharpe_ratio: bool, default=False
            Whether to clip a large value of SharpeRatio with 1e2.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        visualize_ci: bool, default=False
            Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.)

        plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap"
            Method to estimate the confidence interval.

        plot_alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        plot_n_bootstrap_samples: int, default=10000 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        ymax_sharp_ratio: float, default=None
            Maximum value in y-axis of the plot of SharpeRatio.

        legend: bool, default=True
            Whether to include a legend in the figure.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="topk_lower_quartile_cumulative_distribution_ope.png"
            Name of the bar figure.

        """
        if isinstance(input_dict, MultipleInputDict):
            input_dict_ = input_dict.get(
                behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0
            )
            gamma = list(input_dict_.values())[0]["gamma"]
        else:
            gamma = list(input_dict.values())[0]["gamma"]

        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        max_topk, safety_threshold = self._check_topk_inputs(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            metrics=metrics,
            safety_threshold=safety_threshold,
            gamma=gamma,
        )
        self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name)

        true_dict = self.obtain_true_selection_result(
            input_dict,
            return_lower_quartile=True,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            quartile_alpha=ope_alpha,
        )
        estimation_dict = self.select_by_lower_quartile(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=ope_alpha,
        )
        # note: true_dict is transformed in this function, as it is passed by reference
        metric_dict = self._obtain_topk_policy_performance(
            true_dict=true_dict,
            estimation_dict=estimation_dict,
            input_dict=input_dict,
            true_dict_ranking_arg="ranking_by_lower_quartile",
            true_dict_value_arg="lower_quartile",
            estimation_dict_ranking_arg="estimated_ranking",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            max_topk=max_topk,
            ope_alpha=ope_alpha,
            return_safety_violation_rate=("safety_violation_rate" in metrics),
            safety_threshold=safety_threshold,
            clip_sharpe_ratio=clip_sharpe_ratio,
        )
        # in the case with single input_dict, true_dict has not been transformed
        if not isinstance(input_dict, MultipleInputDict) or (
            behavior_policy_name is not None and dataset_id is not None
        ):
            true_dict = dict(
                zip(
                    true_dict["ranking_by_lower_quartile"],
                    true_dict["lower_quartile"],
                )
            )

        min_val, max_val = self._obtain_min_max_val_for_topk_visualization(
            true_dict=true_dict,
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        self._visualize_topk_policy_performance(
            metric_dict=metric_dict,
            min_val=min_val,
            max_val=max_val,
            compared_estimators=compared_estimators,
            metrics=metrics,
            max_topk=max_topk,
            safety_threshold=safety_threshold,
            visualize_ci=visualize_ci,
            ci=plot_ci,
            alpha=plot_alpha,
            n_bootstrap_samples=plot_n_bootstrap_samples,
            random_state=random_state,
            legend=legend,
            ylabel=f"lower quartile ({ope_alpha})",
            ymax_sharpe_ratio=ymax_sharpe_ratio,
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

    def _visualize_policy_performance_for_validation(
        self,
        estimation_dict: Dict,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        true_value_arg: str,
        estimated_value_arg: str,
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        ylabel: str = "policy performance",
        fig_dir: Optional[Path] = None,
        fig_name: Optional[str] = None,
    ):
        """Visualize the correlation between the true and estimated policy performance.

        One scatter panel (true value on the x-axis, estimate on the y-axis) is
        drawn per estimator, together with a black ``y = x`` guide line. The figure
        is shown and, when ``fig_dir`` is given, saved to ``fig_dir / fig_name``.

        Parameters
        ----------
        estimation_dict: dict
            Dictionary containing the true and estimated policy performance.
            The nesting that is indexed depends on the other arguments:

            .. code-block:: python

                # MultipleInputDict, behavior_policy_name=None, dataset_id=None
                estimation_dict[behavior_policy][dataset_id][estimator][value_arg]

                # MultipleInputDict, behavior_policy_name=None, dataset_id given
                estimation_dict[behavior_policy][estimator][value_arg]

                # MultipleInputDict, behavior_policy_name given, dataset_id=None
                estimation_dict[dataset_id][estimator][value_arg]

                # otherwise (single input_dict, or both names given)
                estimation_dict[estimator][value_arg]

        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.
            Only its type and, for :class:`MultipleInputDict`, its
            ``behavior_policy_names`` and ``n_datasets`` attributes are read here.

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        true_value_arg: str
            Name of the key indicating the true policy performance in estimation_dict.

        estimated_value_arg: str
            Name of the key indicating the estimated policy performance in estimation_dict.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            This helper takes the length of the list and iterates over it,
            so callers must pass an actual list rather than `None`.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.
            When `None` is given, `min(5, number of estimators)` is used.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.
            When shared, every panel gets the same guide line spanning the
            value range observed over all estimators.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.
            The legend is only drawn when points are colored per behavior policy,
            i.e., for a :class:`MultipleInputDict` with `behavior_policy_name=None`.

        ylabel: str, default="policy performance"
            Name of the plotted quantity, used in both axis labels
            (`"true {ylabel}"` and `"estimated {ylabel}"`).

        fig_dir: Path, default=None
            Path to store the figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default=None
            Name of the figure.

        """
        plt.style.use("ggplot")
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        n_colors = len(color)

        n_figs = len(compared_estimators)
        n_cols = min(5, n_figs) if n_cols is None else n_cols
        n_rows = (n_figs - 1) // n_cols + 1

        # squeeze=False makes `axes` a 2-D array for every grid shape, so a single
        # estimator (1x1 grid) or a single column can be indexed like any other.
        fig, axes = plt.subplots(
            nrows=n_rows,
            ncols=n_cols,
            figsize=(4 * n_cols, 3 * n_rows),
            sharex=share_axes,
            sharey=share_axes,
            squeeze=False,
        )

        is_multiple = isinstance(input_dict, MultipleInputDict)
        show_legend = legend and is_multiple and behavior_policy_name is None

        def iter_series(estimator):
            """Yield (true, estimated, color, label) for each scatter series of a panel."""
            if is_multiple and behavior_policy_name is None and dataset_id is None:
                for policy_idx, behavior_policy in enumerate(
                    input_dict.behavior_policy_names
                ):
                    for dataset_id_ in range(input_dict.n_datasets[behavior_policy]):
                        values = estimation_dict[behavior_policy][dataset_id_][
                            estimator
                        ]
                        yield (
                            values[true_value_arg],
                            values[estimated_value_arg],
                            color[policy_idx % n_colors],
                            # label only the first dataset to avoid duplicated labels
                            behavior_policy if dataset_id_ == 0 else None,
                        )

            elif is_multiple and behavior_policy_name is None:
                for policy_idx, behavior_policy in enumerate(
                    input_dict.behavior_policy_names
                ):
                    values = estimation_dict[behavior_policy][estimator]
                    yield (
                        values[true_value_arg],
                        values[estimated_value_arg],
                        color[policy_idx % n_colors],
                        behavior_policy,
                    )

            elif is_multiple and dataset_id is None:
                for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]):
                    values = estimation_dict[dataset_id_][estimator]
                    yield (
                        values[true_value_arg],
                        values[estimated_value_arg],
                        color[0],
                        None,
                    )

            else:
                values = estimation_dict[estimator]
                yield (
                    values[true_value_arg],
                    values[estimated_value_arg],
                    color[0],
                    None,
                )

        def draw_guide(ax, lower, upper):
            """Draw the y = x guide line over [lower, upper] with a 5% margin."""
            if not (np.isfinite(lower) and np.isfinite(upper)):
                return  # nothing finite was plotted, so there is no range to span
            margin = (upper - lower) * 0.05
            guide = np.linspace(lower - margin, upper + margin)
            ax.plot(
                guide,
                guide,
                color="black",
                linewidth=1.0,
            )

        panels = [axes[i // n_cols, i % n_cols] for i in range(n_figs)]

        guide_min, guide_max = np.inf, -np.inf
        for ax, estimator in zip(panels, compared_estimators):
            # value range of this panel; fmin/fmax ignore NaN so that a single
            # NaN-only series does not wipe out the range of the other series
            min_val, max_val = np.inf, -np.inf

            for true_value, estimated_value, series_color, label in iter_series(
                estimator
            ):
                ax.scatter(
                    true_value,
                    estimated_value,
                    color=series_color,
                    label=label,
                )
                series_min = np.fmin(
                    np.nanmin(true_value), np.nanmin(estimated_value)
                )
                series_max = np.fmax(
                    np.nanmax(true_value), np.nanmax(estimated_value)
                )
                min_val = np.fmin(min_val, series_min)
                max_val = np.fmax(max_val, series_max)

            ax.set_title(estimator)
            ax.set_xlabel(f"true {ylabel}")
            ax.set_ylabel(f"estimated {ylabel}")

            if show_legend:
                ax.legend(title="behavior_policy", loc="lower right")

            if not share_axes:
                draw_guide(ax, min_val, max_val)

            guide_min = np.fmin(guide_min, min_val)
            guide_max = np.fmax(guide_max, max_val)

        if share_axes:
            # shared axes: use one common guide line covering all panels
            for ax in panels:
                draw_guide(ax, guide_min, guide_max)

        fig.tight_layout()
        plt.show()

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")

[docs]    def visualize_policy_value_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_policy_value_standard_ope.png",
    ):
        """Visualize the true policy value and its estimate (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_policy_value_standard_ope.png"
            Name of the bar figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        policy_value_dict = self.select_by_policy_value(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            return_true_values=True,
        )

        self._visualize_policy_performance_for_validation(
            estimation_dict=policy_value_dict,
            input_dict=input_dict,
            true_value_arg="true_policy_value",
            estimated_value_arg="estimated_policy_value",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            n_cols=n_cols,
            share_axes=share_axes,
            legend=legend,
            ylabel="policy value",
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_policy_value_of_cumulative_distribution_ope_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_policy_value_cumulative_distribution_ope.png",
    ):
        """Visualize the true policy value and its estimate obtained by cumulative distribution OPE (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_policy_value_cumulative_distribution_ope.png"
            Name of the bar figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        policy_value_dict = self.select_by_policy_value_via_cumulative_distribution_ope(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            return_true_values=True,
        )

        self._visualize_policy_performance_for_validation(
            estimation_dict=policy_value_dict,
            input_dict=input_dict,
            true_value_arg="true_policy_value",
            estimated_value_arg="estimated_policy_value",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            n_cols=n_cols,
            share_axes=share_axes,
            legend=legend,
            ylabel="policy value",
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_policy_value_lower_bound_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        cis: List[str] = ["bootstrap"],
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_policy_value_lower_bound.png",
    ):
        """Visualize the true policy value and its estimate lower bound (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"]
            Estimation methods for confidence intervals.

        alpha: float, default=0.05
            Significance level. The value should be within `[0, 1)`.

        n_bootstrap_samples: int, default=100 (> 0)
            Number of resampling performed in the bootstrap procedure.

        random_state: int, default=None (>= 0)
            Random state.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_policy_value_lower_bound.png"
            Name of the bar figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="standard_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        policy_value_dict = self.select_by_policy_value_lower_bound(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            cis=cis,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
            return_true_values=True,
        )

        plt.style.use("ggplot")
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        n_colors = len(color)

        n_figs = len(compared_estimators) * len(cis)
        if len(cis) == 1:
            n_cols = min(5, n_figs) if n_cols is None else n_cols
        else:
            n_cols = len(cis)
        n_rows = (n_figs - 1) // n_cols + 1

        fig, axes = plt.subplots(
            nrows=n_rows,
            ncols=n_cols,
            figsize=(4 * n_cols, 3 * n_rows),
            sharex=share_axes,
            sharey=share_axes,
        )

        guide_min, guide_max = 1e5, -1e5
        if len(cis) == 1:
            if n_rows == 1:
                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        if isinstance(input_dict, MultipleInputDict):
                            if behavior_policy_name is None and dataset_id is None:
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    n_datasets = input_dict.n_datasets[behavior_policy]
                                    min_vals = np.zeros(n_datasets)
                                    max_vals = np.zeros(n_datasets)

                                    for dataset_id_ in range(n_datasets):
                                        true_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "true_policy_value"
                                        ]
                                        estimated_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "estimated_policy_value_lower_bound"
                                        ]

                                        if dataset_id_ == 0:
                                            axes[i].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                                label=behavior_policy,
                                            )
                                        else:  # to remove duplicated labels
                                            axes[i].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                            )

                                        min_vals[dataset_id_] = np.minimum(
                                            np.nanmin(true_policy_value),
                                            np.nanmin(estimated_policy_value),
                                        )
                                        max_vals[dataset_id_] = np.maximum(
                                            np.nanmax(true_policy_value),
                                            np.nanmax(estimated_policy_value),
                                        )

                                    min_val = min(min_val, min_vals.min())
                                    max_val = max(max_val, max_vals.max())

                            elif (
                                behavior_policy_name is None and dataset_id is not None
                            ):
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    true_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )

                                    min_val_ = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_val_ = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min(min_val, min_val_)
                                max_val = max(max_val, max_val_)

                            elif (
                                behavior_policy_name is not None and dataset_id is None
                            ):
                                n_datasets = input_dict.n_datasets[behavior_policy_name]
                                min_vals = np.zeros(n_datasets)
                                max_vals = np.zeros(n_datasets)

                                for dataset_id_ in range(n_datasets):
                                    true_policy_value = policy_value_dict[dataset_id_][
                                        ci
                                    ][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        dataset_id_
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[0],
                                    )

                                    min_vals[dataset_id_] = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_vals[dataset_id_] = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min_vals.min()
                                max_val = max_vals.max()

                            else:
                                true_policy_value = policy_value_dict[ci][estimator][
                                    "true_policy_value"
                                ]
                                estimated_policy_value = policy_value_dict[ci][
                                    estimator
                                ]["estimated_policy_value_lower_bound"]

                                axes[i].scatter(
                                    true_policy_value,
                                    estimated_policy_value,
                                    color=color[0],
                                )

                                min_val = np.minimum(
                                    np.nanmin(true_policy_value),
                                    np.nanmin(estimated_policy_value),
                                )
                                max_val = np.maximum(
                                    np.nanmax(true_policy_value),
                                    np.nanmax(estimated_policy_value),
                                )

                        else:
                            true_policy_value = policy_value_dict[ci][estimator][
                                "true_policy_value"
                            ]
                            estimated_policy_value = policy_value_dict[ci][estimator][
                                "estimated_policy_value_lower_bound"
                            ]

                            axes[i].scatter(
                                true_policy_value,
                                estimated_policy_value,
                                color=color[0],
                            )

                            min_val = np.minimum(
                                np.nanmin(true_policy_value),
                                np.nanmin(estimated_policy_value),
                            )
                            max_val = np.maximum(
                                np.nanmax(true_policy_value),
                                np.nanmax(estimated_policy_value),
                            )

                        axes[i].set_title(f"{ci}, {estimator}")
                        axes[i].set_xlabel("true policy value")
                        axes[i].set_ylabel("estimated policy value lower bound")

                        if (
                            legend
                            and behavior_policy_name is None
                            and isinstance(input_dict, MultipleInputDict)
                        ):
                            axes[i].legend(title="behavior_policy", loc="lower right")

                        if not share_axes:
                            margin = (max_val - min_val) * 0.05
                            guide = np.linspace(min_val - margin, max_val + margin)
                            axes[i].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

                        guide_min = min_val if guide_min > min_val else guide_min
                        guide_max = max_val if guide_max < max_val else guide_max

                    if share_axes:
                        margin = (guide_max - guide_min) * 0.05
                        guide = np.linspace(guide_min - margin, guide_max + margin)
                        for i, estimator in enumerate(compared_estimators):
                            axes[i].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

            else:
                for ci in cis:
                    for i, estimator in enumerate(compared_estimators):
                        if isinstance(input_dict, MultipleInputDict):
                            if behavior_policy_name is None and dataset_id is None:
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    n_datasets = input_dict.n_datasets[behavior_policy]
                                    min_vals = np.zeros(n_datasets)
                                    max_vals = np.zeros(n_datasets)

                                    for dataset_id_ in range(n_datasets):
                                        true_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "true_policy_value"
                                        ]
                                        estimated_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "estimated_policy_value_lower_bound"
                                        ]

                                        if dataset_id_ == 0:
                                            axes[i // n_cols, i % n_cols].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                                label=behavior_policy,
                                            )
                                        else:  # to remove duplicated labels
                                            axes[i // n_cols, i % n_cols].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                            )

                                        min_vals[dataset_id_] = np.minimum(
                                            np.nanmin(true_policy_value),
                                            np.nanmin(estimated_policy_value),
                                        )
                                        max_vals[dataset_id_] = np.maximum(
                                            np.nanmax(true_policy_value),
                                            np.nanmax(estimated_policy_value),
                                        )

                                    min_val = min(min_val, min_vals.min())
                                    max_val = max(max_val, max_vals.max())

                            elif (
                                behavior_policy_name is None and dataset_id is not None
                            ):
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    true_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i // n_cols, i % n_cols].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )

                                    min_val_ = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_val_ = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min(min_val, min_val_)
                                max_val = max(max_val, max_val_)

                            elif (
                                behavior_policy_name is not None and dataset_id is None
                            ):
                                n_datasets = input_dict.n_datasets[behavior_policy_name]
                                min_vals = np.zeros(n_datasets)
                                max_vals = np.zeros(n_datasets)

                                for dataset_id_ in range(n_datasets):
                                    true_policy_value = policy_value_dict[dataset_id_][
                                        ci
                                    ][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        dataset_id_
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i // n_cols, i % n_cols].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[0],
                                    )

                                    min_vals[dataset_id_] = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_vals[dataset_id_] = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min_vals.min()
                                max_val = max_vals.max()

                            else:
                                true_policy_value = policy_value_dict[ci][estimator][
                                    "true_policy_value"
                                ]
                                estimated_policy_value = policy_value_dict[ci][
                                    estimator
                                ]["estimated_policy_value_lower_bound"]

                                axes[i // n_cols, i % n_cols].scatter(
                                    true_policy_value,
                                    estimated_policy_value,
                                    color=color[0],
                                )

                                min_val = np.minimum(
                                    np.nanmin(true_policy_value),
                                    np.nanmin(estimated_policy_value),
                                )
                                max_val = np.maximum(
                                    np.nanmax(true_policy_value),
                                    np.nanmax(estimated_policy_value),
                                )

                        else:
                            true_policy_value = policy_value_dict[ci][estimator][
                                "true_policy_value"
                            ]
                            estimated_policy_value = policy_value_dict[ci][estimator][
                                "estimated_policy_value_lower_bound"
                            ]

                            axes[i // n_cols, i % n_cols].scatter(
                                true_policy_value,
                                estimated_policy_value,
                                color=color[0],
                            )

                            min_val = np.minimum(
                                np.nanmin(true_policy_value),
                                np.nanmin(estimated_policy_value),
                            )
                            max_val = np.maximum(
                                np.nanmax(true_policy_value),
                                np.nanmax(estimated_policy_value),
                            )

                        axes[i // n_cols, i % n_cols].set_title(f"{ci}, {estimator}")
                        axes[i // n_cols, i % n_cols].set_xlabel("true policy value")
                        axes[i // n_cols, i % n_cols].set_ylabel(
                            "estimated policy value lower bound"
                        )

                        if (
                            legend
                            and behavior_policy_name is None
                            and isinstance(input_dict, MultipleInputDict)
                        ):
                            axes[i // n_cols, i % n_cols].legend(
                                title="behavior_policy",
                                loc="lower right",
                            )

                        if not share_axes:
                            margin = (max_val - min_val) * 0.05
                            guide = np.linspace(min_val - margin, max_val + margin)
                            axes[i // n_cols, i % n_cols].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

                        guide_min = min_val if guide_min > min_val else guide_min
                        guide_max = max_val if guide_max < max_val else guide_max

                    if share_axes:
                        margin = (guide_max - guide_min) * 0.05
                        guide = np.linspace(guide_min - margin, guide_max + margin)
                        for i, estimator in enumerate(compared_estimators):
                            axes[i // n_cols, i % n_cols].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

        else:
            if n_cols == 1:
                for j, ci in enumerate(cis):
                    for i, estimator in enumerate(compared_estimators):
                        if isinstance(input_dict, MultipleInputDict):
                            if behavior_policy_name is None and dataset_id is None:
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    n_datasets = input_dict.n_datasets[behavior_policy]
                                    min_vals = np.zeros(n_datasets)
                                    max_vals = np.zeros(n_datasets)

                                    for dataset_id_ in range(n_datasets):
                                        true_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "true_policy_value"
                                        ]
                                        estimated_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "estimated_policy_value_lower_bound"
                                        ]

                                        if dataset_id_ == 0:
                                            axes[j].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                                label=behavior_policy,
                                            )
                                        else:  # to remove duplicated labels
                                            axes[j].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                            )

                                        min_vals[dataset_id_] = np.minimum(
                                            np.nanmin(true_policy_value),
                                            np.nanmin(estimated_policy_value),
                                        )
                                        max_vals[dataset_id_] = np.maximum(
                                            np.nanmax(true_policy_value),
                                            np.nanmax(estimated_policy_value),
                                        )

                                    min_val = min(min_val, min_vals.min())
                                    max_val = max(max_val, max_vals.max())

                            elif (
                                behavior_policy_name is None and dataset_id is not None
                            ):
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    true_policy_value = policy_value_dict[ci][
                                        behavior_policy
                                    ][ci][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[j].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )

                                    min_val_ = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_val_ = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min(min_val, min_val_)
                                max_val = max(max_val, max_val_)

                            elif (
                                behavior_policy_name is not None and dataset_id is None
                            ):
                                n_datasets = input_dict.n_datasets[behavior_policy_name]
                                min_vals = np.zeros(n_datasets)
                                max_vals = np.zeros(n_datasets)

                                for dataset_id_ in range(n_datasets):
                                    true_policy_value = policy_value_dict[dataset_id_][
                                        ci
                                    ][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        dataset_id_
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[j].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[0],
                                    )

                                    min_vals[dataset_id_] = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_vals[dataset_id_] = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min_vals.min()
                                max_val = max_vals.max()

                            else:
                                true_policy_value = policy_value_dict[ci][estimator][
                                    "true_policy_value"
                                ]
                                estimated_policy_value = policy_value_dict[ci][
                                    estimator
                                ]["estimated_policy_value_lower_bound"]

                                axes[j].scatter(
                                    true_policy_value,
                                    estimated_policy_value,
                                    color=color[0],
                                )

                                min_val = np.minimum(
                                    np.nanmin(true_policy_value),
                                    np.nanmin(estimated_policy_value),
                                )
                                max_val = np.maximum(
                                    np.nanmax(true_policy_value),
                                    np.nanmax(estimated_policy_value),
                                )

                        else:
                            true_policy_value = policy_value_dict[ci][estimator][
                                "true_policy_value"
                            ]
                            estimated_policy_value = policy_value_dict[ci][estimator][
                                "estimated_policy_value_lower_bound"
                            ]

                            axes[j].scatter(
                                true_policy_value,
                                estimated_policy_value,
                                color=color[0],
                            )

                            min_val = np.minimum(
                                np.nanmin(true_policy_value),
                                np.nanmin(estimated_policy_value),
                            )
                            max_val = np.maximum(
                                np.nanmax(true_policy_value),
                                np.nanmax(estimated_policy_value),
                            )

                        axes[j].set_title(f"{ci}, {estimator}")
                        axes[j].set_xlabel("true policy value")
                        axes[j].set_ylabel("estimated policy value lower bound")

                        if (
                            legend
                            and behavior_policy_name is None
                            and isinstance(input_dict, MultipleInputDict)
                        ):
                            axes[j].legend(title="behavior_policy", loc="lower right")

                        if not share_axes:
                            margin = (max_val - min_val) * 0.05
                            guide = np.linspace(min_val - margin, max_val + margin)
                            axes[j].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

                        guide_min = min_val if guide_min > min_val else guide_min
                        guide_max = max_val if guide_max < max_val else guide_max

                if share_axes:
                    margin = (guide_max - guide_min) * 0.05
                    guide = np.linspace(guide_min - margin, guide_max + margin)
                    for j, ci in enumerate(cis):
                        axes[j].plot(
                            guide,
                            guide,
                            color="black",
                            linewidth=1.0,
                        )

            else:
                for j, ci in enumerate(cis):
                    for i, estimator in enumerate(compared_estimators):
                        if isinstance(input_dict, MultipleInputDict):
                            if behavior_policy_name is None and dataset_id is None:
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    n_datasets = input_dict.n_datasets[behavior_policy]
                                    min_vals = np.zeros(n_datasets)
                                    max_vals = np.zeros(n_datasets)

                                    for dataset_id_ in range(n_datasets):
                                        true_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "true_policy_value"
                                        ]
                                        estimated_policy_value = policy_value_dict[
                                            behavior_policy
                                        ][dataset_id_][ci][estimator][
                                            "estimated_policy_value_lower_bound"
                                        ]

                                        if dataset_id_ == 0:
                                            axes[i, j].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                                label=behavior_policy,
                                            )
                                        else:  # to remove duplicated labels
                                            axes[i, j].scatter(
                                                true_policy_value,
                                                estimated_policy_value,
                                                color=color[l % n_colors],
                                            )

                                        min_vals[dataset_id_] = np.minimum(
                                            np.nanmin(true_policy_value),
                                            np.nanmin(estimated_policy_value),
                                        )
                                        max_vals[dataset_id_] = np.maximum(
                                            np.nanmax(true_policy_value),
                                            np.nanmax(estimated_policy_value),
                                        )

                                    min_val = min(min_val, min_vals.min())
                                    max_val = max(max_val, max_vals.max())

                            elif (
                                behavior_policy_name is None and dataset_id is not None
                            ):
                                min_val, max_val = np.infty, -np.infty

                                for l, behavior_policy in enumerate(
                                    input_dict.behavior_policy_names
                                ):
                                    true_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        behavior_policy
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i, j].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )

                                    min_val_ = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_val_ = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min(min_val, min_val_)
                                max_val = max(max_val, max_val_)

                            elif (
                                behavior_policy_name is not None and dataset_id is None
                            ):
                                n_datasets = input_dict.n_datasets[behavior_policy_name]
                                min_vals = np.zeros(n_datasets)
                                max_vals = np.zeros(n_datasets)

                                for dataset_id_ in range(n_datasets):
                                    true_policy_value = policy_value_dict[dataset_id_][
                                        ci
                                    ][estimator]["true_policy_value"]
                                    estimated_policy_value = policy_value_dict[
                                        dataset_id_
                                    ][ci][estimator][
                                        "estimated_policy_value_lower_bound"
                                    ]

                                    axes[i, j].scatter(
                                        true_policy_value,
                                        estimated_policy_value,
                                        color=color[0],
                                    )

                                    min_vals[dataset_id_] = np.minimum(
                                        np.nanmin(true_policy_value),
                                        np.nanmin(estimated_policy_value),
                                    )
                                    max_vals[dataset_id_] = np.maximum(
                                        np.nanmax(true_policy_value),
                                        np.nanmax(estimated_policy_value),
                                    )

                                min_val = min_vals.min()
                                max_val = max_vals.max()

                            else:
                                true_policy_value = policy_value_dict[ci][estimator][
                                    "true_policy_value"
                                ]
                                estimated_policy_value = policy_value_dict[ci][
                                    estimator
                                ]["estimated_policy_value_lower_bound"]

                                axes[i, j].scatter(
                                    true_policy_value,
                                    estimated_policy_value,
                                    color=color[0],
                                )

                                min_val = np.minimum(
                                    np.nanmin(true_policy_value),
                                    np.nanmin(estimated_policy_value),
                                )
                                max_val = np.maximum(
                                    np.nanmax(true_policy_value),
                                    np.nanmax(estimated_policy_value),
                                )

                        else:
                            true_policy_value = policy_value_dict[ci][estimator][
                                "true_policy_value"
                            ]
                            estimated_policy_value = policy_value_dict[ci][estimator][
                                "estimated_policy_value_lower_bound"
                            ]

                            axes[i, j].scatter(
                                true_policy_value,
                                estimated_policy_value,
                                color=color[0],
                            )

                            min_val = np.minimum(
                                np.nanmin(true_policy_value),
                                np.nanmin(estimated_policy_value),
                            )
                            max_val = np.maximum(
                                np.nanmax(true_policy_value),
                                np.nanmax(estimated_policy_value),
                            )

                        axes[i, j].set_title(f"{ci}, {estimator}")
                        axes[i, j].set_xlabel("true policy value")
                        axes[i, j].set_ylabel("estimated policy value lower bound")

                        if (
                            legend
                            and behavior_policy_name is None
                            and isinstance(input_dict, MultipleInputDict)
                        ):
                            axes[i, j].legend(
                                title="behavior_policy", loc="lower right"
                            )

                        if not share_axes:
                            margin = (max_val - min_val) * 0.05
                            guide = np.linspace(min_val - margin, max_val + margin)
                            axes[i, j].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

                        guide_min = min_val if guide_min > min_val else guide_min
                        guide_max = max_val if guide_max < max_val else guide_max

                if share_axes:
                    margin = (guide_max - guide_min) * 0.05
                    guide = np.linspace(guide_min - margin, guide_max + margin)
                    for j, ci in enumerate(cis):
                        for i, estimator in enumerate(compared_estimators):
                            axes[i, j].plot(
                                guide,
                                guide,
                                color="black",
                                linewidth=1.0,
                            )

        fig.tight_layout()
        plt.show()

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")

[docs]    def visualize_variance_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_variance.png",
    ):
        """Visualize the true variance and its estimate (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the scatter plot figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_variance.png"
            Name of the scatter plot figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        ground_truth_policy_value_dict = self.obtain_true_selection_result(
            input_dict=input_dict,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            return_variance=True,
        )
        estimated_variance_dict = self.cumulative_distribution_ope.estimate_variance(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
        )

        if isinstance(input_dict, MultipleInputDict):
            if behavior_policy_name is None and dataset_id is None:
                candidate_policy_names = defaultdict(list)
                true_variance = defaultdict(list)

                for (
                    behavior_policy,
                    n_datasets,
                ) in input_dict.n_datasets.items():
                    for dataset_id_ in range(n_datasets):
                        candidate_policy_names[behavior_policy].append(
                            ground_truth_policy_value_dict[behavior_policy][
                                dataset_id_
                            ]["ranking"]
                        )
                        true_variance[behavior_policy].append(
                            ground_truth_policy_value_dict[behavior_policy][
                                dataset_id_
                            ]["variance"]
                        )

                candidate_policy_names = defaultdict_to_dict(candidate_policy_names)
                true_variance = defaultdict_to_dict(true_variance)

            elif behavior_policy_name is None and dataset_id is not None:
                candidate_policy_names = {}
                true_variance = {}

                for behavior_policy in input_dict.behavior_policy_names:
                    candidate_policy_names[
                        behavior_policy
                    ] = ground_truth_policy_value_dict[behavior_policy]["ranking"]
                    true_variance[behavior_policy] = ground_truth_policy_value_dict[
                        behavior_policy
                    ]["variance"]

            elif behavior_policy_name is not None and dataset_id is None:
                candidate_policy_names = []
                true_variance = []

                for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]):
                    candidate_policy_names.append(
                        ground_truth_policy_value_dict[dataset_id_]["ranking"]
                    )
                    true_variance.append(
                        ground_truth_policy_value_dict[dataset_id_]["variance"]
                    )

            else:
                candidate_policy_names = ground_truth_policy_value_dict["ranking"]
                true_variance = ground_truth_policy_value_dict["variance"]

        else:
            candidate_policy_names = ground_truth_policy_value_dict["ranking"]
            true_variance = ground_truth_policy_value_dict["variance"]

        plt.style.use("ggplot")
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        n_colors = len(color)

        n_figs = len(compared_estimators)
        n_cols = min(5, n_figs) if n_cols is None else n_cols
        n_rows = (n_figs - 1) // n_cols + 1

        fig, axes = plt.subplots(
            nrows=n_rows,
            ncols=n_cols,
            figsize=(4 * n_cols, 3 * n_rows),
            sharex=share_axes,
            sharey=share_axes,
        )

        guide_min, guide_max = 1e5, -1e5
        if n_rows == 1:
            for i, estimator in enumerate(compared_estimators):
                if isinstance(input_dict, MultipleInputDict):
                    if behavior_policy_name is None and dataset_id is None:
                        min_val, max_val = np.infty, -np.infty

                        for l, behavior_policy in enumerate(
                            input_dict.behavior_policy_names
                        ):
                            n_datasets = input_dict.n_datasets[behavior_policy]
                            min_vals = np.zeros(n_datasets)
                            max_vals = np.zeros(n_datasets)

                            for dataset_id_ in range(n_datasets):
                                estimated_variance = np.zeros(
                                    len(
                                        candidate_policy_names[behavior_policy][
                                            dataset_id_
                                        ]
                                    )
                                )
                                for j, eval_policy in enumerate(
                                    candidate_policy_names[behavior_policy][dataset_id_]
                                ):
                                    estimated_variance[j] = estimated_variance_dict[
                                        behavior_policy
                                    ][dataset_id_][eval_policy][estimator]

                                if dataset_id_ == 0:
                                    axes[i].scatter(
                                        true_variance[behavior_policy][dataset_id_],
                                        estimated_variance,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )
                                else:
                                    axes[i].scatter(
                                        true_variance[behavior_policy][dataset_id_],
                                        estimated_variance,
                                        color=color[l % n_colors],
                                    )

                                min_vals[dataset_id_] = np.minimum(
                                    np.nanmin(
                                        true_variance[behavior_policy][dataset_id_]
                                    ),
                                    np.nanmin(estimated_variance),
                                )
                                max_vals[dataset_id_] = np.maximum(
                                    np.nanmax(
                                        true_variance[behavior_policy][dataset_id_]
                                    ),
                                    np.nanmax(estimated_variance),
                                )

                            min_val = min(min_val, min_vals.min())
                            max_val = max(max_val, max_vals.max())

                    elif behavior_policy_name is None and dataset_id is not None:
                        min_val, max_val = np.infty, -np.infty

                        for l, behavior_policy in enumerate(
                            input_dict.behavior_policy_names
                        ):
                            estimated_variance = np.zeros(
                                len(candidate_policy_names[behavior_policy])
                            )
                            for j, eval_policy in enumerate(
                                candidate_policy_names[behavior_policy]
                            ):
                                estimated_variance[j] = estimated_variance_dict[
                                    behavior_policy
                                ][eval_policy][estimator]

                            axes[i].scatter(
                                true_variance[behavior_policy],
                                estimated_variance,
                                color=color[l % n_colors],
                                label=behavior_policy,
                            )

                            min_val_ = np.minimum(
                                np.nanmin(true_variance[behavior_policy]),
                                np.nanmin(estimated_variance),
                            )
                            max_val_ = np.maximum(
                                np.nanmax(true_variance[behavior_policy]),
                                np.nanmax(estimated_variance),
                            )

                        min_val = min(min_val, min_val_)
                        max_val = max(max_val, max_val_)

                    elif behavior_policy_name is not None and dataset_id is None:
                        n_datasets = input_dict.n_datasets[behavior_policy_name]
                        min_vals = np.zeros(n_datasets)
                        max_vals = np.zeros(n_datasets)

                        for dataset_id_ in range(n_datasets):
                            estimated_variance = np.zeros(
                                len(candidate_policy_names[dataset_id_])
                            )
                            for j, eval_policy in enumerate(
                                candidate_policy_names[dataset_id_]
                            ):
                                estimated_variance[j] = estimated_variance_dict[
                                    dataset_id_
                                ][eval_policy][estimator]

                            axes[i].scatter(
                                true_variance[dataset_id_],
                                estimated_variance,
                                color=color[0],
                            )

                            min_vals[dataset_id_] = np.minimum(
                                np.nanmin(true_variance[dataset_id_]),
                                np.nanmin(estimated_variance[dataset_id_]),
                            )
                            max_vals[dataset_id_] = np.maximum(
                                np.nanmax(true_variance[dataset_id_]),
                                np.nanmax(estimated_variance[dataset_id_]),
                            )

                        min_val = min_vals.min()
                        max_val = max_vals.max()

                    else:
                        estimated_variance = np.zeros(len(candidate_policy_names))
                        for j, eval_policy in enumerate(candidate_policy_names):
                            estimated_variance[j] = estimated_variance_dict[
                                eval_policy
                            ][estimator]

                        axes[i].scatter(
                            true_variance,
                            estimated_variance,
                            color=color[0],
                        )

                        min_val = np.minimum(
                            np.nanmin(true_variance),
                            np.nanmin(estimated_variance),
                        )
                        max_val = np.maximum(
                            np.nanmax(true_variance),
                            np.nanmax(estimated_variance),
                        )

                else:
                    estimated_variance = np.zeros(len(candidate_policy_names))
                    for j, eval_policy in enumerate(candidate_policy_names):
                        estimated_variance[j] = estimated_variance_dict[eval_policy][
                            estimator
                        ]

                    axes[i].scatter(
                        true_variance,
                        estimated_variance,
                        color=color[0],
                    )

                    min_val = np.minimum(
                        np.nanmin(true_variance),
                        np.nanmin(estimated_variance),
                    )
                    max_val = np.maximum(
                        np.nanmax(true_variance),
                        np.nanmax(estimated_variance),
                    )

                axes[i].set_title(estimator)
                axes[i].set_xlabel("true variance")
                axes[i].set_ylabel("estimated variance")

                if (
                    legend
                    and behavior_policy_name is None
                    and isinstance(input_dict, MultipleInputDict)
                ):
                    axes[i].legend(title="behavior_policy", loc="lower right")

                if not share_axes:
                    margin = (max_val - min_val) * 0.05
                    guide = np.linspace(min_val - margin, max_val + margin)
                    axes[i].plot(
                        guide,
                        guide,
                        color="black",
                        linewidth=1.0,
                    )

                guide_min = min_val if guide_min > min_val else guide_min
                guide_max = max_val if guide_max < max_val else guide_max

            if share_axes:
                margin = (guide_max - guide_min) * 0.05
                guide = np.linspace(guide_min - margin, guide_max + margin)
                for i, estimator in enumerate(compared_estimators):
                    axes[i].plot(
                        guide,
                        guide,
                        color="black",
                        linewidth=1.0,
                    )

        else:
            for i, estimator in enumerate(compared_estimators):
                if isinstance(input_dict, MultipleInputDict):
                    if behavior_policy_name is None and dataset_id is None:
                        min_val, max_val = np.infty, -np.infty

                        for l, behavior_policy in enumerate(
                            input_dict.behavior_policy_names
                        ):
                            n_datasets = input_dict.n_datasets[behavior_policy]
                            min_vals = np.zeros(n_datasets)
                            max_vals = np.zeros(n_datasets)

                            for dataset_id_ in range(n_datasets):
                                estimated_variance = np.zeros(
                                    len(
                                        candidate_policy_names[behavior_policy][
                                            dataset_id_
                                        ]
                                    )
                                )
                                for j, eval_policy in enumerate(
                                    candidate_policy_names[behavior_policy][dataset_id_]
                                ):
                                    estimated_variance[j] = estimated_variance_dict[
                                        behavior_policy
                                    ][dataset_id_][eval_policy][estimator]

                                if dataset_id_ == 0:
                                    axes[i // n_cols, i % n_cols].scatter(
                                        true_variance[behavior_policy][dataset_id_],
                                        estimated_variance,
                                        color=color[l % n_colors],
                                        label=behavior_policy,
                                    )
                                else:
                                    axes[i // n_cols, i % n_cols].scatter(
                                        true_variance[behavior_policy][dataset_id_],
                                        estimated_variance,
                                        color=color[l % n_colors],
                                    )

                                min_vals[dataset_id_] = np.minimum(
                                    np.nanmin(
                                        true_variance[behavior_policy][dataset_id_]
                                    ),
                                    np.nanmin(estimated_variance),
                                )
                                max_vals[dataset_id_] = np.maximum(
                                    np.nanmax(
                                        true_variance[behavior_policy][dataset_id_]
                                    ),
                                    np.nanmax(estimated_variance),
                                )

                            min_val = min(min_val, min_vals.min())
                            max_val = max(max_val, max_vals.max())

                    elif behavior_policy_name is None and dataset_id is not None:
                        min_val, max_val = np.infty, -np.infty

                        for l, behavior_policy in enumerate(
                            input_dict.behavior_policy_names
                        ):
                            estimated_variance = np.zeros(
                                len(candidate_policy_names[behavior_policy])
                            )
                            for j, eval_policy in enumerate(
                                candidate_policy_names[behavior_policy]
                            ):
                                estimated_variance[j] = estimated_variance_dict[
                                    behavior_policy
                                ][eval_policy][estimator]

                            axes[i // n_cols, i % n_cols].scatter(
                                true_variance[behavior_policy],
                                estimated_variance,
                                color=color[l % n_colors],
                                label=behavior_policy,
                            )

                            min_val_ = np.minimum(
                                np.nanmin(true_variance[behavior_policy]),
                                np.nanmin(estimated_variance),
                            )
                            max_val_ = np.maximum(
                                np.nanmax(true_variance[behavior_policy]),
                                np.nanmax(estimated_variance),
                            )

                        min_val = min(min_val, min_val_)
                        max_val = max(max_val, max_val_)

                    elif behavior_policy_name is not None and dataset_id is None:
                        n_datasets = input_dict.n_datasets[behavior_policy_name]
                        min_vals = np.zeros(n_datasets)
                        max_vals = np.zeros(n_datasets)

                        for dataset_id_ in range(n_datasets):
                            estimated_variance = np.zeros(
                                len(candidate_policy_names[dataset_id_])
                            )
                            for j, eval_policy in enumerate(
                                candidate_policy_names[dataset_id_]
                            ):
                                estimated_variance[j] = estimated_variance_dict[
                                    dataset_id_
                                ][eval_policy][estimator]

                            axes[i // n_cols, i % n_cols].scatter(
                                true_variance[dataset_id_],
                                estimated_variance,
                                color=color[0],
                            )

                            min_vals[dataset_id_] = np.minimum(
                                np.nanmin(true_variance[dataset_id_]),
                                np.nanmin(estimated_variance[dataset_id_]),
                            )
                            max_vals[dataset_id_] = np.maximum(
                                np.nanmax(true_variance[dataset_id_]),
                                np.nanmax(estimated_variance[dataset_id_]),
                            )

                        min_val = min_vals.min()
                        max_val = max_vals.max()

                    else:
                        estimated_variance = np.zeros(len(candidate_policy_names))
                        for j, eval_policy in enumerate(candidate_policy_names):
                            estimated_variance[j] = estimated_variance_dict[
                                eval_policy
                            ][estimator]

                        axes[i // n_cols, i % n_cols].scatter(
                            true_variance,
                            estimated_variance,
                            color=color[0],
                        )

                        min_val = np.minimum(
                            np.nanmin(true_variance),
                            np.nanmin(estimated_variance),
                        )
                        max_val = np.maximum(
                            np.nanmax(true_variance),
                            np.nanmax(estimated_variance),
                        )

                else:
                    estimated_variance = np.zeros(len(candidate_policy_names))
                    for j, eval_policy in enumerate(candidate_policy_names):
                        estimated_variance[j] = estimated_variance_dict[eval_policy][
                            estimator
                        ]

                    axes[i // n_cols, i % n_cols].scatter(
                        true_variance,
                        estimated_variance,
                        color=color[0],
                    )

                    min_val = np.minimum(
                        np.nanmin(true_variance),
                        np.nanmin(estimated_variance),
                    )
                    max_val = np.maximum(
                        np.nanmax(true_variance),
                        np.nanmax(estimated_variance),
                    )

                axes[i // n_cols, i % n_cols].set_title(estimator)
                axes[i // n_cols, i % n_cols].set_xlabel("true variance")
                axes[i // n_cols, i % n_cols].set_ylabel("estimated variance")

                if (
                    legend
                    and behavior_policy_name is None
                    and isinstance(input_dict, MultipleInputDict)
                ):
                    axes[i // n_cols, i % n_cols].legend(
                        title="behavior_policy", loc="lower right"
                    )

                if not share_axes:
                    margin = (max_val - min_val) * 0.05
                    guide = np.linspace(min_val - margin, max_val + margin)
                    axes[i // n_cols, i % n_cols].plot(
                        guide,
                        guide,
                        color="black",
                        linewidth=1.0,
                    )

                guide_min = min_val if guide_min > min_val else guide_min
                guide_max = max_val if guide_max < max_val else guide_max

            if share_axes:
                margin = (guide_max - guide_min) * 0.05
                guide = np.linspace(guide_min - margin, guide_max + margin)
                for i, estimator in enumerate(compared_estimators):
                    axes[i // n_cols, i % n_cols].plot(
                        guide,
                        guide,
                        color="black",
                        linewidth=1.0,
                    )

        fig.tight_layout()
        plt.show()

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")

[docs]    def visualize_lower_quartile_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_lower_quartile.png",
    ):
        """Visualize the true lower quartile and its estimate (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 0.5]`.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_lower_quartile.png"
            Name of the bar figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        lower_quartile_dict = self.select_by_lower_quartile(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=alpha,
            return_true_values=True,
        )

        self._visualize_policy_performance_for_validation(
            estimation_dict=lower_quartile_dict,
            input_dict=input_dict,
            true_value_arg="true_lower_quartile",
            estimated_value_arg="estimated_lower_quartile",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            n_cols=n_cols,
            share_axes=share_axes,
            legend=legend,
            ylabel=f"lower quartile ({alpha})",
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

[docs]    def visualize_conditional_value_at_risk_for_validation(
        self,
        input_dict: Union[OPEInputDict, MultipleInputDict],
        compared_estimators: Optional[List[str]] = None,
        behavior_policy_name: Optional[str] = None,
        dataset_id: Optional[int] = None,
        alpha: float = 0.05,
        n_cols: Optional[int] = None,
        share_axes: bool = False,
        legend: bool = True,
        fig_dir: Optional[Path] = None,
        fig_name: str = "validation_conditional_value_at_risk.png",
    ):
        """Visualize the true conditional value at risk and its estimate (scatter plot).

        Parameters
        -------
        input_dict: OPEInputDict or MultipleInputDict
            Dictionary of the OPE inputs for each evaluation policy.

            .. code-block:: python

                key: [evaluation_policy][
                    evaluation_policy_action,
                    evaluation_policy_action_dist,
                    state_action_value_prediction,
                    initial_state_value_prediction,
                    state_action_marginal_importance_weight,
                    state_marginal_importance_weight,
                    on_policy_policy_value,
                    gamma,
                    behavior_policy,
                    evaluation_policy,
                    dataset_id,
                ]

            .. seealso::

                :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`.

        compared_estimators: list of str, default=None
            Name of compared estimators.
            When `None` is given, all the estimators are compared.

        behavior_policy_name: str, default=None
            Name of the behavior policy.

        dataset_id: int, default=None
            Id of the logged dataset.

        alpha: float, default=0.05
            Proportion of the shaded region. The value should be within `[0, 1]`.

        n_cols: int, default=None (> 0)
            Number of columns in the figure.

        share_axes: bool, default=False
            Whether to share x- and y-axes or not.

        legend: bool, default=True
            Whether to include a legend in the scatter plot.

        fig_dir: Path, default=None
            Path to store the bar figure.
            If `None` is given, the figure will not be saved.

        fig_name: str, default="validation_conditional_value_at_risk.png"
            Name of the bar figure.

        """
        compared_estimators = self._check_compared_estimators(
            compared_estimators, ope_type="cumulative_distribution_ope"
        )
        self._check_basic_visualization_inputs(
            n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name
        )

        cvar_dict = self.select_by_conditional_value_at_risk(
            input_dict,
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            alpha=alpha,
            return_true_values=True,
        )

        self._visualize_policy_performance_for_validation(
            estimation_dict=cvar_dict,
            input_dict=input_dict,
            true_value_arg="true_conditional_value_at_risk",
            estimated_value_arg="estimated_conditional_value_at_risk",
            compared_estimators=compared_estimators,
            behavior_policy_name=behavior_policy_name,
            dataset_id=dataset_id,
            n_cols=n_cols,
            share_axes=share_axes,
            legend=legend,
            ylabel=f"CVaR ({alpha})",
            fig_dir=fig_dir,
            fig_name=fig_name,
        )

    @property
    def estimators_name(self):
        """Names of the registered estimators, grouped by OPE type.

        Returns
        -------
        estimators_name: dict
            Dictionary with the keys ``"standard_ope"`` and
            ``"cumulative_distribution_ope"``. Each value is the
            ``estimators_name`` of the corresponding attribute (``self.ope`` /
            ``self.cumulative_distribution_ope``), or `None` when that
            attribute is `None`.

        """
        standard = self.ope
        if standard is None:
            standard_names = None
        else:
            standard_names = standard.estimators_name

        cumulative = self.cumulative_distribution_ope
        if cumulative is None:
            cumulative_names = None
        else:
            cumulative_names = cumulative.estimators_name

        return {
            "standard_ope": standard_names,
            "cumulative_distribution_ope": cumulative_names,
        }