Source code for scope_rl.ope.ops

# Copyright (c) 2023, Haruka Kiyohara, Ren Kishimoto, HAKUHODO Technologies Inc., and Hanjuku-kaso Co., Ltd. All rights reserved.
# Licensed under the Apache 2.0 License.

"""Meta class to handle Off-Policy Selection (OPS) and evaluation of OPE/OPS."""
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional, Union, List, Dict
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.utils import check_scalar
import matplotlib.pyplot as plt

from .ope import (
    OffPolicyEvaluation,
    CumulativeDistributionOPE,
)
from ..utils import (
    MultipleInputDict,
    estimate_confidence_interval_by_bootstrap,
    estimate_confidence_interval_by_hoeffding,
    estimate_confidence_interval_by_empirical_bernstein,
    estimate_confidence_interval_by_t_test,
    defaultdict_to_dict,
)
from ..types import OPEInputDict

markers = ["o", "v", "^", "s", "p", "P", "*", "h", "X", "D", "d"]
dkred = "#A60628"


[docs]@dataclass class OffPolicySelection: """Class to conduct OPS and evaluation of OPE/OPS with multiple estimators simultaneously. Imported as: :class:`scope_rl.ope.OffPolicySelection` Note ----------- **Off-Policy Selection (OPS)** OPS selects the "best" policy among several candidates based on the policy value or other statistics estimated by OPE. .. math:: \\hat{\\pi} := {\\arg \\max}_{\\pi \\in \\Pi} \hat{J}(\\pi) where :math:`\\Pi` is a set of candidate policies and :math:`\hat{J}(\\cdot)` is some OPE estimates of the policy performance. Below, we describe two types of OPE to estimate such policy performance. **Off-Policy Evaluation (OPE)** (Basic) OPE estimates the expected policy performance called the policy value. .. math:: V(\\pi) := \\mathbb{E} \\left[ \\sum_{t=1}^T \\gamma^{t-1} r_t \\mid \\pi \\right] where :math:`r_t` is the reward observed at each timestep :math:`t`, :math:`T` is the total number of timesteps in an episode, and :math:`\\gamma` is the discount factor. .. seealso:: :class:`OffPolicyEvaluation` **Cumulative Distribution OPE** In contrast, cumulative distribution OPE first estimates the following cumulative distribution function. .. math:: F(t, \\pi) := \\mathbb{E} \\left[ \\mathbb{I} \\left \\{ \\sum_{t=1}^T \\gamma^{t-1} r_t \\leq t \\right \\} \\mid \\pi \\right] Then, cumulative distribution OPE also estimates some risk functions including variance, conditional value at risk, and interquartile range based on the CDF estimate. .. seealso:: :class:`CumulativeDistributionOPE` Parameters ----------- ope: OffPolicyEvaluation, default=None Instance of the (standard) OPE class. cumulative_distribution_ope: CumulativeDistributionOPE, default=None Instance of the cumulative distribution OPE class. Examples ---------- Preparation: .. code-block:: python # import necessary module from SCOPE-RL from scope_rl.dataset import SyntheticDataset from scope_rl.policy import EpsilonGreedyHead from scope_rl.ope import CreateOPEInput from scope_rl.ope import OffPolicySelection from scope_rl.ope import OffPolicyEvaluation as OPE from scope_rl.ope.discrete import TrajectoryWiseImportanceSampling as TIS from scope_rl.ope.discrete import PerDecisionImportanceSampling as PDIS from scope_rl.ope import CumulativeDistributionOPE from scope_rl.ope.discrete import CumulativeDistributionTIS as CD_IS from scope_rl.ope.discrete import CumulativeDistributionSNTIS as CD_SNIS # import necessary module from other libraries import gym import rtbgym from d3rlpy.algos import DoubleDQNConfig from d3rlpy.dataset import create_fifo_replay_buffer from d3rlpy.algos import ConstantEpsilonGreedy # initialize environment env = gym.make("RTBEnv-discrete-v0") # define (RL) agent (i.e., policy) and train on the environment ddqn = DoubleDQNConfig().create() buffer = create_fifo_replay_buffer( limit=10000, env=env, ) explorer = ConstantEpsilonGreedy( epsilon=0.3, ) ddqn.fit_online( env=env, buffer=buffer, explorer=explorer, n_steps=10000, n_steps_per_epoch=1000, ) # convert ddqn policy to stochastic data collection policy behavior_policy = EpsilonGreedyHead( ddqn, n_actions=env.action_space.n, epsilon=0.3, name="ddqn_epsilon_0.3", random_state=12345, ) # initialize dataset class dataset = SyntheticDataset( env=env, max_episode_steps=env.step_per_episode, ) # data collection logged_dataset = dataset.obtain_episodes( behavior_policies=behavior_policy, n_trajectories=100, random_state=12345, ) Create Input for OPE: .. code-block:: python # evaluation policy ddqn_ = EpsilonGreedyHead( base_policy=ddqn, n_actions=env.action_space.n, name="ddqn", epsilon=0.0, random_state=12345 ) random_ = EpsilonGreedyHead( base_policy=ddqn, n_actions=env.action_space.n, name="random", epsilon=1.0, random_state=12345 ) # create input for off-policy evaluation (OPE) prep = CreateOPEInput( env=env, ) input_dict = prep.obtain_whole_inputs( logged_dataset=logged_dataset, evaluation_policies=[ddqn_, random_], n_trajectories_on_policy_evaluation=100, random_state=12345, ) **Off-Policy Evaluation and Selection**: .. code-block:: python # OPS ope = OPE( logged_dataset=logged_dataset, ope_estimators=[TIS(), PDIS()], ) cd_ope = CumulativeDistributionOPE( logged_dataset=logged_dataset, ope_estimators=[ CD_IS(estimator_name="cd_is"), CD_SNIS(estimator_name="cd_snis"), ], ) ops = OffPolicySelection( ope=ope, cumulative_distribution_ope=cd_ope, ) ops_dict = ops.select_by_policy_value( input_dict=input_dict, return_metrics=True, ) **Output**: .. code-block:: python >>> ops_dict {'tis': {'estimated_ranking': ['ddqn', 'random'], 'estimated_policy_value': array([21.3624954, 0.3827044]), 'estimated_relative_policy_value': array([1.44732354, 0.02592848]), 'mean_squared_error': 94.79587393975419, 'rank_correlation': SpearmanrResult(correlation=0.9999999999999999, pvalue=nan), 'regret': (0.0, 1), 'type_i_error_rate': 0.0, 'type_ii_error_rate': 0.0, 'safety_threshold': 13.284}, 'pdis': {'estimated_ranking': ['ddqn', 'random'], 'estimated_policy_value': array([18.02806424, 7.13847486]), 'estimated_relative_policy_value': array([1.22141357, 0.48363651]), 'mean_squared_error': 19.45349619733373, 'rank_correlation': SpearmanrResult(correlation=0.9999999999999999, pvalue=nan), 'regret': (0.0, 1), 'type_i_error_rate': 0.0, 'type_ii_error_rate': 0.0, 'safety_threshold': 13.284}} .. seealso:: * :doc:`Quickstart </documentation/quickstart>` * :doc:`Related tutorials (OPS) </documentation/examples/ops>` and :doc:`related tutorials (assessments) <documentation/examples/assessments>` References ------- Vladislav Kurenkov and Sergey Kolesnikov. "Showing Your Offline Reinforcement Learning Work: Online Evaluation Budget Matters." 2022. Shengpu Tang and Jenna Wiens. "Model Selection for Offline Reinforcement Learning: Practical Considerations for Healthcare Settings." 2021. Justin Fu, Mohammad Norouzi, Ofir Nachum, George Tucker, Ziyu Wang, Alexander Novikov, Mengjiao Yang, Michael R. Zhang, Yutian Chen, Aviral Kumar, Cosmin Paduraru, Sergey Levine, and Tom Le Paine. "Benchmarks for Deep Off-Policy Evaluation." 2021. Tom Le Paine, Cosmin Paduraru, Andrea Michi, Caglar Gulcehre, Konrad Zolna, Alexander Novikov, Ziyu Wang, and Nando de Freitas. "Hyperparameter Selection for Offline Reinforcement Learning." 2020. """ ope: Optional[OffPolicyEvaluation] = None cumulative_distribution_ope: Optional[CumulativeDistributionOPE] = None def __post_init__(self): if self.ope is None and self.cumulative_distribution_ope is None: raise RuntimeError( "one of `ope` or `cumulative_distribution_ope` must be given" ) if self.ope is not None and not isinstance(self.ope, OffPolicyEvaluation): raise RuntimeError("ope must be the instance of OffPolicyEvaluation") if self.cumulative_distribution_ope is not None and not isinstance( self.cumulative_distribution_ope, CumulativeDistributionOPE ): raise RuntimeError( "cumulative_distribution_ope must be the instance of CumulativeDistributionOPE" ) self.step_per_trajectory = self.ope.logged_dataset["step_per_trajectory"] check_scalar( self.step_per_trajectory, name="ope.logged_dataset['step_per_trajectory']", target_type=int, min_val=1, ) self.behavior_policy_reward = {} if self.ope.use_multiple_logged_dataset: for ( behavior_policy ) in self.ope.multiple_logged_dataset.behavior_policy_names: logged_dataset_ = self.ope.multiple_logged_dataset.get( behavior_policy_name=behavior_policy, dataset_id=0 ) self.behavior_policy_reward[behavior_policy] = logged_dataset_[ "reward" ].reshape((-1, self.step_per_trajectory)) if self.ope.disable_reward_after_done: done = logged_dataset_["done"].reshape( (-1, self.step_per_trajectory) ) self.behavior_policy_reward[ behavior_policy ] = self.behavior_policy_reward[behavior_policy] * ( 1 - done ).cumprod( axis=1 ) else: behavior_policy = self.ope.logged_dataset["behavior_policy"] self.behavior_policy_reward[behavior_policy] = self.ope.logged_dataset[ "reward" ].reshape((-1, self.step_per_trajectory)) if self.ope.disable_reward_after_done: done = self.ope.logged_dataset["done"].reshape( (-1, self.step_per_trajectory) ) self.behavior_policy_reward[ behavior_policy ] = self.behavior_policy_reward[behavior_policy] * (1 - done).cumprod( axis=1 ) self._estimate_confidence_interval = { "bootstrap": estimate_confidence_interval_by_bootstrap, "hoeffding": estimate_confidence_interval_by_hoeffding, "bernstein": estimate_confidence_interval_by_empirical_bernstein, "ttest": estimate_confidence_interval_by_t_test, } def _check_compared_estimators( self, compared_estimators: Optional[List[str]] = None, ope_type: str = "standard_ope", ): if ope_type == "standard_ope": if self.ope is None: raise RuntimeError( "ope is not given. Please initialize the class with ope attribute" ) else: if self.cumulative_distribution_ope is None: raise RuntimeError( "cumulative_distribution_ope is not given. Please initialize the class with cumulative_distribution_ope attribute" ) if compared_estimators is None: compared_estimators = self.estimators_name[ope_type] elif not set(compared_estimators).issubset(self.estimators_name[ope_type]): raise ValueError( f"compared_estimators must be a subset of self.estimators_name['{ope_type}'], but found False." ) return compared_estimators def _check_basic_visualization_inputs( self, n_cols: Optional[int] = None, fig_dir: Optional[Path] = None, fig_name: Optional[str] = None, ): if n_cols is not None: check_scalar(n_cols, name="n_cols", target_type=int, min_val=1) if fig_dir is not None and not isinstance(fig_dir, Path): raise ValueError(f"fig_dir must be a Path, but {type(fig_dir)} is given") if fig_name is not None and not isinstance(fig_name, str): raise ValueError(f"fig_dir must be a string, but {type(fig_dir)} is given") def _check_topk_inputs( self, input_dict: Union[OPEInputDict, MultipleInputDict], behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, max_topk: Optional[int] = None, metrics: Optional[List[str]] = None, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, gamma: Optional[float] = None, ): if isinstance(input_dict, MultipleInputDict): max_topk_ = 100 if behavior_policy_name is None: if dataset_id is None: for n_eval_policies in input_dict.n_eval_policies.values(): max_topk_ = min(max_topk_, n_eval_policies.min()) else: for n_eval_policies in input_dict.n_eval_policies.values(): max_topk_ = min(max_topk_, n_eval_policies[dataset_id]) else: if dataset_id is None: max_topk_ = min( max_topk_, input_dict.n_eval_policies[behavior_policy_name].min(), ) else: max_topk_ = input_dict.n_eval_policies[behavior_policy_name][ dataset_id ] else: behavior_policy_name = input_dict[list(input_dict.keys())[0]][ "behavior_policy" ] max_topk_ = len(input_dict) if max_topk is None: max_topk = int(max_topk_) else: check_scalar(max_topk, name="max_topk", target_type=int, min_val=1) max_topk = min(max_topk, max_topk_) if metrics is not None: for metric in metrics: if metric not in [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ]: raise ValueError( f"The elements of metrics must be one of 'k-th', 'best', 'worst', 'mean', 'std', 'safety_violation_rate', or 'sharpe_ratio', but {metric} is given." ) if safety_threshold is None: if relative_safety_criteria is not None: check_scalar( relative_safety_criteria, name="relative_safety_criteria", target_type=float, min_val=0.0, ) discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma if behavior_policy_name is not None: behavior_policy_reward = self.behavior_policy_reward[ behavior_policy_name ] behavior_policy_value = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division safety_threshold = relative_safety_criteria * behavior_policy_value safety_threshold = float(safety_threshold) elif len(self.behavior_policy_reward) == 1: behavior_policy_reward = list(self.behavior_policy_reward.values())[ 0 ] behavior_policy_value = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division safety_threshold = relative_safety_criteria * behavior_policy_value safety_threshold = float(safety_threshold) else: safety_threshold = 0.0 else: safety_threshold = 0.0 check_scalar( safety_threshold, name="safety_threshold", target_type=float, ) return max_topk, safety_threshold def _obtain_true_selection_result( self, input_dict: OPEInputDict, return_variance: bool = False, return_lower_quartile: bool = False, return_conditional_value_at_risk: bool = False, return_by_dataframe: bool = False, quartile_alpha: float = 0.05, cvar_alpha: float = 0.05, ): """Obtain the oracle selection result based on the ground-truth policy value. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. return_variance: bool, default=False Whether to return the variance or not. return_lower_quartile: bool. default=False Whether to return the lower interquartile or not. return_conditional_value_at_risk: bool, default=False Whether to return the conditional value at risk or not. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. quartile_alpha: float, default=0.05 Proportion of the shaded region of the interquartile range. cvar_alpha: float, default=0.05 Proportion of the shaded region of the conditional value at risk. Return ------- ground_truth_dict/ground_truth_df: dict or dataframe Dictionary/dataframe containing the following ground-truth (on-policy) metrics. .. code-block:: python key: [ ranking, policy_value, relative_policy_value, variance, ranking_by_lower_quartile, lower_quartile, ranking_by_conditional_value_at_risk, conditional_value_at_risk, parameters, # only when return_by_dataframe == False ] ranking: list of str Name of the candidate policies sorted by the ground-truth policy value. policy_value: list of float Ground-truth policy value of the candidate policies (sorted by ranking). relative_policy_value: list of float Ground-truth relative policy value of the candidate policies compared to the behavior policy (sorted by ranking). variance: list of float Ground-truth variance of the trajectory-wise reward of the candidate policies (sorted by ranking). If return_variance is `False`, `None` is recorded. ranking_by_lower_quartile: list of str Name of the candidate policies sorted by the ground-truth lower quartile of the trajectory-wise reward. If return_lower_quartile is `False`, `None` is recorded. lower_quartile: list of float Ground-truth lower quartile of the candidate policies (sorted by ranking_by_lower_quartile). If return_lower_quartile is `False`, `None` is recorded. ranking_by_conditional_value_at_risk: list of str Name of the candidate policies sorted by the ground-truth conditional value at risk. If return_conditional_value_at_risk is `False`, `None` is recorded. conditional_value_at_risk: list of float Ground-truth conditional value at risk of the candidate policies (sorted by ranking_by_conditional_value_at_risk). If return_conditional_value_at_risk is `False`, `None` is recorded. parameters: dict Dictionary containing quartile_alpha, and cvar_alpha. If return_by_dataframe is `True`, parameters will not be returned. """ candidate_policy_names = list(input_dict.keys()) for eval_policy in candidate_policy_names: if input_dict[eval_policy]["on_policy_policy_value"] is None: raise ValueError( f"one of the candidate policies, {eval_policy}, does not contain on-policy policy value in input_dict" ) behavior_policy = input_dict[eval_policy]["behavior_policy"] n_policies = len(candidate_policy_names) n_samples = len(input_dict[eval_policy]["on_policy_policy_value"]) policy_value = np.zeros(n_policies) for i, eval_policy in enumerate(candidate_policy_names): policy_value[i] = input_dict[eval_policy]["on_policy_policy_value"].mean() ranking_index = np.argsort(policy_value)[::-1] ranking = [candidate_policy_names[ranking_index[i]] for i in range(n_policies)] gamma = input_dict[eval_policy]["gamma"] discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_reward = self.behavior_policy_reward[behavior_policy] behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum( axis=1 ).mean() + 1e-10 # to avoid zero division policy_value = np.sort(policy_value)[::-1] relative_policy_value = policy_value / behavior_policy_value if return_variance: variance = np.zeros(n_policies) for i, eval_policy in enumerate(candidate_policy_names): variance[i] = input_dict[eval_policy]["on_policy_policy_value"].var( ddof=1 ) variance = variance[ranking_index] if return_lower_quartile: lower_quartile = np.zeros(n_policies) for i, eval_policy in enumerate(candidate_policy_names): lower_quartile[i] = np.quantile( input_dict[eval_policy]["on_policy_policy_value"], q=quartile_alpha ) quartile_ranking_index = np.argsort(policy_value)[::-1] ranking_by_lower_quartile = [ candidate_policy_names[quartile_ranking_index[i]] for i in range(n_policies) ] lower_quartile = np.sort(lower_quartile)[::-1] if return_conditional_value_at_risk: cvar = np.zeros(n_policies) for i, eval_policy in enumerate(candidate_policy_names): cvar[i] = np.sort(input_dict[eval_policy]["on_policy_policy_value"])[ : int(n_samples * cvar_alpha) ].mean() cvar_ranking_index = np.argsort(cvar)[::-1] ranking_by_cvar = [ candidate_policy_names[cvar_ranking_index[i]] for i in range(n_policies) ] cvar = np.sort(cvar)[::-1] ground_truth_dict = { "ranking": ranking, "policy_value": policy_value, "relative_policy_value": relative_policy_value, "variance": variance if return_variance else None, "ranking_by_lower_quartile": ranking_by_lower_quartile if return_lower_quartile else None, "lower_quartile": lower_quartile if return_lower_quartile else None, "ranking_by_conditional_value_at_risk": ranking_by_cvar if return_conditional_value_at_risk else None, "conditional_value_at_risk": cvar if return_conditional_value_at_risk else None, "parameters": { "quartile_alpha": quartile_alpha if return_lower_quartile else None, "cvar_alpha": cvar_alpha if return_conditional_value_at_risk else None, }, } if return_by_dataframe: ground_truth_df = pd.DataFrame() for key in ground_truth_dict.keys(): if ground_truth_dict[key] is None or key == "parameters": continue ground_truth_df[key] = ground_truth_dict[key] return ground_truth_df if return_by_dataframe else ground_truth_dict def _select_by_policy_value( self, input_dict: OPEInputDict, compared_estimators: Optional[List[str]] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, ): """Rank the candidate policies by their estimated policy values. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_policy_value, estimated_relative_policy_value, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value: list of float Estimated policy value of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value: list of float Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict when return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimators calculated across candidate evaluation policies. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ behavior_policy_name = list(input_dict.values())[0]["behavior_policy"] dataset_id = list(input_dict.values())[0]["dataset_id"] gamma = list(input_dict.values())[0]["gamma"] discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name] behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum( axis=1 ).mean() + 1e-10 # to avoid zero division if safety_threshold is None: if relative_safety_criteria is None: safety_threshold = 0.0 else: safety_threshold = relative_safety_criteria * behavior_policy_value estimated_policy_value_dict = self.ope.estimate_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) ground_truth_dict = self.obtain_true_selection_result(input_dict) true_ranking = ground_truth_dict["ranking"] true_policy_value = ground_truth_dict["policy_value"] candidate_policy_names = ( true_ranking if return_metrics else list(input_dict.keys()) ) n_policies = len(candidate_policy_names) ops_dict = {} for i, estimator in enumerate(compared_estimators): estimated_policy_value_ = np.zeros(n_policies) true_policy_value_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_policy_value_[j] = estimated_policy_value_dict[eval_policy][ estimator ] true_policy_value_[j] = true_policy_value[j] estimated_ranking_index_ = np.argsort(estimated_policy_value_)[::-1] true_ranking_index_ = np.argsort(true_policy_value_)[::-1] estimated_ranking = [ candidate_policy_names[estimated_ranking_index_[i]] for i in range(n_policies) ] estimated_policy_value = np.sort(estimated_policy_value_)[::-1] estimated_relative_policy_value = ( estimated_policy_value / behavior_policy_value ) if return_metrics: mse = mean_squared_error(true_policy_value, estimated_policy_value_) rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_) regret = ( true_policy_value[:top_k_in_eval_metrics].sum() - true_policy_value[estimated_ranking_index_][ :top_k_in_eval_metrics ].sum() ) true_safety = true_policy_value >= safety_threshold estimated_safety = estimated_policy_value_ >= safety_threshold if true_safety.sum() > 0: type_i_error_rate = ( true_safety > estimated_safety ).sum() / true_safety.sum() else: type_i_error_rate = 0.0 if (1 - true_safety).sum() > 0: type_ii_error_rate = (true_safety < estimated_safety).sum() / ( 1 - true_safety ).sum() else: type_ii_error_rate = 0.0 ops_dict[estimator] = { "estimated_ranking": estimated_ranking, "estimated_policy_value": estimated_policy_value, "estimated_relative_policy_value": estimated_relative_policy_value, } if return_true_values: ops_dict[estimator]["true_ranking"] = true_ranking_index_[ estimated_ranking_index_ ] ops_dict[estimator]["true_policy_value"] = true_policy_value_[ estimated_ranking_index_ ] ops_dict[estimator]["true_relative_policy_value"] = ( true_policy_value_[estimated_ranking_index_] / behavior_policy_value ) if return_metrics: ops_dict[estimator]["mean_squared_error"] = mse ops_dict[estimator]["rank_correlation"] = rankcorr ops_dict[estimator]["regret"] = (regret, top_k_in_eval_metrics) ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate ops_dict[estimator]["safety_threshold"] = safety_threshold if return_by_dataframe: ranking_df_dict = defaultdict(pd.DataFrame) for i, estimator in enumerate(compared_estimators): ranking_df_ = pd.DataFrame() ranking_df_["estimated_ranking"] = ops_dict[estimator][ "estimated_ranking" ] ranking_df_["estimated_policy_value"] = ops_dict[estimator][ "estimated_policy_value" ] ranking_df_["estimated_relative_policy_value"] = ops_dict[estimator][ "estimated_relative_policy_value" ] if return_true_values: ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"] ranking_df_["true_policy_value"] = ops_dict[estimator][ "true_policy_value" ] ranking_df_["true_relative_policy_value"] = ops_dict[estimator][ "true_relative_policy_value" ] ranking_df_dict[estimator] = ranking_df_ ranking_df_dict = defaultdict_to_dict(ranking_df_dict) if return_metrics: ( mse, rankcorr, pvalue, regret, type_i, type_ii, ) = ( [], [], [], [], [], [], ) for i, estimator in enumerate(compared_estimators): mse.append(ops_dict[estimator]["mean_squared_error"]) rankcorr.append(ops_dict[estimator]["rank_correlation"][0]) pvalue.append(ops_dict[estimator]["rank_correlation"][1]) regret.append(ops_dict[estimator]["regret"][0]) type_i.append(ops_dict[estimator]["type_i_error_rate"]) type_ii.append(ops_dict[estimator]["type_ii_error_rate"]) metric_df = pd.DataFrame() metric_df["estimator"] = compared_estimators metric_df["mean_squared_error"] = mse metric_df["rank_correlation"] = rankcorr metric_df["pvalue"] = pvalue metric_df[f"regret@{top_k_in_eval_metrics}"] = regret metric_df["type_i_error_rate"] = type_i metric_df["type_ii_error_rate"] = type_ii dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict return dfs if return_by_dataframe else ops_dict def _select_by_policy_value_via_cumulative_distribution_ope( self, input_dict: OPEInputDict, compared_estimators: Optional[List[str]] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, ): """Rank the candidate policies by their estimated policy value via cumulative distribution OPE methods. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_policy_value, estimated_relative_policy_value, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value: list of float Estimated policy value of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value: list of float Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimators calculated across candidate evaluation policies. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ behavior_policy_name = list(input_dict.values())[0]["behavior_policy"] dataset_id = list(input_dict.values())[0]["dataset_id"] gamma = list(input_dict.values())[0]["gamma"] discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name] behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum( axis=1 ).mean() + 1e-10 # to avoid zero division if safety_threshold is None: if relative_safety_criteria is None: safety_threshold = 0.0 else: safety_threshold = relative_safety_criteria * behavior_policy_value estimated_policy_value_dict = self.cumulative_distribution_ope.estimate_mean( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) ground_truth_dict = self.obtain_true_selection_result(input_dict) true_ranking = ground_truth_dict["ranking"] true_policy_value = ground_truth_dict["policy_value"] candidate_policy_names = ( true_ranking if return_metrics else list(input_dict.keys()) ) n_policies = len(candidate_policy_names) ops_dict = {} for i, estimator in enumerate(compared_estimators): estimated_policy_value_ = np.zeros(n_policies) true_policy_value_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_policy_value_[j] = estimated_policy_value_dict[eval_policy][ estimator ] true_policy_value_[j] = true_policy_value[j] estimated_ranking_index_ = np.argsort(estimated_policy_value_)[::-1] true_ranking_index_ = np.argsort(true_policy_value_)[::-1] estimated_ranking = [ candidate_policy_names[estimated_ranking_index_[i]] for i in range(n_policies) ] estimated_policy_value = np.sort(estimated_policy_value_)[::-1] estimated_relative_policy_value = ( estimated_policy_value / behavior_policy_value ) if return_metrics: mse = mean_squared_error( true_policy_value, np.nan_to_num(estimated_policy_value_) ) rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_) regret = ( true_policy_value[:top_k_in_eval_metrics].sum() - true_policy_value[estimated_ranking_index_][ :top_k_in_eval_metrics ].sum() ) true_safety = true_policy_value >= safety_threshold estimated_safety = estimated_policy_value_ >= safety_threshold if true_safety.sum() > 0: type_i_error_rate = ( true_safety > estimated_safety ).sum() / true_safety.sum() else: type_i_error_rate = 0.0 if (1 - true_safety).sum() > 0: type_ii_error_rate = (true_safety < estimated_safety).sum() / ( 1 - true_safety ).sum() else: type_ii_error_rate = 0.0 ops_dict[estimator] = { "estimated_ranking": estimated_ranking, "estimated_policy_value": estimated_policy_value, "estimated_relative_policy_value": estimated_relative_policy_value, } if return_true_values: ops_dict[estimator]["true_ranking"] = true_ranking_index_[ estimated_ranking_index_ ] ops_dict[estimator]["true_policy_value"] = true_policy_value_[ estimated_ranking_index_ ] ops_dict[estimator]["true_relative_policy_value"] = ( true_policy_value_[estimated_ranking_index_] / behavior_policy_value ) if return_metrics: ops_dict[estimator]["mean_squared_error"] = mse ops_dict[estimator]["rank_correlation"] = rankcorr ops_dict[estimator]["regret"] = (regret, top_k_in_eval_metrics) ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate ops_dict[estimator]["safety_threshold"] = safety_threshold if return_by_dataframe: ranking_df_dict = defaultdict(pd.DataFrame) for i, estimator in enumerate(compared_estimators): ranking_df_ = pd.DataFrame() ranking_df_["estimated_ranking"] = ops_dict[estimator][ "estimated_ranking" ] ranking_df_["estimated_policy_value"] = ops_dict[estimator][ "estimated_policy_value" ] ranking_df_["estimated_relative_policy_value"] = ops_dict[estimator][ "estimated_relative_policy_value" ] if return_true_values: ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"] ranking_df_["true_policy_value"] = ops_dict[estimator][ "true_policy_value" ] ranking_df_["true_relative_policy_value"] = ops_dict[estimator][ "true_relative_policy_value" ] ranking_df_dict[estimator] = ranking_df_ ranking_df_dict = defaultdict_to_dict(ranking_df_dict) if return_metrics: ( mse, rankcorr, pvalue, regret, type_i, type_ii, ) = ( [], [], [], [], [], [], ) for i, estimator in enumerate(compared_estimators): mse.append(ops_dict[estimator]["mean_squared_error"]) rankcorr.append(ops_dict[estimator]["rank_correlation"][0]) pvalue.append(ops_dict[estimator]["rank_correlation"][1]) regret.append(ops_dict[estimator]["regret"][0]) type_i.append(ops_dict[estimator]["type_i_error_rate"]) type_ii.append(ops_dict[estimator]["type_ii_error_rate"]) metric_df = pd.DataFrame() metric_df["estimator"] = compared_estimators metric_df["mean_squared_error"] = mse metric_df["rank_correlation"] = rankcorr metric_df["pvalue"] = pvalue metric_df[f"regret@{top_k_in_eval_metrics}"] = regret metric_df["type_i_error_rate"] = type_i metric_df["type_ii_error_rate"] = type_ii dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict return dfs if return_by_dataframe else ops_dict def _select_by_policy_value_lower_bound( self, input_dict: OPEInputDict, compared_estimators: Optional[List[str]] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, cis: List[str] = ["bootstrap"], alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, ): """Rank the candidate policies by their estimated policy value lower bound. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"] Estimation methods for confidence intervals. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. n_bootstrap_samples: int, default=100 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [ci][estimator_name][ estimated_ranking, estimated_policy_value_lower_bound, estimated_relative_policy_value_lower_bound, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value lower bound. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value_lower_bound: list of float Estimated policy value lower bound of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value_lower_bound: list of float Estimated relative policy value lower bound of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ ground_truth_dict = self.obtain_true_selection_result(input_dict) true_ranking = ground_truth_dict["ranking"] true_policy_value = ground_truth_dict["policy_value"] behavior_policy_name = list(input_dict.values())[0]["behavior_policy"] dataset_id = list(input_dict.values())[0]["dataset_id"] gamma = list(input_dict.values())[0]["gamma"] discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_reward = self.behavior_policy_reward[behavior_policy_name] behavior_policy_value = (discount[np.newaxis, :] * behavior_policy_reward).sum( axis=1 ).mean() + 1e-10 # to avoid zero division if safety_threshold is None: if relative_safety_criteria is None: safety_threshold = 0.0 else: safety_threshold = relative_safety_criteria * behavior_policy_value candidate_policy_names = ( true_ranking if return_metrics else list(input_dict.keys()) ) n_policies = len(candidate_policy_names) ops_dict = defaultdict(dict) for ci in cis: estimated_policy_value_interval_dict = self.ope.estimate_intervals( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, ci=ci, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) for i, estimator in enumerate(compared_estimators): estimated_policy_value_lower_bound_ = np.zeros(n_policies) true_policy_value_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_policy_value_lower_bound_[ j ] = estimated_policy_value_interval_dict[eval_policy][estimator][ f"{100 * (1. - alpha)}% CI (lower)" ] true_policy_value_[j] = true_policy_value[j] estimated_ranking_index_ = np.argsort( estimated_policy_value_lower_bound_ )[::-1] true_ranking_index_ = np.argsort(true_policy_value_)[::-1] estimated_ranking = [ candidate_policy_names[estimated_ranking_index_[i]] for i in range(n_policies) ] estimated_policy_value_lower_bound = np.sort( estimated_policy_value_lower_bound_ )[::-1] estimated_relative_policy_value_lower_bound = ( estimated_policy_value_lower_bound / behavior_policy_value ) if return_metrics: rankcorr = spearmanr( np.arange(n_policies), estimated_ranking_index_ ) regret = ( true_policy_value[:top_k_in_eval_metrics].sum() - true_policy_value[estimated_ranking_index_][ :top_k_in_eval_metrics ].sum() ) true_safety = true_policy_value >= safety_threshold estimated_safety = ( estimated_policy_value_lower_bound_ >= safety_threshold ) if true_safety.sum() > 0: type_i_error_rate = ( true_safety > estimated_safety ).sum() / true_safety.sum() else: type_i_error_rate = 0.0 if (1 - true_safety).sum() > 0: type_ii_error_rate = (true_safety < estimated_safety).sum() / ( 1 - true_safety ).sum() else: type_ii_error_rate = 0.0 ops_dict[ci][estimator] = { "estimated_ranking": estimated_ranking, "estimated_policy_value_lower_bound": estimated_policy_value_lower_bound, "estimated_relative_policy_value_lower_bound": estimated_relative_policy_value_lower_bound, } if return_true_values: ops_dict[ci][estimator]["true_ranking"] = true_ranking_index_[ estimated_ranking_index_ ] ops_dict[ci][estimator]["true_policy_value"] = true_policy_value_[ estimated_ranking_index_ ] ops_dict[ci][estimator]["true_relative_policy_value"] = ( true_policy_value_[estimated_ranking_index_] / behavior_policy_value ) if return_metrics: ops_dict[ci][estimator]["mean_squared_error"] = None ops_dict[ci][estimator]["rank_correlation"] = rankcorr ops_dict[ci][estimator]["regret"] = (regret, top_k_in_eval_metrics) ops_dict[ci][estimator]["type_i_error_rate"] = type_i_error_rate ops_dict[ci][estimator]["type_ii_error_rate"] = type_ii_error_rate ops_dict[ci][estimator]["safety_threshold"] = safety_threshold ops_dict = defaultdict_to_dict(ops_dict) if return_by_dataframe: ranking_df_dict = defaultdict(lambda: defaultdict(pd.DataFrame)) for ci in cis: for i, estimator in enumerate(compared_estimators): ranking_df_ = pd.DataFrame() ranking_df_["estimated_ranking"] = ops_dict[ci][estimator][ "estimated_ranking" ] ranking_df_["estimated_policy_value_lower_bound"] = ops_dict[ci][ estimator ]["estimated_policy_value_lower_bound"] ranking_df_[ "estimated_relative_policy_value_lower_bound" ] = ops_dict[ci][estimator][ "estimated_relative_policy_value_lower_bound" ] if return_true_values: ranking_df_["true_ranking"] = ops_dict[ci][estimator][ "true_ranking" ] ranking_df_["true_policy_value"] = ops_dict[ci][estimator][ "true_policy_value" ] ranking_df_["true_relative_policy_value"] = ops_dict[ci][ estimator ]["true_relative_policy_value"] ranking_df_dict[ci][estimator] = ranking_df_ ranking_df_dict = defaultdict_to_dict(ranking_df_dict) if return_metrics: ( ci_, estimator_, rankcorr, pvalue, regret, type_i, type_ii, ) = ( [], [], [], [], [], [], [], ) for ci in cis: for i, estimator in enumerate(compared_estimators): ci_.append(ci) estimator_.append(estimator) rankcorr.append(ops_dict[ci][estimator]["rank_correlation"][0]) pvalue.append(ops_dict[ci][estimator]["rank_correlation"][1]) regret.append(ops_dict[ci][estimator]["regret"][0]) type_i.append(ops_dict[ci][estimator]["type_i_error_rate"]) type_ii.append(ops_dict[ci][estimator]["type_ii_error_rate"]) metric_df = pd.DataFrame() metric_df["ci"] = ci_ metric_df["estimator"] = estimator_ metric_df["mean_squared_error"] = np.nan metric_df["rank_correlation"] = rankcorr metric_df["pvalue"] = pvalue metric_df[f"regret@{top_k_in_eval_metrics}"] = regret metric_df["type_i_error_rate"] = type_i metric_df["type_ii_error_rate"] = type_ii dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict return dfs if return_by_dataframe else ops_dict def _select_by_lower_quartile( self, input_dict: OPEInputDict, compared_estimators: Optional[List[str]] = None, alpha: float = 0.05, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, safety_threshold: float = 0.0, ): """Rank the candidate policies by their estimated lower quartile of the trajectory-wise reward. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. return_true_values: bool, default=False Whether to return the true lower quartile of the trajectory-wise reward and corresponding ranking of the candidate evaluation policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. safety_threshold: float, default=0.0 (>= 0) The lower quartile required to be considered a safe policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_lower_quartile, true_ranking, true_lower_quartile, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated lower quartile of the trajectory-wise reward. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_lower_quartile: list of float Estimated lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_lower_quartile: list of float True lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimated lower quartile of the trajectory-wise reward. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. safety_threshold: float The lower quartile required to be considered a safe policy. """ behavior_policy_name = list(input_dict.values())[0]["behavior_policy"] dataset_id = list(input_dict.values())[0]["dataset_id"] estimated_interquartile_range_dict = ( self.cumulative_distribution_ope.estimate_interquartile_range( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, ) ) ground_truth_dict = self.obtain_true_selection_result( input_dict, return_lower_quartile=True, quartile_alpha=alpha, ) true_ranking = ground_truth_dict["ranking_by_lower_quartile"] true_lower_quartile = ground_truth_dict["lower_quartile"] candidate_policy_names = ( true_ranking if return_metrics else list(input_dict.keys()) ) n_policies = len(candidate_policy_names) ops_dict = {} for i, estimator in enumerate(compared_estimators): estimated_lower_quartile_ = np.zeros(n_policies) true_lower_quartile_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_lower_quartile_[j] = estimated_interquartile_range_dict[ eval_policy ][estimator][f"{100 * (1. - alpha)}% quartile (lower)"] true_lower_quartile_[j] = true_lower_quartile[j] estimated_ranking_index_ = np.argsort(estimated_lower_quartile_)[::-1] true_ranking_index_ = np.argsort(true_lower_quartile_)[::-1] estimated_ranking = [ candidate_policy_names[estimated_ranking_index_[i]] for i in range(n_policies) ] estimated_lower_quartile = np.sort(estimated_lower_quartile_)[::-1] if return_metrics: mse = mean_squared_error(true_lower_quartile, estimated_lower_quartile_) rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_) true_safety = true_lower_quartile >= safety_threshold estimated_safety = estimated_lower_quartile_ >= safety_threshold if true_safety.sum() > 0: type_i_error_rate = ( true_safety > estimated_safety ).sum() / true_safety.sum() else: type_i_error_rate = 0.0 if (1 - true_safety).sum() > 0: type_ii_error_rate = (true_safety < estimated_safety).sum() / ( 1 - true_safety ).sum() else: type_ii_error_rate = 0.0 ops_dict[estimator] = { "estimated_ranking": estimated_ranking, "estimated_lower_quartile": estimated_lower_quartile, } if return_true_values: ops_dict[estimator]["true_ranking"] = true_ranking_index_[ estimated_ranking_index_ ] ops_dict[estimator]["true_lower_quartile"] = true_lower_quartile_[ estimated_ranking_index_ ] if return_metrics: ops_dict[estimator]["mean_squared_error"] = mse ops_dict[estimator]["rank_correlation"] = rankcorr ops_dict[estimator]["regret"] = None ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate ops_dict[estimator]["safety_threshold"] = safety_threshold if return_by_dataframe: ranking_df_dict = defaultdict(pd.DataFrame) for i, estimator in enumerate(compared_estimators): ranking_df_ = pd.DataFrame() ranking_df_["estimated_ranking"] = ops_dict[estimator][ "estimated_ranking" ] ranking_df_["estimated_lower_quartile"] = ops_dict[estimator][ "estimated_lower_quartile" ] if return_true_values: ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"] ranking_df_["true_lower_quartile"] = ops_dict[estimator][ "true_lower_quartile" ] ranking_df_dict[estimator] = ranking_df_ ranking_df_dict = defaultdict_to_dict(ranking_df_dict) if return_metrics: ( mse, rankcorr, pvalue, type_i, type_ii, ) = ( [], [], [], [], [], ) for i, estimator in enumerate(compared_estimators): mse.append(ops_dict[estimator]["mean_squared_error"]) rankcorr.append(ops_dict[estimator]["rank_correlation"][0]) pvalue.append(ops_dict[estimator]["rank_correlation"][1]) type_i.append(ops_dict[estimator]["type_i_error_rate"]) type_ii.append(ops_dict[estimator]["type_ii_error_rate"]) metric_df = pd.DataFrame() metric_df["estimator"] = compared_estimators metric_df["mean_squared_error"] = mse metric_df["rank_correlation"] = rankcorr metric_df["pvalue"] = pvalue metric_df["regret"] = np.nan metric_df["type_i_error_rate"] = type_i metric_df["type_ii_error_rate"] = type_ii dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict return dfs if return_by_dataframe else ops_dict def _select_by_conditional_value_at_risk( self, input_dict: OPEInputDict, compared_estimators: Optional[List[str]] = None, alpha: float = 0.05, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, safety_threshold: float = 0.0, ): """Rank the candidate policies by their estimated conditional value at risk. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. return_true_values: bool, default=False Whether to return the true conditional value at risk and corresponding ranking of the candidate evaluation policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_conditional_value_at_risk, true_ranking, true_conditional_value_at_risk, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated conditional value at risk. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_conditional_value_at_risk: list of float Estimated conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_conditional_value_at_risk: list of float True conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimated conditional value at risk. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple or float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is True`. safety_threshold: float The conditional value at risk required to be considered a safe policy. """ behavior_policy_name = list(input_dict.values())[0]["behavior_policy"] dataset_id = list(input_dict.values())[0]["dataset_id"] estimated_cvar_dict = ( self.cumulative_distribution_ope.estimate_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alphas=alpha, ) ) ground_truth_dict = self.obtain_true_selection_result( input_dict, return_conditional_value_at_risk=True, cvar_alpha=alpha, ) true_ranking = ground_truth_dict["ranking_by_conditional_value_at_risk"] true_cvar = ground_truth_dict["conditional_value_at_risk"] candidate_policy_names = ( true_ranking if return_metrics else list(input_dict.keys()) ) n_policies = len(candidate_policy_names) ops_dict = {} for i, estimator in enumerate(compared_estimators): estimated_cvar_ = np.zeros(n_policies) true_cvar_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_cvar_[j] = estimated_cvar_dict[eval_policy][estimator] true_cvar_[j] = true_cvar[j] estimated_ranking_index_ = np.argsort(estimated_cvar_)[::-1] true_ranking_index_ = np.argsort(true_cvar_)[::-1] estimated_cvar_ = np.zeros(n_policies) for j, eval_policy in enumerate(candidate_policy_names): estimated_cvar_[j] = estimated_cvar_dict[eval_policy][estimator] estimated_ranking_index_ = np.argsort(estimated_cvar_)[::-1] estimated_ranking = [ candidate_policy_names[estimated_ranking_index_[i]] for i in range(n_policies) ] estimated_cvar = np.sort(estimated_cvar_)[::-1] if return_metrics: mse = mean_squared_error(true_cvar, np.nan_to_num(estimated_cvar_)) rankcorr = spearmanr(np.arange(n_policies), estimated_ranking_index_) true_safety = true_cvar >= safety_threshold estimated_safety = estimated_cvar_ >= safety_threshold if true_safety.sum() > 0: type_i_error_rate = ( true_safety > estimated_safety ).sum() / true_safety.sum() else: type_i_error_rate = 0.0 if (1 - true_safety).sum() > 0: type_ii_error_rate = (true_safety < estimated_safety).sum() / ( 1 - true_safety ).sum() else: type_ii_error_rate = 0.0 ops_dict[estimator] = { "estimated_ranking": estimated_ranking, "estimated_conditional_value_at_risk": estimated_cvar, } if return_true_values: ops_dict[estimator]["true_ranking"] = true_ranking_index_[ estimated_ranking_index_ ] ops_dict[estimator]["true_conditional_value_at_risk"] = true_cvar_[ estimated_ranking_index_ ] if return_metrics: ops_dict[estimator]["mean_squared_error"] = mse ops_dict[estimator]["rank_correlation"] = rankcorr ops_dict[estimator]["regret"] = None ops_dict[estimator]["type_i_error_rate"] = type_i_error_rate ops_dict[estimator]["type_ii_error_rate"] = type_ii_error_rate ops_dict[estimator]["safety_threshold"] = safety_threshold if return_by_dataframe: ranking_df_dict = defaultdict(pd.DataFrame) for i, estimator in enumerate(compared_estimators): ranking_df_ = pd.DataFrame() ranking_df_["estimated_ranking"] = ops_dict[estimator][ "estimated_ranking" ] ranking_df_["estimated_conditional_value_at_risk"] = ops_dict[ estimator ]["estimated_conditional_value_at_risk"] if return_true_values: ranking_df_["true_ranking"] = ops_dict[estimator]["true_ranking"] ranking_df_["true_conditional_value_at_risk"] = ops_dict[estimator][ "true_conditional_value_at_risk" ] ranking_df_dict[estimator] = ranking_df_ ranking_df_dict = defaultdict_to_dict(ranking_df_dict) if return_metrics: ( mse, rankcorr, pvalue, type_i, type_ii, ) = ( [], [], [], [], [], ) for i, estimator in enumerate(compared_estimators): mse.append(ops_dict[estimator]["mean_squared_error"]) rankcorr.append(ops_dict[estimator]["rank_correlation"][0]) pvalue.append(ops_dict[estimator]["rank_correlation"][1]) type_i.append(ops_dict[estimator]["type_i_error_rate"]) type_ii.append(ops_dict[estimator]["type_ii_error_rate"]) metric_df = pd.DataFrame() metric_df["estimator"] = compared_estimators metric_df["mean_squared_error"] = mse metric_df["rank_correlation"] = rankcorr metric_df["pvalue"] = pvalue metric_df["regret"] = np.nan metric_df["type_i_error_rate"] = type_i metric_df["type_ii_error_rate"] = type_ii dfs = (ranking_df_dict, metric_df) if return_metrics else ranking_df_dict return dfs if return_by_dataframe else ops_dict
[docs] def obtain_true_selection_result( self, input_dict: Union[OPEInputDict, MultipleInputDict], behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, return_variance: bool = False, return_lower_quartile: bool = False, return_conditional_value_at_risk: bool = False, return_by_dataframe: bool = False, quartile_alpha: float = 0.05, cvar_alpha: float = 0.05, ): """Obtain the oracle selection result based on the ground-truth policy value. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. return_variance: bool, default=False Whether to return the variance or not. return_lower_quartile: bool. default=False Whether to return the lower interquartile or not. return_conditional_value_at_risk: bool, default=False Whether to return the conditional value at risk or not. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. quartile_alpha: float, default=0.05 Proportion of the shaded region of the interquartile range. cvar_alpha: float, default=0.05 Proportion of the shaded region of the conditional value at risk. Return ------- ground_truth_dict/ground_truth_df: dict or dataframe (, list of dict or dataframe) Dictionary/dataframe containing the following ground-truth (on-policy) metrics. .. code-block:: python key: [ ranking, policy_value, relative_policy_value, variance, ranking_by_lower_quartile, lower_quartile, ranking_by_conditional_value_at_risk, conditional_value_at_risk, parameters, # only when return_by_dataframe == False ] ranking: list of str Name of the candidate policies sorted by the ground-truth policy value. policy_value: list of float Ground-truth policy value of the candidate policies (sorted by ranking). relative_policy_value: list of float Ground-truth relative policy value of the candidate policies compared to the behavior policy (sorted by ranking). variance: list of float Ground-truth variance of the trajectory-wise reward of the candidate policies (sorted by ranking). If return_variance is `False`, `None` is recorded. ranking_by_lower_quartile: list of str Name of the candidate policies sorted by the ground-truth lower quartile of the trajectory-wise reward. If return_lower_quartile is `False`, `None` is recorded. lower_quartile: list of float Ground-truth lower quartile of the candidate policies (sorted by ranking_by_lower_quartile). If return_lower_quartile is `False`, `None` is recorded. ranking_by_conditional_value_at_risk: list of str Name of the candidate policies sorted by the ground-truth conditional value at risk. If return_conditional_value_at_risk is `False`, `None` is recorded. conditional_value_at_risk: list of float Ground-truth conditional value at risk of the candidate policies (sorted by ranking_by_conditional_value_at_risk). If return_conditional_value_at_risk is `False`, `None` is recorded. parameters: dict Dictionary containing quartile_alpha, and cvar_alpha. If return_by_dataframe is `True`, parameters will not be returned. """ if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: ground_truth = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ground_truth_ = self._obtain_true_selection_result( input_dict_, return_variance=return_variance, return_lower_quartile=return_lower_quartile, return_conditional_value_at_risk=return_conditional_value_at_risk, return_by_dataframe=return_by_dataframe, quartile_alpha=quartile_alpha, cvar_alpha=cvar_alpha, ) ground_truth[behavior_policy].append(ground_truth_) ground_truth = defaultdict_to_dict(ground_truth) elif behavior_policy_name is None and dataset_id is not None: ground_truth = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id ) ground_truth_ = self._obtain_true_selection_result( input_dict_, return_variance=return_variance, return_lower_quartile=return_lower_quartile, return_conditional_value_at_risk=return_conditional_value_at_risk, return_by_dataframe=return_by_dataframe, quartile_alpha=quartile_alpha, cvar_alpha=cvar_alpha, ) ground_truth[behavior_policy] = ground_truth_ elif behavior_policy_name is not None and dataset_id is None: ground_truth = [] for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ground_truth_ = self._obtain_true_selection_result( input_dict_, return_variance=return_variance, return_lower_quartile=return_lower_quartile, return_conditional_value_at_risk=return_conditional_value_at_risk, return_by_dataframe=return_by_dataframe, quartile_alpha=quartile_alpha, cvar_alpha=cvar_alpha, ) ground_truth.append(ground_truth_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id ) ground_truth = self._obtain_true_selection_result( input_dict_, return_variance=return_variance, return_lower_quartile=return_lower_quartile, return_conditional_value_at_risk=return_conditional_value_at_risk, return_by_dataframe=return_by_dataframe, quartile_alpha=quartile_alpha, cvar_alpha=cvar_alpha, ) else: ground_truth = self._obtain_true_selection_result( input_dict, return_variance=return_variance, return_lower_quartile=return_lower_quartile, return_conditional_value_at_risk=return_conditional_value_at_risk, return_by_dataframe=return_by_dataframe, quartile_alpha=quartile_alpha, cvar_alpha=cvar_alpha, ) return ground_truth
[docs] def select_by_policy_value( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, ): """Rank the candidate policies by their estimated policy values. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe) Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_policy_value, estimated_relative_policy_value, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value: list of float Estimated policy value of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value: list of float Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict when return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimators calculated across candidate evaluation policies. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) if self.ope.use_multiple_logged_dataset: if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: if ( self.ope.multiple_logged_dataset.n_datasets != input_dict.n_datasets ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = defaultdict(list) metric_df = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df[behavior_policy].append(ops_result_[0]) metric_df[behavior_policy].append(ops_result_[1]) ops_result = ( defaultdict_to_dict(ranking_df), defaultdict_to_dict(metric_df), ) else: ops_result = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result[behavior_policy].append(ops_result_) ops_result = defaultdict_to_dict(ops_result) elif behavior_policy_name is None and dataset_id is not None: if ( self.ope.multiple_logged_dataset.behavior_policy_names != input_dict.behavior_policy_names ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies, but found False." ) if return_metrics and return_by_dataframe: ranking_df = {} metric_df = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df[behavior_policy] = ops_result_[0] metric_df[behavior_policy] = ops_result_[1] ops_result = (ranking_df, metric_df) else: ops_result = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result[behavior_policy] = ops_result_ elif behavior_policy_name is not None and dataset_id is None: if ( self.ope.multiple_logged_dataset.n_datasets[ behavior_policy_name ] != input_dict.n_datasets[behavior_policy_name] ): raise ValueError( "Expected that logged datasets and input dicts consists of the same dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = [] metric_df = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df.append(ops_result_[0]) metric_df.append(ops_result_[1]) ops_result = (ranking_df, metric_df) else: ops_result = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result.append(ops_result_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id ) ops_result = self._select_by_policy_value( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) else: ops_result = self._select_by_policy_value( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) else: if isinstance(input_dict, MultipleInputDict): raise ValueError( "when using LoggedDataset, please use InputDict instead of MultipleInputDict" ) ops_result = self._select_by_policy_value( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) return ops_result
[docs] def select_by_policy_value_via_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, ): """Rank the candidate policies by their estimated policy value via cumulative distribution OPE methods. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe) Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_policy_value, estimated_relative_policy_value, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value: list of float Estimated policy value of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value: list of float Estimated relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimators calculated across candidate evaluation policies. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df when return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) if self.cumulative_distribution_ope.use_multiple_logged_dataset: if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets != input_dict.n_datasets ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = defaultdict(list) metric_df = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df[behavior_policy].append(ops_result_[0]) metric_df[behavior_policy].append(ops_result_[1]) ops_result = ( defaultdict_to_dict(ranking_df), defaultdict_to_dict(metric_df), ) else: ops_result = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result[behavior_policy].append(ops_result_) ops_result = defaultdict_to_dict(ops_result) elif behavior_policy_name is None and dataset_id is not None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names != input_dict.behavior_policy_names ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies, but found False." ) if return_metrics and return_by_dataframe: ranking_df = {} metric_df = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df[behavior_policy] = ops_result_[0] metric_df[behavior_policy] = ops_result_[1] ops_result = (ranking_df, metric_df) else: ops_result = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result[behavior_policy] = ops_result_ elif behavior_policy_name is not None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[ behavior_policy_name ] != input_dict.n_datasets[behavior_policy_name] ): raise ValueError( "Expected that logged datasets and input dicts consists of the same dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = [] metric_df = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ranking_df.append(ops_result_[0]) metric_df.append(ops_result_[1]) ops_result = (ranking_df, metric_df) else: ops_result = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ops_result.append(ops_result_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id ) ops_result = ( self._select_by_policy_value_via_cumulative_distribution_ope( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ) else: ops_result = ( self._select_by_policy_value_via_cumulative_distribution_ope( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) ) else: if isinstance(input_dict, MultipleInputDict): raise ValueError( "when using LoggedDataset, please use InputDict instead of MultipleInputDict" ) ops_result = self._select_by_policy_value_via_cumulative_distribution_ope( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, ) return ops_result
[docs] def select_by_policy_value_lower_bound( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, top_k_in_eval_metrics: int = 1, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, cis: List[str] = ["bootstrap"], alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, ): """Rank the candidate policies by their estimated policy value lower bound. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. return_true_values: bool, default=False Whether to return the true policy value and corresponding ranking of the candidate policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: rank-correlation, regret@k, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. top_k_in_eval_metrics: int, default=1 How many candidate policies are included in regret@k. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None (>= 0) The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"] Estimation methods for confidence intervals. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. n_bootstrap_samples: int, default=100 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe) Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [ci][estimator_name][ estimated_ranking, estimated_policy_value_lower_bound, estimated_relative_policy_value_lower_bound, true_ranking, true_policy_value, true_relative_policy_value, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated policy value lower bound. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_policy_value_lower_bound: list of float Estimated policy value lower bound of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_relative_policy_value_lower_bound: list of float Estimated relative policy value lower bound of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_policy_value: list of float True policy value of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_relative_policy_value: list of float True relative policy value of the candidate policies compared to the behavior policy (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: tuple of float and int Regret@k and k. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. safety_threshold: float A policy whose policy value is below the given threshold is to be considered unsafe. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) if self.ope.use_multiple_logged_dataset: if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: if ( self.ope.multiple_logged_dataset.n_datasets != input_dict.n_datasets ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = defaultdict(list) metric_df = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ranking_df[behavior_policy].append(ops_result_[0]) metric_df[behavior_policy].append(ops_result_[1]) ops_result = ( defaultdict_to_dict(ranking_df), defaultdict_to_dict(metric_df), ) else: ops_result = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ops_result[behavior_policy].append(ops_result_) ops_result = defaultdict_to_dict(ops_result) elif behavior_policy_name is None and dataset_id is not None: if ( self.ope.multiple_logged_dataset.behavior_policy_names != input_dict.behavior_policy_names ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies, but found False." ) if return_metrics and return_by_dataframe: ranking_df = {} metric_df = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ranking_df[behavior_policy] = ops_result_[0] metric_df[behavior_policy] = ops_result_[1] ops_result = (ranking_df, metric_df) else: ops_result = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ops_result[behavior_policy] = ops_result_ elif behavior_policy_name is not None and dataset_id is None: if ( self.ope.multiple_logged_dataset.n_datasets[ behavior_policy_name ] != input_dict.n_datasets[behavior_policy_name] ): raise ValueError( "Expected that logged datasets and input dicts consists of the same dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = [] metric_df = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ranking_df.append(ops_result_[0]) metric_df.append(ops_result_[1]) ops_result = (ranking_df, metric_df) else: ops_result = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) ops_result.append(ops_result_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) ops_result = self._select_by_policy_value_lower_bound( input_dict_, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) else: ops_result = self._select_by_policy_value_lower_bound( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) else: if isinstance(input_dict, MultipleInputDict): raise ValueError( "when using LoggedDataset, please use InputDict instead of MultipleInputDict" ) ops_result = self._select_by_policy_value_lower_bound( input_dict, compared_estimators=compared_estimators, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, top_k_in_eval_metrics=top_k_in_eval_metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) return ops_result
[docs] def select_by_lower_quartile( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, safety_threshold: float = 0.0, ): """Rank the candidate policies by their estimated lower quartile of the trajectory-wise reward. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. return_true_values: bool, default=False Whether to return the true lower quartile of the trajectory-wise reward and corresponding ranking of the candidate evaluation policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. safety_threshold: float, default=0.0 (>= 0) The lower quartile required to be considered a safe policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_lower_quartile, true_ranking, true_lower_quartile, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated lower quartile of the trajectory-wise reward. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_lower_quartile: list of float Estimated lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_lower_quartile: list of float True lower quartile of the trajectory-wise reward of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimated lower quartile of the trajectory-wise reward. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple of float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. safety_threshold: float The lower quartile required to be considered a safe policy. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) if self.cumulative_distribution_ope.use_multiple_logged_dataset: if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets != input_dict.n_datasets ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = defaultdict(list) metric_df = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df[behavior_policy].append(ops_result_[0]) metric_df[behavior_policy].append(ops_result_[1]) ops_result = ( defaultdict_to_dict(ranking_df), defaultdict_to_dict(metric_df), ) else: ops_result = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = ( ops_result_ ) = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result[behavior_policy].append(ops_result_) ops_result = defaultdict_to_dict(ops_result) elif behavior_policy_name is None and dataset_id is not None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names != input_dict.behavior_policy_names ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies, but found False." ) if return_metrics and return_by_dataframe: ranking_df = {} metric_df = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df[behavior_policy] = ops_result_[0] metric_df[behavior_policy] = ops_result_[1] ops_result = (ranking_df, metric_df) else: ops_result = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result[behavior_policy] = ops_result_ elif behavior_policy_name is not None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[ behavior_policy_name ] != input_dict.n_datasets[behavior_policy_name] ): raise ValueError( "Expected that logged datasets and input dicts consists of the same dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = [] metric_df = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df.append(ops_result_[0]) metric_df.append(ops_result_[1]) ops_result = (ranking_df, metric_df) else: ops_result = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result.append(ops_result_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id ) ops_result = self._select_by_lower_quartile( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) else: ops_result = self._select_by_lower_quartile( input_dict, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) else: if isinstance(input_dict, MultipleInputDict): raise ValueError( "when using LoggedDataset, please use InputDict instead of MultipleInputDict" ) ops_result = self._select_by_lower_quartile( input_dict, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) return ops_result
[docs] def select_by_conditional_value_at_risk( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, return_true_values: bool = False, return_metrics: bool = False, return_by_dataframe: bool = False, safety_threshold: float = 0.0, ): """Rank the candidate policies by their estimated conditional value at risk. Parameters ------- input_dict: OPEInputDict or MultipleLoggedDataset Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. return_true_values: bool, default=False Whether to return the true conditional value at risk and corresponding ranking of the candidate evaluation policies. return_metrics: bool, default=False Whether to return the following evaluation metrics in terms of OPE and OPS: mean-squared-error, rank-correlation, and Type I and Type II error rate. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. Return ------- ops_dict/(ranking_df_dict, metric_df): dict or dataframe (, list of dict or dataframe) Dictionary/dataframe containing the result of OPS conducted by OPE estimators. .. code-block:: python key: [estimator_name][ estimated_ranking, estimated_conditional_value_at_risk, true_ranking, true_conditional_value_at_risk, mean_squared_error, rank_correlation, regret, type_i_error_rate, type_ii_error_rate, ] estimated_ranking: list of str Name of the candidate policies sorted by the estimated conditional value at risk. Recorded in ranking_df_dict if return_by_dataframe is `True`. estimated_conditional_value_at_risk: list of float Estimated conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded in ranking_df_dict if return_by_dataframe is `True`. true_ranking: list of int Ranking index of the (true) conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. true_conditional_value_at_risk: list of float True conditional value at risk of the candidate policies (sorted by estimated_ranking). Recorded only when return_true_values is `True`. Recorded in ranking_df_dict if return_by_dataframe is `True`. mean_squared_error: float Mean-squared-error of the estimated conditional value at risk. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. rank_correlation: tuple or float Rank correlation coefficient between the true ranking and the estimated ranking, and its pvalue. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. regret: None This is for API consistency. Recorded in metric_df if return_by_dataframe is `True`. type_i_error_rate: float Type I error rate of the hypothetical test. True Negative when the policy is safe but estimated as unsafe. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is `True`. type_ii_error_rate: float Type II error rate of the hypothetical test. False Positive when the policy is unsafe but undetected. Recorded only when return_metric is `True`. Recorded in metric_df if return_by_dataframe is True`. safety_threshold: float The conditional value at risk required to be considered a safe policy. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) if self.cumulative_distribution_ope.use_multiple_logged_dataset: if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets != input_dict.n_datasets ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies and dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = defaultdict(list) metric_df = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df[behavior_policy].append(ops_result_[0]) metric_df[behavior_policy].append(ops_result_[1]) ops_result = ( defaultdict_to_dict(ranking_df), defaultdict_to_dict(metric_df), ) else: ops_result = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id_, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result[behavior_policy].append(ops_result_) ops_result = defaultdict_to_dict(ops_result) elif behavior_policy_name is None and dataset_id is not None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.behavior_policy_names != input_dict.behavior_policy_names ): raise ValueError( "Expected that logged datasets and input dicts consists of the same behavior policies, but found False." ) if return_metrics and return_by_dataframe: ranking_df = {} metric_df = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df[behavior_policy] = ops_result_[0] metric_df[behavior_policy] = ops_result_[1] ops_result = (ranking_df, metric_df) else: ops_result = {} for behavior_policy in input_dict.behavior_policy_names: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy, dataset_id=dataset_id, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result[behavior_policy] = ops_result_ elif behavior_policy_name is not None and dataset_id is None: if ( self.cumulative_distribution_ope.multiple_logged_dataset.n_datasets[ behavior_policy_name ] != input_dict.n_datasets[behavior_policy_name] ): raise ValueError( "Expected that logged datasets and input dicts consists of the same dataset ids, but found False." ) if return_metrics and return_by_dataframe: ranking_df = [] metric_df = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ranking_df.append(ops_result_[0]) metric_df.append(ops_result_[1]) ops_result = (ranking_df, metric_df) else: ops_result = [] for dataset_id_ in range( input_dict.n_datasets[behavior_policy_name] ): input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id_, ) ops_result_ = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) ops_result.append(ops_result_) else: input_dict_ = input_dict.get( behavior_policy_name=behavior_policy_name, dataset_id=dataset_id ) ops_result = self._select_by_conditional_value_at_risk( input_dict_, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) else: ops_result = self._select_by_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) else: if isinstance(input_dict, MultipleInputDict): raise ValueError( "when using LoggedDataset, please use InputDict instead of MultipleInputDict" ) ops_result = self._select_by_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, alpha=alpha, return_true_values=return_true_values, return_metrics=return_metrics, return_by_dataframe=return_by_dataframe, safety_threshold=safety_threshold, ) return ops_result
[docs] def visualize_policy_value_for_selection( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, ci: str = "bootstrap", n_bootstrap_samples: int = 100, random_state: Optional[int] = None, is_relative: bool = False, hue: str = "estimator", sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_policy_value_standard_ope.png", ): """Visualize the policy value estimated by OPE estimators (box plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. is_relative: bool, default=False If `True`, the method visualizes the estimated policy value of the evaluation policy relative to the on-policy policy value of the behavior policy. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_policy_value_standard_ope.png" Name of the bar figure. """ self.ope.visualize_off_policy_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, ci=ci, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, is_relative=is_relative, hue=hue, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_cumulative_distribution_function_for_selection( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, hue: str = "estimator", legend: bool = True, n_cols: Optional[int] = None, fig_dir: Optional[Path] = None, fig_name: str = "estimated_cumulative_distribution_function.png", ) -> None: """Visualize the cumulative distribution function (cdf plot). Parameters ------- input_dict: OPEInputDict or MultipleLoggedDataset Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the figure. n_cols: int, default=None (> 0) Number of columns in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_cumulative_distribution_function.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_cumulative_distribution_function( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, hue=hue, legend=legend, n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_policy_value_of_cumulative_distribution_ope_for_selection( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, is_relative: bool = False, hue: str = "estimator", sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_policy_value_cumulative_distribution_ope.png", ) -> None: """Visualize the policy value estimated by cumulative distribution OPE estimators (box plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Significance level. The value should bw within `[0, 1)`. is_relative: bool, default=False If `True`, the method visualizes the estimated policy value of the evaluation policy relative to the ground-truth policy value of the behavior policy. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. sharey: bool, default=False If `True`, the y-axis will be shared among different evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_policy_value_cumulative_distribution_ope.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_policy_value( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, is_relative=is_relative, hue=hue, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_conditional_value_at_risk_for_selection( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alphas: Optional[np.ndarray] = None, hue: str = "estimator", legend: bool = True, n_cols: Optional[int] = None, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_conditional_value_at_risk.png", ) -> None: """Visualize the conditional value at risk estimated by cumulative distribution OPE estimators (cdf plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alphas: array-like of shape (n_alpha, ), default=None Set of proportions of the shaded region. The values should be within `[0, 1)`. If `None` is given, :class:`np.linspace(0, 1, 21)` will be used. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the figure. n_cols: int, default=None (> 0) Number of columns in the figure. sharey: bool, default=False If `True`, the y-axis will be shared among different evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_conditional_value_at_risk.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_conditional_value_at_risk( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alphas=alphas, hue=hue, legend=legend, n_cols=n_cols, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_interquartile_range_for_selection( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, hue: str = "estimator", sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_interquartile_range.png", ) -> None: """Visualize the interquartile range estimated by cumulative distribution OPE estimators (box plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. sharey: bool, default=False If `True`, the y-axis will be shared among different evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_interquartile_range.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_interquartile_range( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, hue=hue, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_policy_value_with_multiple_estimates_standard_ope( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, plot_type: str = "ci", hue: str = "estimator", legend: bool = True, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_policy_value_multiple_standard_ope.png", ) -> None: """Visualize the policy value estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. plot_type: {"ci", "scatter", "violin"}, default="ci" Type of plot. If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals. If "scatter" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_policy_value_multiple.png" Name of the bar figure. """ self.ope.visualize_policy_value_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, plot_type=plot_type, hue=hue, legend=legend, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_cumulative_distribution_function_with_multiple_estimates( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, scale_min: Optional[float] = None, scale_max: Optional[float] = None, n_partition: Optional[int] = None, plot_type: str = "ci_hue", hue: str = "estimator", legend: bool = True, n_cols: Optional[int] = None, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_policy_value_multiple.png", ) -> None: """Visualize the policy value estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. This function is not applicable when the data-driven reward scaler is used. Please set ``scale_min``, ``scale_max``, and ``n_partition`` to use. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. scale_min: float, default=None Minimum value of the reward scale in the CDF. scale_max: float, default=None Maximum value of the reward scale in the CDF. n_partition: int, default=None Number of partitions in the reward scale (x-axis of the CDF). plot_type: {"ci_hue", "ci_behavior_policy", "enumerate"}, default="ci_hue" Type of plot. If "ci" is given, the method visualizes the average policy value and its 95% confidence intervals based on the multiple estimate. If "enumerate" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_policy_value_multiple.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_cumulative_distribution_function_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, scale_min=scale_min, scale_max=scale_max, n_partition=n_partition, plot_type=plot_type, hue=hue, legend=legend, n_cols=n_cols, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_policy_value_with_multiple_estimates_cumulative_distribution_ope( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, plot_type: str = "ci", hue: str = "estimator", legend: bool = True, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_policy_value_multiple_cumulative_distribution_ope.png", ) -> None: """Visualize the policy value estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. plot_type: {"ci", "scatter", "violin"}, default="ci" Type of plot. If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals. If "scatter" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_policy_value_multiple.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_policy_value_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, plot_type=plot_type, hue=hue, legend=legend, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_variance_with_multiple_estimates( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, plot_type: str = "ci", hue: str = "estimator", legend: bool = True, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_variance_multiple.png", ) -> None: """Visualize the variance of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. plot_type: {"ci", "scatter", "violin"}, default="ci" Type of plot. If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals. If "scatter" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_variance_multiple.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_variance_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, plot_type=plot_type, hue=hue, legend=legend, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_conditional_value_at_risk_with_multiple_estimates( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, alpha: float = 0.05, plot_type: str = "ci", hue: str = "estimator", legend: bool = True, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_conditional_value_at_risk_multiple.png", ) -> None: """Visualize the conditional value at risk of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. alpha: float = 0.05. Proportion of the shaded region in CVaR estimate. The value should be within `[0, 1)`. plot_type: {"ci", "scatter", "violin"}, default="ci" Type of plot. If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals. If "scatter" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_conditional_value_at_risk_multiple.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_conditional_value_at_risk_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, alpha=alpha, plot_type=plot_type, hue=hue, legend=legend, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_lower_quartile_with_multiple_estimates( self, input_dict: MultipleInputDict, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, alpha: float = 0.05, plot_type: str = "ci", hue: str = "estimator", legend: bool = True, sharey: bool = False, fig_dir: Optional[Path] = None, fig_name: str = "estimated_conditional_value_at_risk_multiple.png", ) -> None: """Visualize the lower quartile of the trajectory-wise reward under the evaluation policy estimated by OPE estimators across multiple logged dataset. Note ------- This function is applicable only when MultipleLoggedDataset is used and MultipleInputDict is collected by the same evaluation policy across logged datasets. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. alpha: float = 0.05. Proportion of the shaded region in CVaR estimate. The value should be within `[0, 1)`. plot_type: {"ci", "scatter", "violin"}, default="ci" Type of plot. If "ci" is given, we get the empirical average of the estimated values with their estimated confidence intervals. If "scatter" is given, we get a scatter plot of estimated values. hue: {"estimator", "policy"}, default="estimator" Hue of the plot. legend: bool, default=True Whether to include a legend in the scatter plot. sharey: bool, default=False If `True`, the y-axis will be shared among different estimators or evaluation policies. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="estimated_conditional_value_at_risk_multiple.png" Name of the bar figure. """ self.cumulative_distribution_ope.visualize_lower_quartile_with_multiple_estimates( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, alpha=alpha, plot_type=plot_type, hue=hue, legend=legend, sharey=sharey, fig_dir=fig_dir, fig_name=fig_name, )
def _obtain_topk_policy_performance( self, true_dict: Dict, estimation_dict: Dict, input_dict: Union[OPEInputDict, MultipleInputDict], true_dict_ranking_arg: str, true_dict_value_arg: str, estimation_dict_ranking_arg: str, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, max_topk: Optional[int] = None, ope_alpha: Optional[float] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, return_by_dataframe: bool = False, ): """Calculate top-k policy deployment performances. Parameters ------- true_dict: dict Dictionary containing the true policy performance. estimation_dict: dict Dictionary containing the estimated policy performance. input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. true_dict_ranking_arg: str Name of the key indicating the ranked list of the candidate policies in true_dict. true_dict_value_arg: str Name of the key indicating the true policy performance of the candidate policies in true_dict. estimation_dict_ranking_arg: str Name of the ley indicaing the estimated ranking of the candidate policies in true_dict. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. behavior_policy: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. max_topk: int, default=None Maximum number of policies to be deployed. ope_alpha: float, default=None Significance level. The value should be within `[0, 1)`. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that when returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if return_safety_violation_rate: metrics = ["k-th", "best", "worst", "mean", "std", "safety_violation_rate"] else: metrics = ["k-th", "best", "worst", "mean", "std"] if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: ranking_dict = defaultdict(list) for behavior_policy, n_datasets in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): true_dict[behavior_policy][dataset_id_] = dict( zip( true_dict[behavior_policy][dataset_id_][ true_dict_ranking_arg ], true_dict[behavior_policy][dataset_id_][ true_dict_value_arg ], ) ) tmp_ranking_dict = dict() for i, estimator in enumerate(compared_estimators): policy_performance = np.zeros( input_dict.n_eval_policies[behavior_policy][dataset_id_] ) estimated_ranking = estimation_dict[behavior_policy][ dataset_id_ ][estimator]["estimated_ranking"] for i, eval_policy in enumerate(estimated_ranking): policy_performance[i] = true_dict[behavior_policy][ dataset_id_ ][eval_policy] tmp_ranking_dict[estimator] = policy_performance ranking_dict[behavior_policy].append(tmp_ranking_dict) elif behavior_policy_name is None and dataset_id is not None: ranking_dict = {} for behavior_policy in input_dict.behavior_policy_names: true_dict[behavior_policy] = dict( zip( true_dict[behavior_policy][true_dict_ranking_arg], true_dict[behavior_policy][true_dict_value_arg], ) ) tmp_ranking_dict = dict() for i, estimator in enumerate(compared_estimators): policy_performance = np.zeros( input_dict.n_eval_policies[behavior_policy][dataset_id] ) estimated_ranking = estimation_dict[behavior_policy][estimator][ estimation_dict_ranking_arg ] for i, eval_policy in enumerate(estimated_ranking): policy_performance[i] = true_dict[behavior_policy][ eval_policy ] tmp_ranking_dict[estimator] = policy_performance ranking_dict[behavior_policy] = tmp_ranking_dict elif behavior_policy_name is not None and dataset_id is None: ranking_dict = [] for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]): true_dict[dataset_id_] = dict( zip( true_dict[dataset_id_][true_dict_ranking_arg], true_dict[dataset_id_][true_dict_value_arg], ) ) tmp_ranking_dict = dict() for i, estimator in enumerate(compared_estimators): policy_performance = np.zeros( input_dict.n_eval_policies[behavior_policy_name][ dataset_id_ ] ) estimated_ranking = estimation_dict[dataset_id_][estimator][ estimation_dict_ranking_arg ] for i, eval_policy in enumerate(estimated_ranking): policy_performance[i] = true_dict[dataset_id_][eval_policy] tmp_ranking_dict[estimator] = policy_performance ranking_dict.append(tmp_ranking_dict) else: true_dict = dict( zip( true_dict[true_dict_ranking_arg], true_dict[true_dict_value_arg], ) ) ranking_dict = dict() for i, estimator in enumerate(compared_estimators): policy_performance = np.zeros( input_dict.n_eval_policies[behavior_policy_name][dataset_id] ) estimated_ranking = estimation_dict[estimator][ estimation_dict_ranking_arg ] for i, eval_policy in enumerate(estimated_ranking): policy_performance[i] = true_dict[eval_policy] ranking_dict[estimator] = policy_performance else: true_dict = dict( zip( true_dict[true_dict_ranking_arg], true_dict[true_dict_value_arg], ) ) ranking_dict = dict() for i, estimator in enumerate(compared_estimators): policy_performance = np.zeros((len(input_dict),)) estimated_ranking = estimation_dict[estimator][ estimation_dict_ranking_arg ] for i, eval_policy in enumerate(estimated_ranking): policy_performance[i] = true_dict[eval_policy] ranking_dict[estimator] = policy_performance if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_cum_reward = {} behavior_policy_value = {} if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None: for behavior_policy in input_dict.behavior_policy_names: behavior_policy_reward = self.behavior_policy_reward[ behavior_policy ] behavior_policy_cum_reward[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division else: behavior_policy_reward = self.behavior_policy_reward[ behavior_policy_name ] behavior_policy_cum_reward[behavior_policy_name] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy_name] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division else: behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"] behavior_policy_reward = self.behavior_policy_reward[behavior_policy] behavior_policy_cum_reward[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division metric_dict = defaultdict(dict) if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: n_datasets = input_dict.n_datasets total_n_datasets = np.array(list(n_datasets.values())).sum() baseline = np.zeros(total_n_datasets) for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): l = 0 for behavior_policy in input_dict.behavior_policy_names: for dataset_id_ in range(n_datasets[behavior_policy]): if i == 0: if true_dict_value_arg == "policy_value": baseline[l] = behavior_policy_value[ behavior_policy ] elif ( true_dict_value_arg == "conditional_value_at_risk" ): baseline_reward = ( behavior_policy_cum_reward[ behavior_policy ] ) baseline[l] = np.sort(baseline_reward)[ : int(len(baseline_reward) * ope_alpha) ].mean() elif true_dict_value_arg == "lower_quartile": baseline_reward = ( behavior_policy_cum_reward[ behavior_policy ] ) baseline[l] = np.quantile( baseline_reward, q=ope_alpha, ) topk_values = ranking_dict[behavior_policy][ dataset_id_ ][estimator][: topk + 1] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std(ddof=1) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) l += 1 metric_dict[estimator][metric] = topk_metric if i == 0: baseline = np.tile(baseline, (max_topk, 1)) sharpe_ratio = ( np.clip(metric_dict[estimator]["best"] - baseline, 0, None) / metric_dict[estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio elif behavior_policy_name is None and dataset_id is not None: total_n_datasets = len(input_dict.behavior_policy_names) baseline = np.zeros(total_n_datasets) for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): if i == 0: if true_dict_value_arg == "policy_value": baseline[l] = behavior_policy_value[ behavior_policy ] elif ( true_dict_value_arg == "conditional_value_at_risk" ): baseline_reward = behavior_policy_cum_reward[ behavior_policy ] baseline[l] = np.sort(baseline_reward)[ : int(len(baseline_reward) * ope_alpha) ].mean() elif true_dict_value_arg == "lower_quartile": baseline_reward = behavior_policy_cum_reward[ behavior_policy ] baseline[l] = np.quantile( baseline_reward, q=ope_alpha, ) topk_values = ranking_dict[behavior_policy][estimator][ : topk + 1 ] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std(ddof=1) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[estimator][metric] = topk_metric if i == 0: baseline = np.tile(baseline, (max_topk, 1)) sharpe_ratio = ( np.clip(metric_dict[estimator]["best"] - baseline, 0, None) / metric_dict[estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio elif behavior_policy_name is not None and dataset_id is None: total_n_datasets = input_dict.n_datasets[behavior_policy_name] if true_dict_value_arg == "policy_value": baseline = behavior_policy_value[behavior_policy_name] elif true_dict_value_arg == "conditional_value_at_risk": baseline_reward = behavior_policy_cum_reward[behavior_policy_name] baseline = np.sort(baseline_reward)[ : int(len(baseline_reward) * ope_alpha) ].mean() elif true_dict_value_arg == "lower_quartile": baseline_reward = behavior_policy_cum_reward[behavior_policy_name] baseline = np.quantile( baseline_reward, q=ope_alpha, ) for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): for l in range(total_n_datasets): topk_values = ranking_dict[l][estimator][: topk + 1] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std(ddof=1) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[estimator][metric] = topk_metric sharpe_ratio = ( np.clip(metric_dict[estimator]["best"] - baseline, 0, None) / metric_dict[estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio else: total_n_datasets = 1 if true_dict_value_arg == "policy_value": baseline = behavior_policy_value[behavior_policy_name] elif true_dict_value_arg == "conditional_value_at_risk": baseline_reward = behavior_policy_cum_reward[behavior_policy_name] baseline = np.sort(baseline_reward)[ : int(len(baseline_reward) * ope_alpha) ].mean() elif true_dict_value_arg == "lower_quartile": baseline_reward = behavior_policy_cum_reward[behavior_policy_name] baseline = np.quantile( baseline_reward, q=ope_alpha, ) for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): topk_values = ranking_dict[estimator][: topk + 1] if metric == "k-th": topk_metric[topk, 0] = topk_values[-1] elif metric == "best": topk_metric[topk, 0] = topk_values.max() elif metric == "worst": topk_metric[topk, 0] = topk_values.min() elif metric == "mean": topk_metric[topk, 0] = topk_values.mean() elif metric == "std": topk_metric[topk, 0] = topk_values.std(ddof=1) else: topk_metric[topk, 0] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[estimator][metric] = topk_metric sharpe_ratio = ( np.clip(metric_dict[estimator]["best"] - baseline, 0, None) / metric_dict[estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio else: behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"] if true_dict_value_arg == "policy_value": baseline = behavior_policy_value[behavior_policy] elif true_dict_value_arg == "conditional_value_at_risk": baseline_reward = behavior_policy_cum_reward[behavior_policy] baseline = np.sort(baseline_reward)[ : int(len(baseline_reward) * ope_alpha) ].mean() elif true_dict_value_arg == "lower_quartile": baseline_reward = behavior_policy_cum_reward[behavior_policy] baseline = np.quantile( baseline_reward, q=ope_alpha, ) for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, 1)) for topk in range(max_topk): topk_values = ranking_dict[estimator][: topk + 1] if metric == "k-th": topk_metric[topk, 0] = topk_values[-1] elif metric == "best": topk_metric[topk, 0] = topk_values.max() elif metric == "worst": topk_metric[topk, 0] = topk_values.min() elif metric == "mean": topk_metric[topk, 0] = topk_values.mean() elif metric == "std": topk_metric[topk, 0] = topk_values.std(ddof=1) else: topk_metric[topk, 0] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[estimator][metric] = topk_metric sharpe_ratio = ( np.clip(metric_dict[estimator]["best"] - baseline, 0, None) / metric_dict[estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[estimator]["sharpe_ratio"] = sharpe_ratio metric_dict = defaultdict_to_dict(metric_dict) if return_by_dataframe: metrics.extend(["sharpe_ratio"]) metric_df = [] for i, estimator in enumerate(compared_estimators): metric_df_ = pd.DataFrame() metric_df_["topk"] = np.arange(max_topk) metric_df_["estimator"] = estimator metric_df_ = metric_df_[["estimator", "topk"]] for metric in metrics: metric_df_[metric] = metric_dict[estimator][metric].mean(axis=1) metric_df.append(metric_df_) metric = pd.concat(metric_df, axis=0) else: metric = metric_dict return metric
[docs] def obtain_topk_policy_value_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (policy value) selected by standard OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking", true_dict_value_arg="policy_value", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, return_by_dataframe=return_by_dataframe, )
[docs] def obtain_topk_policy_value_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (policy value) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) estimation_dict = self.select_by_policy_value_via_cumulative_distribution_ope( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking", true_dict_value_arg="policy_value", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, return_by_dataframe=return_by_dataframe, )
[docs] def obtain_topk_policy_value_selected_by_lower_bound( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, clip_sharpe_ratio: bool = False, cis: List[str] = ["bootstrap"], ope_alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, return_by_dataframe: bool = False, ): """Obtain the topk deployment (policy value) result selected by its estimated lower bound. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"] Estimation methods for confidence intervals. ope_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. n_bootstrap_samples: int, default=100 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to the (standard) policy value here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) if return_safety_violation_rate: metrics = ["k-th", "best", "worst", "mean", "std", "safety_violation_rate"] else: metrics = ["k-th", "best", "worst", "mean", "std"] policy_value_dict = self.select_by_policy_value_lower_bound( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, return_true_values=True, cis=cis, alpha=ope_alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) discount = np.full(self.step_per_trajectory, gamma).cumprod() / gamma behavior_policy_cum_reward = {} behavior_policy_value = {} if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None: for behavior_policy in input_dict.behavior_policy_names: behavior_policy_reward = self.behavior_policy_reward[ behavior_policy ] behavior_policy_cum_reward[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division else: behavior_policy_reward = self.behavior_policy_reward[ behavior_policy_name ] behavior_policy_cum_reward[behavior_policy_name] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy_name] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division else: behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"] behavior_policy_reward = self.behavior_policy_reward[behavior_policy] behavior_policy_cum_reward[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ) + 1e-10 # to avoid zero division behavior_policy_value[behavior_policy] = ( discount[np.newaxis, :] * behavior_policy_reward ).sum( axis=1 ).mean() + 1e-10 # to avoid zero division metric_dict = defaultdict(lambda: defaultdict(dict)) if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: n_datasets = input_dict.n_datasets total_n_datasets = np.array(list(n_datasets.values())).sum() baseline = np.zeros(total_n_datasets) for ci in cis: for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): l = 0 for behavior_policy in input_dict.behavior_policy_names: for dataset_id_ in range( n_datasets[behavior_policy] ): if i == 0 and ci == cis[0]: baseline[l] = behavior_policy_value[ behavior_policy ] topk_values = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "true_policy_value" ][ : topk + 1 ] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std( ddof=1 ) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) l += 1 metric_dict[ci][estimator][metric] = topk_metric if i == 0 and ci == cis[0]: baseline = np.tile(baseline, (max_topk, 1)) sharpe_ratio = ( np.clip( metric_dict[ci][estimator]["best"] - baseline, 0, None ) / metric_dict[ci][estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num( sharpe_ratio[1:], posinf=1e2 ) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio elif behavior_policy_name is None and dataset_id is not None: total_n_datasets = len(input_dict.behavior_policy_names) baseline = np.zeros(total_n_datasets) for ci in cis: for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): if i == 0 and ci == cis[0]: baseline[l] = behavior_policy_value[ behavior_policy ] topk_values = policy_value_dict[behavior_policy][ ci ][estimator]["true_policy_value"][: topk + 1] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std(ddof=1) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[ci][estimator][metric] = topk_metric if i == 0 and ci == cis[0]: baseline = np.tile(baseline, (max_topk, 1)) sharpe_ratio = ( np.clip( metric_dict[ci][estimator]["best"] - baseline, 0, None ) / metric_dict[ci][estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num( sharpe_ratio[1:], posinf=1e2 ) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio elif behavior_policy_name is not None and dataset_id is None: total_n_datasets = input_dict.n_datasets[behavior_policy_name] baseline = behavior_policy_value[behavior_policy_name] for ci in cis: for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): for l in range(total_n_datasets): topk_values = policy_value_dict[l][ci][estimator][ "true_policy_value" ][: topk + 1] if metric == "k-th": topk_metric[topk, l] = topk_values[-1] elif metric == "best": topk_metric[topk, l] = topk_values.max() elif metric == "worst": topk_metric[topk, l] = topk_values.min() elif metric == "mean": topk_metric[topk, l] = topk_values.mean() elif metric == "std": topk_metric[topk, l] = topk_values.std(ddof=1) else: topk_metric[topk, l] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[ci][estimator][metric] = topk_metric sharpe_ratio = ( np.clip( metric_dict[ci][estimator]["best"] - baseline, 0, None ) / metric_dict[ci][estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num( sharpe_ratio[1:], posinf=1e2 ) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio else: total_n_datasets = 1 baseline = behavior_policy_value[behavior_policy_name] for ci in cis: for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, total_n_datasets)) for topk in range(max_topk): topk_values = policy_value_dict[ci][estimator][ "true_policy_value" ][: topk + 1] if metric == "k-th": topk_metric[topk, 0] = topk_values[-1] elif metric == "best": topk_metric[topk, 0] = topk_values.max() elif metric == "worst": topk_metric[topk, 0] = topk_values.min() elif metric == "mean": topk_metric[topk, 0] = topk_values.mean() elif metric == "std": topk_metric[topk, 0] = topk_values.std(ddof=1) else: topk_metric[topk, 0] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[ci][estimator][metric] = topk_metric sharpe_ratio = ( np.clip( metric_dict[ci][estimator]["best"] - baseline, 0, None ) / metric_dict[ci][estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num( sharpe_ratio[1:], posinf=1e2 ) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio else: behavior_policy = input_dict[list(input_dict.keys())[0]]["behavior_policy"] baseline = behavior_policy_value[behavior_policy] for ci in cis: for i, estimator in enumerate(compared_estimators): for j, metric in enumerate(metrics): topk_metric = np.zeros((max_topk, 1)) for topk in range(max_topk): topk_values = policy_value_dict[ci][estimator][ "true_policy_value" ][: topk + 1] if metric == "k-th": topk_metric[topk, 0] = topk_values[-1] elif metric == "best": topk_metric[topk, 0] = topk_values.max() elif metric == "worst": topk_metric[topk, 0] = topk_values.min() elif metric == "mean": topk_metric[topk, 0] = topk_values.mean() elif metric == "std": topk_metric[topk, 0] = topk_values.std(ddof=1) else: topk_metric[topk, 0] = ( topk_values < safety_threshold ).sum() / (topk + 1) metric_dict[ci][estimator][metric] = topk_metric sharpe_ratio = ( np.clip(metric_dict[ci][estimator]["best"] - baseline, 0, None) / metric_dict[ci][estimator]["std"] ) if clip_sharpe_ratio: sharpe_ratio[1:] = np.nan_to_num(sharpe_ratio[1:], posinf=1e2) sharpe_ratio[1:] = np.clip(sharpe_ratio[1:], 0.0, 1e2) metric_dict[ci][estimator]["sharpe_ratio"] = sharpe_ratio metric_dict = defaultdict_to_dict(metric_dict) if return_by_dataframe: metrics.extend(["sharpe_ratio"]) metric_df = [] for ci in cis: for estimator in compared_estimators: metric_df_ = pd.DataFrame() metric_df_["topk"] = np.arange(max_topk) metric_df_["estimator"] = estimator metric_df_["ci"] = ci metric_df_ = metric_df_[["ci", "estimator", "topk"]] for metric in metrics: metric_df_[metric] = metric_dict[ci][estimator][metric].mean( axis=1 ) metric_df.append(metric_df_) metric = pd.concat(metric_df, axis=0) else: metric = metric_dict return metric
[docs] def obtain_topk_conditional_value_at_risk_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (CVaR) selected by standard OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. ope_alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to CVaR here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, return_conditional_value_at_risk=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, cvar_alpha=ope_alpha, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_conditional_value_at_risk", true_dict_value_arg="conditional_value_at_risk", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, return_by_dataframe=return_by_dataframe, )
[docs] def obtain_topk_conditional_value_at_risk_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (CVaR) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. ope_alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to CVaR here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, return_conditional_value_at_risk=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, cvar_alpha=ope_alpha, ) estimation_dict = self.select_by_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=ope_alpha, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_conditional_value_at_risk", true_dict_value_arg="conditional_value_at_risk", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, return_by_dataframe=return_by_dataframe, )
[docs] def obtain_topk_lower_quartile_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (lower quartile) selected by standard OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to the lower quartile here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, return_lower_quartile=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, quartile_alpha=ope_alpha, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_lower_quartile", true_dict_value_arg="lower_quartile", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, return_by_dataframe=return_by_dataframe, )
[docs] def obtain_topk_lower_quartile_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, max_topk: Optional[int] = None, return_safety_violation_rate: bool = False, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, return_by_dataframe: bool = False, ): """Obtain the topk deployment result (lower quartile) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. return_safety_violation_rate: bool, default=False. Whether to calculate and return the safety violate. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. return_by_dataframe: bool, default=False Whether to return the result in a dataframe format. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that policy performance refers to the lower quartile here. When returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, safety_threshold=safety_threshold, gamma=gamma, ) true_dict = self.obtain_true_selection_result( input_dict, return_lower_quartile=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, quartile_alpha=ope_alpha, ) estimation_dict = self.select_by_lower_quartile( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=ope_alpha, ) return self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_lower_quartile", true_dict_value_arg="lower_quartile", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=return_safety_violation_rate, safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, return_by_dataframe=return_by_dataframe, )
def _obtain_min_max_val_for_topk_visualization( self, true_dict: Dict, input_dict: Union[OPEInputDict, MultipleInputDict], behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ): """Obtain minimum and maximum policy performance for topk visualization. Parameters ------- true_dict: dict Dictionary containing the true deployment result. input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. Return ------- topk_metric_dict/topk_metric_df: dict or dataframe Dictionary/dataframe containing the following top-k risk return tradeoff metrics. Note that when returning dataframe, the average value will be returned. .. code-block:: python key: [estimator][ k-th, best, # return worst, # risk mean, # risk std, # risk safety_violation_rate, # risk sharpe_ratio, # risk-return tradeoff ] k-th: ndarray of shape (max_topk, total_n_datasets) Policy performance of the k-th deployment policy. best: ndarray of shape (max_topk, total_n_datasets) Best policy performance among the top-k deployment policies. worst: ndarray of shape (max_topk, total_n_datasets) Wosrt policy performance among the top-k deployment policies. mean: ndarray of shape (max_topk, total_n_datasets) Mean policy performance of the top-k deployment policies. std: ndarray of shape (max_topk, total_n_datasets) Standard deviation of the policy performance among the top-k deployment policies. safety_violation_rate: ndarray of shape (max_topk, total_n_datasets) Safety violation rate regarding the policy performance of the top-k deployment policies. sharpe_ratio: ndarray of shape (max_topk, total_n_datasets) Risk-return tradeoff metrics defined as follows: :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. """ if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: n_datasets = input_dict.n_datasets total_n_datasets = np.array(list(n_datasets.values())).sum() elif behavior_policy_name is None and dataset_id is not None: total_n_datasets = len(input_dict.behavior_policy_names) elif behavior_policy_name is not None and dataset_id is None: total_n_datasets = input_dict.n_datasets[behavior_policy_name] else: total_n_datasets = 1 if isinstance(input_dict, MultipleInputDict): min_vals = np.zeros(total_n_datasets) max_vals = np.zeros(total_n_datasets) if behavior_policy_name is None and dataset_id is None: l = 0 for behavior_policy, n_datasets in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): min_vals[l] = np.array( list(true_dict[behavior_policy][dataset_id_].values()) ).min() max_vals[l] = np.array( list(true_dict[behavior_policy][dataset_id_].values()) ).max() l += 1 elif behavior_policy_name is None and dataset_id is not None: for l, behavior_policy in enumerate(input_dict.behavior_policy_names): min_vals[l] = np.array( list(true_dict[behavior_policy].values()) ).min() max_vals[l] = np.array( list(true_dict[behavior_policy].values()) ).max() elif behavior_policy_name is not None and dataset_id is None: for l in range(total_n_datasets): min_vals[l] = np.array(list(true_dict[l].values())).min() max_vals[l] = np.array(list(true_dict[l].values())).max() else: min_vals[0] = np.array(list(true_dict.values())).min() max_vals[0] = np.array(list(true_dict.values())).max() min_val = min_vals.mean() max_val = max_vals.mean() else: min_val = np.array(list(true_dict.values())).min() max_val = np.array(list(true_dict.values())).max() return min_val, max_val def _visualize_topk_policy_performance( self, metric_dict: Dict, min_val: float, max_val: float, compared_estimators: Optional[List[str]] = None, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, visualize_ci: bool = False, ci: str = "bootstrap", alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, ylabel: str = "policy performance", ymax_sharpe_ratio: Optional[float] = None, fig_dir: Optional[Path] = None, fig_name: Optional[str] = None, ): """Visualize top-k policy deployment performances. Parameters ------- metric_dict: dict Dictionary containing the top-k risk return tradeoff metrics. min_val: float Minimum value in the plot. max_val: float Maximum value in the plot. compared_estimators: list of str, default=None Name of compared estimators. If `None` is given, all the estimators are compared. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. visualize_ci: bool, default=False Whether to visualize ci. ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. ylabel: str, default="policy performance" Label of the y-axis. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_policy_value_standard_ope.png" Name of the bar figure. """ yaxis_min_val = ( min_val if safety_threshold is None else min(min_val, safety_threshold) ) yaxis_max_val = ( max_val if safety_threshold is None else max(max_val, safety_threshold) ) margin = (yaxis_max_val - yaxis_min_val) * 0.05 plt.style.use("ggplot") color = plt.rcParams["axes.prop_cycle"].by_key()["color"] n_colors = len(color) n_figs = len(metrics) fig, axes = plt.subplots( nrows=1, ncols=n_figs, figsize=(6 * n_figs, 4), ) if len(metrics) == 1: for i, estimator in enumerate(compared_estimators): axes.plot( np.arange(1, max_topk + 1), metric_dict[estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci_ = self._estimate_confidence_interval[ci]( metric_dict[estimator][metric][topk], alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci_[f"{100 * (1. - alpha)}% CI (lower)"] upper[topk] = ci_[f"{100 * (1. - alpha)}% CI (upper)"] axes.fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes.set_title(f"{metric}") axes.set_ylabel(f"{metric} {ylabel}") axes.set_ylim(yaxis_min_val - margin, yaxis_max_val + margin) elif metric == "std": axes.set_title("std") axes.set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes.plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes.set_title("sharpe ratio") axes.set_ylabel("sharpe ratio") axes.set_ylim(0.0, ymax_sharpe_ratio) else: axes.set_title("safety violation") axes.set_ylabel("safety violation rate") axes.set_ylim(-0.05, 1.05) axes.set_xlabel("# of policies deployed") if legend: axes.legend(loc="upper right") else: for j, metric in enumerate(metrics): for i, estimator in enumerate(compared_estimators): axes[j].plot( np.arange(1, max_topk + 1), metric_dict[estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci_ = self._estimate_confidence_interval[ci]( metric_dict[estimator][metric][topk], alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci_[f"{100 * (1. - alpha)}% CI (lower)"] upper[topk] = ci_[f"{100 * (1. - alpha)}% CI (upper)"] axes[j].fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes[j].set_title(f"{metric}") axes[j].set_ylabel(f"{metric} {ylabel}") axes[j].set_ylim(yaxis_min_val - margin, yaxis_max_val + margin) elif metric == "std": axes[j].set_title("std") axes[j].set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes[j].plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes[j].set_title("sharpe ratio") axes[j].set_ylabel("sharpe ratio") axes[j].set_ylim(0.0, ymax_sharpe_ratio) else: axes[j].set_title("safety violation") axes[j].set_ylabel("safety violation rate") axes[j].set_ylim(-0.05, 1.05) axes[j].set_xlabel("# of policies deployed") if legend: axes[j].legend(loc="upper right") if legend: handles, labels = axes[0].get_legend_handles_labels() # n_cols shows err # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6)) fig.subplots_adjust(hspace=0.35, wspace=0.2) plt.show() if fig_dir: fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")
[docs] def visualize_topk_policy_value_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_policy_value_standard_ope.png", ): """Visualize the topk deployment result (policy value) selected by standard OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. Only applicable when using a single behavior policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_policy_value_standard_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking", true_dict_value_arg="policy_value", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking"], true_dict["policy_value"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, legend=legend, ylabel="policy value", ymax_sharpe_ratio=ymax_sharpe_ratio, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_topk_policy_value_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_policy_value_cumulative_distribution_ope.png", ): """Visualize the topk deployment result (policy value) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_policy_value_cumulative_distribution_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) estimation_dict = self.select_by_policy_value_via_cumulative_distribution_ope( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking", true_dict_value_arg="policy_value", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking"], true_dict["policy_value"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, legend=legend, ylabel="policy value", ymax_sharpe_ratio=ymax_sharpe_ratio, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_topk_policy_value_selected_by_lower_bound( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, relative_safety_criteria: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, ope_cis: List[str] = ["bootstrap"], ope_alpha: float = 0.05, ope_n_bootstrap_samples: int = 100, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_policy_value_standard_ope_lower_bound.png", ): """Visualize the topk deployment result (policy value) selected by its estimated lower bound. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=None. A policy whose policy value is below the given threshold is to be considered unsafe. relative_safety_criteria: float, default=None The relative policy value required to be considered a safe policy. For example, when 0.9 is given, candidate policy must exceed 90\\% of the behavior policy performance. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. ope_cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"] Estimation methods for confidence intervals. ope_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. ope_n_bootstrap_samples: int, default=100 (> 0) Number of resampling performed in the bootstrap procedure. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_policy_value_standard_ope_lower_bound.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) metric_dict = self.obtain_topk_policy_value_selected_by_lower_bound( input_dict=input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, relative_safety_criteria=relative_safety_criteria, clip_sharpe_ratio=clip_sharpe_ratio, cis=ope_cis, ope_alpha=ope_alpha, n_bootstrap_samples=ope_n_bootstrap_samples, random_state=random_state, ) if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: for behavior_policy, n_datasets in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): true_dict[behavior_policy][dataset_id_] = dict( zip( true_dict[behavior_policy][dataset_id_]["policy_value"], true_dict[behavior_policy][dataset_id_]["policy_value"], ) ) elif behavior_policy_name is None and dataset_id is not None: for behavior_policy in input_dict.behavior_policy_names: true_dict[behavior_policy] = dict( zip( true_dict[behavior_policy]["policy_value"], true_dict[behavior_policy]["policy_value"], ) ) elif behavior_policy_name is not None and dataset_id is None: for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]): true_dict[dataset_id_] = dict( zip( true_dict[dataset_id_]["policy_value"], true_dict[dataset_id_]["policy_value"], ) ) else: true_dict = dict( zip( true_dict["policy_value"], true_dict["policy_value"], ) ) else: true_dict = dict( zip( true_dict["policy_value"], true_dict["policy_value"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) yaxis_min_val = ( min_val if safety_threshold is None else min(min_val, safety_threshold) ) yaxis_max_val = ( max_val if safety_threshold is None else max(max_val, safety_threshold) ) margin = (yaxis_max_val - yaxis_min_val) * 0.05 plt.style.use("ggplot") color = plt.rcParams["axes.prop_cycle"].by_key()["color"] n_colors = len(color) n_rows = len(ope_cis) n_cols = len(metrics) fig, axes = plt.subplots( nrows=n_rows, ncols=n_cols, figsize=(6 * n_cols, 4 * n_rows), ) if n_rows == 1: ope_ci = ope_cis[0] if len(metrics) == 1: for i, estimator in enumerate(compared_estimators): axes.plot( np.arange(1, max_topk + 1), metric_dict[ope_ci][estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci = self._estimate_confidence_interval[plot_ci]( metric_dict[ope_ci][estimator][metric][topk], alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci[f"{100 * (1. - plot_alpha)}% CI (lower)"] upper[topk] = ci[f"{100 * (1. - plot_alpha)}% CI (upper)"] axes.fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes.plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes.set_title(f"{metric}") axes.set_ylabel(f"{metric} policy value") axes.set_ylim(yaxis_min_val - margin, yaxis_max_val + margin) elif metric == "std": axes.set_title("std") axes.set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes.plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes.set_title("sharpe ratio") axes.set_ylabel("sharpe ratio") axes.set_ylim(0.0, ymax_sharpe_ratio) else: axes.set_title("safety violation") axes.set_ylabel("safety violation rate") axes.set_ylim(-0.05, 1.05) axes.set_xlabel("# of policies deployed") if legend: axes.legend(loc="upper right") if legend: handles, labels = axes.get_legend_handles_labels() # n_cols shows err # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6)) else: for j, metric in enumerate(metrics): for i, estimator in enumerate(compared_estimators): axes[j].plot( np.arange(1, max_topk + 1), metric_dict[ope_ci][estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci = self._estimate_confidence_interval[plot_ci]( metric_dict[ope_ci][estimator][metric][topk], alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (lower)" ] upper[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (upper)" ] axes[j].fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes[j].plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes[j].set_title(f"{metric}") axes[j].set_ylabel(f"{metric} policy value") axes[j].set_ylim(yaxis_min_val - margin, yaxis_max_val + margin) elif metric == "std": axes[j].set_title("std") axes[j].set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes.plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes[j].set_title("sharpe ratio") axes[j].set_ylabel("sharpe ratio") axes[j].set_ylim(0.0, ymax_sharpe_ratio) else: axes[j].set_title("safety violation") axes[j].set_ylabel("safety violation rate") axes[j].set_ylim(-0.05, 1.05) axes[j].set_xlabel("# of policies deployed") if legend: axes[j].legend(loc="upper right") if legend: handles, labels = axes[0].get_legend_handles_labels() # n_cols shows err # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6)) else: if len(metrics) == 1: for l, ope_ci in enumerate(ope_cis): for i, estimator in enumerate(compared_estimators): axes[l].plot( np.arange(1, max_topk + 1), metric_dict[ope_ci][estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci = self._estimate_confidence_interval[plot_ci]( metric_dict[ope_ci][estimator][metric][topk], alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (lower)" ] upper[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (upper)" ] axes[l].fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes[l].plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes[l].plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes[l].plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes[l].set_title(f"{metric}") axes[l].set_ylabel(f"{metric} policy value") axes[l].set_ylim(yaxis_min_val - margin, yaxis_max_val + margin) elif metric == "std": axes[l].set_title("std") axes[l].set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes[l].plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes[l].set_title("sharpe ratio") axes[l].set_ylabel("sharpe ratio") axes[l].set_ylim(0.0, ymax_sharpe_ratio) else: axes[l].set_title("safety violation") axes[l].set_ylabel("safety violation rate") axes[l].set_ylim(-0.05, 1.05) axes[l].set_xlabel("# of policies deployed") if legend: axes[l].legend(loc="upper right") if legend: handles, labels = axes[0].get_legend_handles_labels() # n_cols shows err # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6)) else: for l, ope_ci in enumerate(ope_cis): for j, metric in enumerate(metrics): for i, estimator in enumerate(compared_estimators): axes[l, j].plot( np.arange(1, max_topk + 1), metric_dict[ope_ci][estimator][metric].mean(axis=1), color=color[i % n_colors], marker=markers[i], label=estimator, ) if visualize_ci: lower = np.zeros(max_topk) upper = np.zeros(max_topk) for topk in range(max_topk): ci = self._estimate_confidence_interval[plot_ci]( metric_dict[ope_ci][estimator][metric][topk], alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, ) lower[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (lower)" ] upper[topk] = ci[ f"{100 * (1. - plot_alpha)}% CI (upper)" ] axes[l, j].fill_between( np.arange(1, max_topk + 1), lower, upper, color=color[i % n_colors], alpha=0.3, ) if metric in ["k-th", "best", "worst", "mean"]: if safety_threshold is not None: axes[l, j].plot( np.arange(1, max_topk + 1), np.full(max_topk, safety_threshold), color=dkred, label="safety threshold", ) axes[l, j].plot( np.arange(1, max_topk + 1), np.full(max_topk, max_val), color="black", linewidth=0.5, ) axes[l, j].plot( np.arange(1, max_topk + 1), np.full(max_topk, min_val), color="black", linewidth=0.5, ) axes[l, j].set_title(f"{metric}") axes[l, j].set_ylabel(f"{metric} policy value") axes[l, j].set_ylim( yaxis_min_val - margin, yaxis_max_val + margin ) elif metric == "std": axes[l, j].set_title("std") axes[l, j].set_ylabel("standard deviation") elif metric == "sharpe_ratio": axes[l, j].plot( np.arange(2, max_topk + 1), np.zeros(max_topk - 1), color="black", linewidth=0.5, ) axes[l, j].set_title("sharpe ratio") axes[l, j].set_ylabel("sharpe ratio") axes[l, j].set_ylim(0.0, ymax_sharpe_ratio) else: axes[l, j].set_title("safety violation") axes[l, j].set_ylabel("safety violation rate") axes[l, j].set_ylim(-0.05, 1.05) axes[l, j].set_xlabel("# of policies deployed") if legend: axes[l, j].legend(loc="upper right") if legend: handles, labels = axes[0, 0].get_legend_handles_labels() # n_cols shows err # fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), n_cols=min(len(labels), 6)) fig.subplots_adjust(hspace=0.35, wspace=0.2) plt.show() if fig_dir: fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")
[docs] def visualize_topk_conditional_value_at_risk_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_cvar_standard_ope.png", ): """Visualize the topk deployment result (CVaR) selected by standard OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. ope_alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_cvar_standard_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, return_conditional_value_at_risk=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, cvar_alpha=ope_alpha, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_conditional_value_at_risk", true_dict_value_arg="conditional_value_at_risk", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking_by_conditional_value_at_risk"], true_dict["conditional_value_at_risk"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, ymax_sharpe_ratio=ymax_sharpe_ratio, legend=legend, ylabel=f"CVaR ({ope_alpha})", fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_topk_conditional_value_at_risk_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_cvar_cumulative_distribution_ope.png", ): """Visualize the topk deployment result (CVaR) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. ope_alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_cvar_cumulative_distribution_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, return_conditional_value_at_risk=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, cvar_alpha=ope_alpha, ) estimation_dict = self.select_by_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=ope_alpha, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_conditional_value_at_risk", true_dict_value_arg="conditional_value_at_risk", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking_by_conditional_value_at_risk"], true_dict["conditional_value_at_risk"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, legend=legend, ylabel=f"CVaR ({ope_alpha})", ymax_sharpe_ratio=ymax_sharpe_ratio, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_topk_lower_quartile_selected_by_standard_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_lower_quartile_standard_ope.png", ): """Visualize the topk deployment result (lower quartile) selected by standard OPE. Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_lower_quartile_standard_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, return_lower_quartile=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, quartile_alpha=ope_alpha, ) estimation_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_lower_quartile", true_dict_value_arg="lower_quartile", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking_by_lower_quartile"], true_dict["lower_quartile"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, legend=legend, ylabel=f"lower quartile ({ope_alpha})", ymax_sharpe_ratio=ymax_sharpe_ratio, fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_topk_lower_quartile_selected_by_cumulative_distribution_ope( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, ope_alpha: float = 0.05, metrics: List[str] = [ "k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio", ], max_topk: Optional[int] = None, safety_threshold: Optional[float] = None, clip_sharpe_ratio: bool = False, ymax_sharpe_ratio: Optional[float] = None, visualize_ci: bool = False, plot_ci: str = "bootstrap", plot_alpha: float = 0.05, plot_n_bootstrap_samples: int = 100, random_state: Optional[int] = None, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "topk_lower_quartile_cumulative_distribution_ope.png", ): """Visualize the topk deployment result (lower quartile) selected by cumulative distribution OPE. Parameters ------- input_dict: OPEInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. If `None`, the average of the result will be shown. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. metrics: list of {"k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"}, default=["k-th", "best", "worst", "mean", "std", "safety_violation_rate", "sharpe_ratio"] Indicate which of the policy performance among {"best", "worst", "mean", "std"}, sharpe ratio, and safety violation rate to report. For "k-th", it means that the policy performance of the (estimated) k-th policy will be visualized. We define the sharpe ratio for OPE as :math:`S(\\hat{V}) := (\\mathrm{Best@k} - V(\\pi_0)) / \\mathrm{Std@k}`. max_topk: int, default=None Maximum number of policies to be deployed. If `None` is given, all the policies will be deployed. safety_threshold: float, default=0.0 (>= 0) The conditional value at risk required to be considered a safe policy. clip_sharpe_ratio: bool, default=False Whether to clip a large value of SharpeRatio with 1e2. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. visualize_ci: bool, default=False Whether to visualize ci. (Only applicable when :class:`MultipleInputDict` is given.) plot_ci: {"bootstrap", "hoeffding", "bernstein", "ttest"}, default="bootstrap" Method to estimate the confidence interval. plot_alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. plot_n_bootstrap_samples: int, default=10000 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. ymax_sharp_ratio: float, default=None Maximum value in y-axis of the plot of SharpeRatio. legend: bool, default=True Whether to include a legend in the figure. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="topk_lower_quartile_cumulative_distribution_ope.png" Name of the bar figure. """ if isinstance(input_dict, MultipleInputDict): input_dict_ = input_dict.get( behavior_policy_name=input_dict.behavior_policy_names[0], dataset_id=0 ) gamma = list(input_dict_.values())[0]["gamma"] else: gamma = list(input_dict.values())[0]["gamma"] compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) max_topk, safety_threshold = self._check_topk_inputs( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, metrics=metrics, safety_threshold=safety_threshold, gamma=gamma, ) self._check_basic_visualization_inputs(fig_dir=fig_dir, fig_name=fig_name) true_dict = self.obtain_true_selection_result( input_dict, return_lower_quartile=True, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, quartile_alpha=ope_alpha, ) estimation_dict = self.select_by_lower_quartile( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=ope_alpha, ) # note: true_dict is transformed in this function, as it is passed by reference metric_dict = self._obtain_topk_policy_performance( true_dict=true_dict, estimation_dict=estimation_dict, input_dict=input_dict, true_dict_ranking_arg="ranking_by_lower_quartile", true_dict_value_arg="lower_quartile", estimation_dict_ranking_arg="estimated_ranking", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, max_topk=max_topk, ope_alpha=ope_alpha, return_safety_violation_rate=("safety_violation_rate" in metrics), safety_threshold=safety_threshold, clip_sharpe_ratio=clip_sharpe_ratio, ) # in the case with single input_dict, true_dict has not been transformed if not isinstance(input_dict, MultipleInputDict) or ( behavior_policy_name is not None and dataset_id is not None ): true_dict = dict( zip( true_dict["ranking_by_lower_quartile"], true_dict["lower_quartile"], ) ) min_val, max_val = self._obtain_min_max_val_for_topk_visualization( true_dict=true_dict, input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) self._visualize_topk_policy_performance( metric_dict=metric_dict, min_val=min_val, max_val=max_val, compared_estimators=compared_estimators, metrics=metrics, max_topk=max_topk, safety_threshold=safety_threshold, visualize_ci=visualize_ci, ci=plot_ci, alpha=plot_alpha, n_bootstrap_samples=plot_n_bootstrap_samples, random_state=random_state, legend=legend, ylabel=f"lower quartile ({ope_alpha})", ymax_sharpe_ratio=ymax_sharpe_ratio, fig_dir=fig_dir, fig_name=fig_name, )
def _visualize_policy_performance_for_validation( self, estimation_dict: Dict, input_dict: Union[OPEInputDict, MultipleInputDict], true_value_arg: str, estimated_value_arg: str, compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, ylabel: str = "policy performance", fig_dir: Optional[Path] = None, fig_name: Optional[str] = None, ): """Visualize the correlation between the true and estimated policy performance. Parameters ------- estimation_dict: dict Dictionary containing the estimated policy performance. input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. true_value_arg: str Name of the key indicating the true policy performance in estimation_dict. estimated_value_arg: str Name of the key indicating the estimated policy performance in estimation_dict. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. ylabel: str, default="policy performance" Label of the y-axis. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default=None Name of the bar figure. """ plt.style.use("ggplot") color = plt.rcParams["axes.prop_cycle"].by_key()["color"] n_colors = len(color) n_figs = len(compared_estimators) n_cols = min(5, n_figs) if n_cols is None else n_cols n_rows = (n_figs - 1) // n_cols + 1 fig, axes = plt.subplots( nrows=n_rows, ncols=n_cols, figsize=(4 * n_cols, 3 * n_rows), sharex=share_axes, sharey=share_axes, ) guide_min, guide_max = 1e5, -1e5 if n_rows == 1: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = estimation_dict[behavior_policy][ dataset_id_ ][estimator][true_value_arg] estimated_policy_value = estimation_dict[ behavior_policy ][dataset_id_][estimator][estimated_value_arg] if dataset_id_ == 0: axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to avoid duplicated labels axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif behavior_policy_name is None and dataset_id is not None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = estimation_dict[behavior_policy][ estimator ][true_value_arg] estimated_policy_value = estimation_dict[behavior_policy][ estimator ][estimated_value_arg] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif behavior_policy_name is not None and dataset_id is None: n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = estimation_dict[dataset_id_][estimator][ true_value_arg ] estimated_policy_value = estimation_dict[dataset_id_][ estimator ][estimated_value_arg] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = estimation_dict[estimator][true_value_arg] estimated_policy_value = estimation_dict[estimator][ estimated_value_arg ] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = estimation_dict[estimator][true_value_arg] estimated_policy_value = estimation_dict[estimator][ estimated_value_arg ] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[i].set_title(estimator) axes[i].set_xlabel(f"true {ylabel}") axes[i].set_ylabel(f"estimated {ylabel}") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i].legend(title="behavior_policy", loc="lower right") if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i].plot( guide, guide, color="black", linewidth=1.0, ) else: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = estimation_dict[behavior_policy][ dataset_id_ ][estimator][true_value_arg] estimated_policy_value = estimation_dict[ behavior_policy ][dataset_id_][estimator][estimated_value_arg] if dataset_id_ == 0: axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to avoid duplicated labels axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif behavior_policy_name is None and dataset_id is not None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = estimation_dict[behavior_policy][ estimator ][true_value_arg] estimated_policy_value = estimation_dict[behavior_policy][ estimator ][estimated_value_arg] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif behavior_policy_name is not None and dataset_id is None: n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = estimation_dict[dataset_id_][estimator][ true_value_arg ] estimated_policy_value = estimation_dict[dataset_id_][ estimator ][estimated_value_arg] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = estimation_dict[estimator][true_value_arg] estimated_policy_value = estimation_dict[estimator][ estimated_value_arg ] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = estimation_dict[estimator][true_value_arg] estimated_policy_value = estimation_dict[estimator][ estimated_value_arg ] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[i // n_cols, i % n_cols].set_title(estimator) axes[i // n_cols, i % n_cols].set_xlabel(f"true {ylabel}") axes[i // n_cols, i % n_cols].set_ylabel(f"estimated {ylabel}") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i // n_cols, i % n_cols].legend( title="behavior_policy", loc="lower right" ) if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) fig.tight_layout() plt.show() if fig_dir: fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")
[docs] def visualize_policy_value_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_policy_value_standard_ope.png", ): """Visualize the true policy value and its estimate (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_policy_value_standard_ope.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) policy_value_dict = self.select_by_policy_value( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, return_true_values=True, ) self._visualize_policy_performance_for_validation( estimation_dict=policy_value_dict, input_dict=input_dict, true_value_arg="true_policy_value", estimated_value_arg="estimated_policy_value", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, n_cols=n_cols, share_axes=share_axes, legend=legend, ylabel="policy value", fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_policy_value_of_cumulative_distribution_ope_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_policy_value_cumulative_distribution_ope.png", ): """Visualize the true policy value and its estimate obtained by cumulative distribution OPE (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_policy_value_cumulative_distribution_ope.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) policy_value_dict = self.select_by_policy_value_via_cumulative_distribution_ope( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, return_true_values=True, ) self._visualize_policy_performance_for_validation( estimation_dict=policy_value_dict, input_dict=input_dict, true_value_arg="true_policy_value", estimated_value_arg="estimated_policy_value", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, n_cols=n_cols, share_axes=share_axes, legend=legend, ylabel="policy value", fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_policy_value_lower_bound_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, cis: List[str] = ["bootstrap"], alpha: float = 0.05, n_bootstrap_samples: int = 100, random_state: Optional[int] = None, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_policy_value_lower_bound.png", ): """Visualize the true policy value and its estimate lower bound (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. cis: list of {"bootstrap", "hoeffding", "bernstein", "ttest"}, default=["bootstrap"] Estimation methods for confidence intervals. alpha: float, default=0.05 Significance level. The value should be within `[0, 1)`. n_bootstrap_samples: int, default=100 (> 0) Number of resampling performed in the bootstrap procedure. random_state: int, default=None (>= 0) Random state. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_policy_value_lower_bound.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="standard_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) policy_value_dict = self.select_by_policy_value_lower_bound( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, cis=cis, alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state, return_true_values=True, ) plt.style.use("ggplot") color = plt.rcParams["axes.prop_cycle"].by_key()["color"] n_colors = len(color) n_figs = len(compared_estimators) * len(cis) if len(cis) == 1: n_cols = min(5, n_figs) if n_cols is None else n_cols else: n_cols = len(cis) n_rows = (n_figs - 1) // n_cols + 1 fig, axes = plt.subplots( nrows=n_rows, ncols=n_cols, figsize=(4 * n_cols, 3 * n_rows), sharex=share_axes, sharey=share_axes, ) guide_min, guide_max = 1e5, -1e5 if len(cis) == 1: if n_rows == 1: for ci in cis: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "estimated_policy_value_lower_bound" ] if dataset_id_ == 0: axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to remove duplicated labels axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif ( behavior_policy_name is None and dataset_id is not None ): min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = policy_value_dict[ behavior_policy ][ci][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ behavior_policy ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif ( behavior_policy_name is not None and dataset_id is None ): n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[dataset_id_][ ci ][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ dataset_id_ ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][ estimator ]["estimated_policy_value_lower_bound"] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[i].set_title(f"{ci}, {estimator}") axes[i].set_xlabel("true policy value") axes[i].set_ylabel("estimated policy value lower bound") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i].legend(title="behavior_policy", loc="lower right") if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i].plot( guide, guide, color="black", linewidth=1.0, ) else: for ci in cis: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "estimated_policy_value_lower_bound" ] if dataset_id_ == 0: axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to remove duplicated labels axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif ( behavior_policy_name is None and dataset_id is not None ): min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = policy_value_dict[ behavior_policy ][ci][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ behavior_policy ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif ( behavior_policy_name is not None and dataset_id is None ): n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[dataset_id_][ ci ][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ dataset_id_ ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][ estimator ]["estimated_policy_value_lower_bound"] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i // n_cols, i % n_cols].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[i // n_cols, i % n_cols].set_title(f"{ci}, {estimator}") axes[i // n_cols, i % n_cols].set_xlabel("true policy value") axes[i // n_cols, i % n_cols].set_ylabel( "estimated policy value lower bound" ) if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i // n_cols, i % n_cols].legend( title="behavior_policy", loc="lower right", ) if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) else: if n_cols == 1: for j, ci in enumerate(cis): for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "estimated_policy_value_lower_bound" ] if dataset_id_ == 0: axes[j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to remove duplicated labels axes[j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif ( behavior_policy_name is None and dataset_id is not None ): min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = policy_value_dict[ci][ behavior_policy ][ci][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ behavior_policy ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif ( behavior_policy_name is not None and dataset_id is None ): n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[dataset_id_][ ci ][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ dataset_id_ ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][ estimator ]["estimated_policy_value_lower_bound"] axes[j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][estimator][ "estimated_policy_value_lower_bound" ] axes[j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[j].set_title(f"{ci}, {estimator}") axes[j].set_xlabel("true policy value") axes[j].set_ylabel("estimated policy value lower bound") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[j].legend(title="behavior_policy", loc="lower right") if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[j].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for j, ci in enumerate(cis): axes[j].plot( guide, guide, color="black", linewidth=1.0, ) else: for j, ci in enumerate(cis): for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ behavior_policy ][dataset_id_][ci][estimator][ "estimated_policy_value_lower_bound" ] if dataset_id_ == 0: axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) else: # to remove duplicated labels axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif ( behavior_policy_name is None and dataset_id is not None ): min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): true_policy_value = policy_value_dict[ behavior_policy ][ci][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ behavior_policy ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val_ = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif ( behavior_policy_name is not None and dataset_id is None ): n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): true_policy_value = policy_value_dict[dataset_id_][ ci ][estimator]["true_policy_value"] estimated_policy_value = policy_value_dict[ dataset_id_ ][ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) min_val = min_vals.min() max_val = max_vals.max() else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][ estimator ]["estimated_policy_value_lower_bound"] axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) else: true_policy_value = policy_value_dict[ci][estimator][ "true_policy_value" ] estimated_policy_value = policy_value_dict[ci][estimator][ "estimated_policy_value_lower_bound" ] axes[i, j].scatter( true_policy_value, estimated_policy_value, color=color[0], ) min_val = np.minimum( np.nanmin(true_policy_value), np.nanmin(estimated_policy_value), ) max_val = np.maximum( np.nanmax(true_policy_value), np.nanmax(estimated_policy_value), ) axes[i, j].set_title(f"{ci}, {estimator}") axes[i, j].set_xlabel("true policy value") axes[i, j].set_ylabel("estimated policy value lower bound") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i, j].legend( title="behavior_policy", loc="lower right" ) if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i, j].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for j, ci in enumerate(cis): for i, estimator in enumerate(compared_estimators): axes[i, j].plot( guide, guide, color="black", linewidth=1.0, ) fig.tight_layout() plt.show() if fig_dir: fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")
[docs] def visualize_variance_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_variance.png", ): """Visualize the true variance and its estimate (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_variance.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) ground_truth_policy_value_dict = self.obtain_true_selection_result( input_dict=input_dict, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, return_variance=True, ) estimated_variance_dict = self.cumulative_distribution_ope.estimate_variance( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, ) if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: candidate_policy_names = defaultdict(list) true_variance = defaultdict(list) for ( behavior_policy, n_datasets, ) in input_dict.n_datasets.items(): for dataset_id_ in range(n_datasets): candidate_policy_names[behavior_policy].append( ground_truth_policy_value_dict[behavior_policy][ dataset_id_ ]["ranking"] ) true_variance[behavior_policy].append( ground_truth_policy_value_dict[behavior_policy][ dataset_id_ ]["variance"] ) candidate_policy_names = defaultdict_to_dict(candidate_policy_names) true_variance = defaultdict_to_dict(true_variance) elif behavior_policy_name is None and dataset_id is not None: candidate_policy_names = {} true_variance = {} for behavior_policy in input_dict.behavior_policy_names: candidate_policy_names[ behavior_policy ] = ground_truth_policy_value_dict[behavior_policy]["ranking"] true_variance[behavior_policy] = ground_truth_policy_value_dict[ behavior_policy ]["variance"] elif behavior_policy_name is not None and dataset_id is None: candidate_policy_names = [] true_variance = [] for dataset_id_ in range(input_dict.n_datasets[behavior_policy_name]): candidate_policy_names.append( ground_truth_policy_value_dict[dataset_id_]["ranking"] ) true_variance.append( ground_truth_policy_value_dict[dataset_id_]["variance"] ) else: candidate_policy_names = ground_truth_policy_value_dict["ranking"] true_variance = ground_truth_policy_value_dict["variance"] else: candidate_policy_names = ground_truth_policy_value_dict["ranking"] true_variance = ground_truth_policy_value_dict["variance"] plt.style.use("ggplot") color = plt.rcParams["axes.prop_cycle"].by_key()["color"] n_colors = len(color) n_figs = len(compared_estimators) n_cols = min(5, n_figs) if n_cols is None else n_cols n_rows = (n_figs - 1) // n_cols + 1 fig, axes = plt.subplots( nrows=n_rows, ncols=n_cols, figsize=(4 * n_cols, 3 * n_rows), sharex=share_axes, sharey=share_axes, ) guide_min, guide_max = 1e5, -1e5 if n_rows == 1: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): estimated_variance = np.zeros( len( candidate_policy_names[behavior_policy][ dataset_id_ ] ) ) for j, eval_policy in enumerate( candidate_policy_names[behavior_policy][dataset_id_] ): estimated_variance[j] = estimated_variance_dict[ behavior_policy ][dataset_id_][eval_policy][estimator] if dataset_id_ == 0: axes[i].scatter( true_variance[behavior_policy][dataset_id_], estimated_variance, color=color[l % n_colors], label=behavior_policy, ) else: axes[i].scatter( true_variance[behavior_policy][dataset_id_], estimated_variance, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin( true_variance[behavior_policy][dataset_id_] ), np.nanmin(estimated_variance), ) max_vals[dataset_id_] = np.maximum( np.nanmax( true_variance[behavior_policy][dataset_id_] ), np.nanmax(estimated_variance), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif behavior_policy_name is None and dataset_id is not None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): estimated_variance = np.zeros( len(candidate_policy_names[behavior_policy]) ) for j, eval_policy in enumerate( candidate_policy_names[behavior_policy] ): estimated_variance[j] = estimated_variance_dict[ behavior_policy ][eval_policy][estimator] axes[i].scatter( true_variance[behavior_policy], estimated_variance, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_variance[behavior_policy]), np.nanmin(estimated_variance), ) max_val_ = np.maximum( np.nanmax(true_variance[behavior_policy]), np.nanmax(estimated_variance), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif behavior_policy_name is not None and dataset_id is None: n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): estimated_variance = np.zeros( len(candidate_policy_names[dataset_id_]) ) for j, eval_policy in enumerate( candidate_policy_names[dataset_id_] ): estimated_variance[j] = estimated_variance_dict[ dataset_id_ ][eval_policy][estimator] axes[i].scatter( true_variance[dataset_id_], estimated_variance, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_variance[dataset_id_]), np.nanmin(estimated_variance[dataset_id_]), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_variance[dataset_id_]), np.nanmax(estimated_variance[dataset_id_]), ) min_val = min_vals.min() max_val = max_vals.max() else: estimated_variance = np.zeros(len(candidate_policy_names)) for j, eval_policy in enumerate(candidate_policy_names): estimated_variance[j] = estimated_variance_dict[ eval_policy ][estimator] axes[i].scatter( true_variance, estimated_variance, color=color[0], ) min_val = np.minimum( np.nanmin(true_variance), np.nanmin(estimated_variance), ) max_val = np.maximum( np.nanmax(true_variance), np.nanmax(estimated_variance), ) else: estimated_variance = np.zeros(len(candidate_policy_names)) for j, eval_policy in enumerate(candidate_policy_names): estimated_variance[j] = estimated_variance_dict[eval_policy][ estimator ] axes[i].scatter( true_variance, estimated_variance, color=color[0], ) min_val = np.minimum( np.nanmin(true_variance), np.nanmin(estimated_variance), ) max_val = np.maximum( np.nanmax(true_variance), np.nanmax(estimated_variance), ) axes[i].set_title(estimator) axes[i].set_xlabel("true variance") axes[i].set_ylabel("estimated variance") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i].legend(title="behavior_policy", loc="lower right") if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i].plot( guide, guide, color="black", linewidth=1.0, ) else: for i, estimator in enumerate(compared_estimators): if isinstance(input_dict, MultipleInputDict): if behavior_policy_name is None and dataset_id is None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): n_datasets = input_dict.n_datasets[behavior_policy] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): estimated_variance = np.zeros( len( candidate_policy_names[behavior_policy][ dataset_id_ ] ) ) for j, eval_policy in enumerate( candidate_policy_names[behavior_policy][dataset_id_] ): estimated_variance[j] = estimated_variance_dict[ behavior_policy ][dataset_id_][eval_policy][estimator] if dataset_id_ == 0: axes[i // n_cols, i % n_cols].scatter( true_variance[behavior_policy][dataset_id_], estimated_variance, color=color[l % n_colors], label=behavior_policy, ) else: axes[i // n_cols, i % n_cols].scatter( true_variance[behavior_policy][dataset_id_], estimated_variance, color=color[l % n_colors], ) min_vals[dataset_id_] = np.minimum( np.nanmin( true_variance[behavior_policy][dataset_id_] ), np.nanmin(estimated_variance), ) max_vals[dataset_id_] = np.maximum( np.nanmax( true_variance[behavior_policy][dataset_id_] ), np.nanmax(estimated_variance), ) min_val = min(min_val, min_vals.min()) max_val = max(max_val, max_vals.max()) elif behavior_policy_name is None and dataset_id is not None: min_val, max_val = np.infty, -np.infty for l, behavior_policy in enumerate( input_dict.behavior_policy_names ): estimated_variance = np.zeros( len(candidate_policy_names[behavior_policy]) ) for j, eval_policy in enumerate( candidate_policy_names[behavior_policy] ): estimated_variance[j] = estimated_variance_dict[ behavior_policy ][eval_policy][estimator] axes[i // n_cols, i % n_cols].scatter( true_variance[behavior_policy], estimated_variance, color=color[l % n_colors], label=behavior_policy, ) min_val_ = np.minimum( np.nanmin(true_variance[behavior_policy]), np.nanmin(estimated_variance), ) max_val_ = np.maximum( np.nanmax(true_variance[behavior_policy]), np.nanmax(estimated_variance), ) min_val = min(min_val, min_val_) max_val = max(max_val, max_val_) elif behavior_policy_name is not None and dataset_id is None: n_datasets = input_dict.n_datasets[behavior_policy_name] min_vals = np.zeros(n_datasets) max_vals = np.zeros(n_datasets) for dataset_id_ in range(n_datasets): estimated_variance = np.zeros( len(candidate_policy_names[dataset_id_]) ) for j, eval_policy in enumerate( candidate_policy_names[dataset_id_] ): estimated_variance[j] = estimated_variance_dict[ dataset_id_ ][eval_policy][estimator] axes[i // n_cols, i % n_cols].scatter( true_variance[dataset_id_], estimated_variance, color=color[0], ) min_vals[dataset_id_] = np.minimum( np.nanmin(true_variance[dataset_id_]), np.nanmin(estimated_variance[dataset_id_]), ) max_vals[dataset_id_] = np.maximum( np.nanmax(true_variance[dataset_id_]), np.nanmax(estimated_variance[dataset_id_]), ) min_val = min_vals.min() max_val = max_vals.max() else: estimated_variance = np.zeros(len(candidate_policy_names)) for j, eval_policy in enumerate(candidate_policy_names): estimated_variance[j] = estimated_variance_dict[ eval_policy ][estimator] axes[i // n_cols, i % n_cols].scatter( true_variance, estimated_variance, color=color[0], ) min_val = np.minimum( np.nanmin(true_variance), np.nanmin(estimated_variance), ) max_val = np.maximum( np.nanmax(true_variance), np.nanmax(estimated_variance), ) else: estimated_variance = np.zeros(len(candidate_policy_names)) for j, eval_policy in enumerate(candidate_policy_names): estimated_variance[j] = estimated_variance_dict[eval_policy][ estimator ] axes[i // n_cols, i % n_cols].scatter( true_variance, estimated_variance, color=color[0], ) min_val = np.minimum( np.nanmin(true_variance), np.nanmin(estimated_variance), ) max_val = np.maximum( np.nanmax(true_variance), np.nanmax(estimated_variance), ) axes[i // n_cols, i % n_cols].set_title(estimator) axes[i // n_cols, i % n_cols].set_xlabel("true variance") axes[i // n_cols, i % n_cols].set_ylabel("estimated variance") if ( legend and behavior_policy_name is None and isinstance(input_dict, MultipleInputDict) ): axes[i // n_cols, i % n_cols].legend( title="behavior_policy", loc="lower right" ) if not share_axes: margin = (max_val - min_val) * 0.05 guide = np.linspace(min_val - margin, max_val + margin) axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) guide_min = min_val if guide_min > min_val else guide_min guide_max = max_val if guide_max < max_val else guide_max if share_axes: margin = (guide_max - guide_min) * 0.05 guide = np.linspace(guide_min - margin, guide_max + margin) for i, estimator in enumerate(compared_estimators): axes[i // n_cols, i % n_cols].plot( guide, guide, color="black", linewidth=1.0, ) fig.tight_layout() plt.show() if fig_dir: fig.savefig(str(fig_dir / fig_name), dpi=300, bbox_inches="tight")
[docs] def visualize_lower_quartile_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_lower_quartile.png", ): """Visualize the true lower quartile and its estimate (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 0.5]`. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_lower_quartile.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) lower_quartile_dict = self.select_by_lower_quartile( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, return_true_values=True, ) self._visualize_policy_performance_for_validation( estimation_dict=lower_quartile_dict, input_dict=input_dict, true_value_arg="true_lower_quartile", estimated_value_arg="estimated_lower_quartile", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, n_cols=n_cols, share_axes=share_axes, legend=legend, ylabel=f"lower quartile ({alpha})", fig_dir=fig_dir, fig_name=fig_name, )
[docs] def visualize_conditional_value_at_risk_for_validation( self, input_dict: Union[OPEInputDict, MultipleInputDict], compared_estimators: Optional[List[str]] = None, behavior_policy_name: Optional[str] = None, dataset_id: Optional[int] = None, alpha: float = 0.05, n_cols: Optional[int] = None, share_axes: bool = False, legend: bool = True, fig_dir: Optional[Path] = None, fig_name: str = "validation_conditional_value_at_risk.png", ): """Visualize the true conditional value at risk and its estimate (scatter plot). Parameters ------- input_dict: OPEInputDict or MultipleInputDict Dictionary of the OPE inputs for each evaluation policy. .. code-block:: python key: [evaluation_policy][ evaluation_policy_action, evaluation_policy_action_dist, state_action_value_prediction, initial_state_value_prediction, state_action_marginal_importance_weight, state_marginal_importance_weight, on_policy_policy_value, gamma, behavior_policy, evaluation_policy, dataset_id, ] .. seealso:: :class:`scope_rl.ope.input.CreateOPEInput` describes the components of :class:`input_dict`. compared_estimators: list of str, default=None Name of compared estimators. When `None` is given, all the estimators are compared. behavior_policy_name: str, default=None Name of the behavior policy. dataset_id: int, default=None Id of the logged dataset. alpha: float, default=0.05 Proportion of the shaded region. The value should be within `[0, 1]`. n_cols: int, default=None (> 0) Number of columns in the figure. share_axes: bool, default=False Whether to share x- and y-axes or not. legend: bool, default=True Whether to include a legend in the scatter plot. fig_dir: Path, default=None Path to store the bar figure. If `None` is given, the figure will not be saved. fig_name: str, default="validation_conditional_value_at_risk.png" Name of the bar figure. """ compared_estimators = self._check_compared_estimators( compared_estimators, ope_type="cumulative_distribution_ope" ) self._check_basic_visualization_inputs( n_cols=n_cols, fig_dir=fig_dir, fig_name=fig_name ) cvar_dict = self.select_by_conditional_value_at_risk( input_dict, compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, alpha=alpha, return_true_values=True, ) self._visualize_policy_performance_for_validation( estimation_dict=cvar_dict, input_dict=input_dict, true_value_arg="true_conditional_value_at_risk", estimated_value_arg="estimated_conditional_value_at_risk", compared_estimators=compared_estimators, behavior_policy_name=behavior_policy_name, dataset_id=dataset_id, n_cols=n_cols, share_axes=share_axes, legend=legend, ylabel=f"CVaR ({alpha})", fig_dir=fig_dir, fig_name=fig_name, )
@property def estimators_name(self): estimators_name = { "standard_ope": None if self.ope is None else self.ope.estimators_name, "cumulative_distribution_ope": None if self.cumulative_distribution_ope is None else self.cumulative_distribution_ope.estimators_name, } return estimators_name