LightRL API Reference

Welcome to the detailed API reference for LightRL. Below you'll find documentation for the key classes and functions available in the library, complete with usage guidelines and examples.

Bandits Module

LightRL includes a variety of bandit algorithms, each tailored for specific use cases in reinforcement learning environments. The following classes are part of the lightrl.bandits module:

Base Bandit Class

Bandit: The foundational class for all bandit algorithms. Subclasses provide specialized implementations.

Bases: ABC

Source code in lightrl/bandits.py
class Bandit(ABC):
    def __init__(self, arms: List[Any]) -> None:
        """
        Initialize a Bandit with a specified number of arms.

        Args:
            arms (List[Any]): A list representing different arms or tasks
                              that the Bandit can choose from.
        """
        self.arms: List[Any] = arms
        self.q_values: List[float] = [0.0] * len(arms)  # Estimated rewards for each arm
        self.counts: List[int] = [0] * len(
            arms
        )  # Number of times each arm has been selected

    @abstractmethod
    def select_arm(self) -> int:
        """
        Abstract method to select the next arm to be used.

        Returns:
            int: The index of the selected arm.
        """
        pass

    def update(self, arm_index: int, reward: float) -> None:
        """
        Update the value estimates for a given arm based on the reward received.

        Args:
            arm_index (int): Index of the arm that was selected.
            reward (float): Reward received after selecting the arm.
        """
        self.counts[arm_index] += 1
        n = self.counts[arm_index]
        old_q = self.q_values[arm_index]
        self.q_values[arm_index] = ((n - 1) * old_q + reward) / n

    def __repr__(self) -> str:
        """
        String representation of the Bandit object.

        Returns:
            str: String representation of the Bandit, showing its arms.
        """
        return f"{self.__class__.__name__}(arms={self.arms})"

    def report(self) -> None:
        """
        Print a report of the average rewards (Q-values) and selection counts for each arm.
        """
        print("Q-values per arm:")
        for arm, q, cnt in zip(self.arms, self.q_values, self.counts):
            print(f"  num_tasks={arm}: avg_reward={q:.5f}, count={cnt}")

__init__(arms)

Initialize a Bandit with a specified number of arms.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
Source code in lightrl/bandits.py
def __init__(self, arms: List[Any]) -> None:
    """
    Initialize a Bandit with a specified number of arms.

    Args:
        arms (List[Any]): A list representing different arms or tasks
                          that the Bandit can choose from.
    """
    self.arms: List[Any] = arms
    self.q_values: List[float] = [0.0] * len(arms)  # Estimated rewards for each arm
    self.counts: List[int] = [0] * len(
        arms
    )  # Number of times each arm has been selected

__repr__()

String representation of the Bandit object.

Returns:

    str: String representation of the Bandit, showing its arms.

Source code in lightrl/bandits.py
def __repr__(self) -> str:
    """
    String representation of the Bandit object.

    Returns:
        str: String representation of the Bandit, showing its arms.
    """
    return f"{self.__class__.__name__}(arms={self.arms})"

report()

Print a report of the average rewards (Q-values) and selection counts for each arm.

Source code in lightrl/bandits.py
def report(self) -> None:
    """
    Print a report of the average rewards (Q-values) and selection counts for each arm.
    """
    print("Q-values per arm:")
    for arm, q, cnt in zip(self.arms, self.q_values, self.counts):
        print(f"  num_tasks={arm}: avg_reward={q:.5f}, count={cnt}")

select_arm() abstractmethod

Abstract method to select the next arm to be used.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
@abstractmethod
def select_arm(self) -> int:
    """
    Abstract method to select the next arm to be used.

    Returns:
        int: The index of the selected arm.
    """
    pass

update(arm_index, reward)

Update the value estimates for a given arm based on the reward received.

Parameters:

    arm_index (int, required): Index of the arm that was selected.
    reward (float, required): Reward received after selecting the arm.
Source code in lightrl/bandits.py
def update(self, arm_index: int, reward: float) -> None:
    """
    Update the value estimates for a given arm based on the reward received.

    Args:
        arm_index (int): Index of the arm that was selected.
        reward (float): Reward received after selecting the arm.
    """
    self.counts[arm_index] += 1
    n = self.counts[arm_index]
    old_q = self.q_values[arm_index]
    self.q_values[arm_index] = ((n - 1) * old_q + reward) / n
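
A concrete bandit only needs to implement select_arm; the incremental-mean update and the report helper are inherited from Bandit. Below is a minimal sketch of a custom subclass (the RandomBandit class and the arm values are illustrative, not part of the library):

import random
from lightrl.bandits import Bandit

class RandomBandit(Bandit):
    """Illustrative subclass that picks arms uniformly at random."""

    def select_arm(self) -> int:
        # The only abstract method; everything else comes from the base class.
        return random.randint(0, len(self.arms) - 1)

bandit = RandomBandit(arms=[10, 50, 100])
idx = bandit.select_arm()
bandit.update(idx, reward=0.7)  # incremental mean: q_new = ((n - 1) * q_old + reward) / n
bandit.report()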

Epsilon-Based Bandits

These bandits use epsilon strategies to balance exploration and exploitation.

EpsilonGreedyBandit: Implements an epsilon-greedy algorithm, allowing for a tunable exploration rate.

Bases: Bandit

Source code in lightrl/bandits.py
class EpsilonGreedyBandit(Bandit):
    def __init__(self, arms: List[Any], epsilon: float = 0.1) -> None:
        """
        Initialize an EpsilonGreedyBandit with a specified number of arms and an exploration probability.

        Args:
            arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
            epsilon (float, optional): The probability of choosing a random arm for exploration.
                                       Defaults to 0.1.
        """
        super().__init__(arms)
        self.epsilon: float = epsilon

    def select_arm(self) -> int:
        """
        Select an arm to use based on the epsilon-greedy strategy.

        This method uses exploration with probability 'epsilon' and exploitation otherwise,
        selecting the arm with the highest estimated value.

        Returns:
            int: The index of the selected arm.
        """
        if random.random() < self.epsilon:
            # Explore: select a random arm
            return random.randint(0, len(self.arms) - 1)

        # Exploit: select the arm with maximum estimated value
        max_q = max(self.q_values)
        candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
        return random.choice(candidates)

__init__(arms, epsilon=0.1)

Initialize an EpsilonGreedyBandit with a specified number of arms and an exploration probability.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
    epsilon (float, default 0.1): The probability of choosing a random arm for exploration.
Source code in lightrl/bandits.py
def __init__(self, arms: List[Any], epsilon: float = 0.1) -> None:
    """
    Initialize an EpsilonGreedyBandit with a specified number of arms and an exploration probability.

    Args:
        arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
        epsilon (float, optional): The probability of choosing a random arm for exploration.
                                   Defaults to 0.1.
    """
    super().__init__(arms)
    self.epsilon: float = epsilon

select_arm()

Select an arm to use based on the epsilon-greedy strategy.

This method uses exploration with probability 'epsilon' and exploitation otherwise, selecting the arm with the highest estimated value.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
def select_arm(self) -> int:
    """
    Select an arm to use based on the epsilon-greedy strategy.

    This method uses exploration with probability 'epsilon' and exploitation otherwise,
    selecting the arm with the highest estimated value.

    Returns:
        int: The index of the selected arm.
    """
    if random.random() < self.epsilon:
        # Explore: select a random arm
        return random.randint(0, len(self.arms) - 1)

    # Exploit: select the arm with maximum estimated value
    max_q = max(self.q_values)
    candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
    return random.choice(candidates)
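
A typical training loop simply alternates select_arm and update. The sketch below assumes a simulated environment; pull_arm is a hypothetical stand-in for whatever produces rewards in your application:

import random
from lightrl.bandits import EpsilonGreedyBandit

def pull_arm(arm):
    # Hypothetical environment: noisy reward that grows with the arm value.
    return min(1.0, arm / 100 + random.gauss(0, 0.05))

bandit = EpsilonGreedyBandit(arms=[10, 50, 100], epsilon=0.1)
for _ in range(1000):
    idx = bandit.select_arm()
    bandit.update(idx, pull_arm(bandit.arms[idx]))
bandit.report()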

EpsilonFirstBandit: Explores purely at random for a fixed number of initial steps, then switches to an epsilon-greedy strategy.

Bases: Bandit

Source code in lightrl/bandits.py
class EpsilonFirstBandit(Bandit):
    def __init__(
        self, arms: List[Any], exploration_steps: int = 100, epsilon: float = 0.1
    ) -> None:
        """
        Initialize an EpsilonFirstBandit with a specified number of arms, exploration steps, and exploration probability.

        Args:
            arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
            exploration_steps (int, optional): The number of initial steps to purely explore. Defaults to 100.
            epsilon (float, optional): The probability of choosing a random arm once the initial
                                       exploration phase has ended. Defaults to 0.1.
        """
        super().__init__(arms)
        self.exploration_steps: int = exploration_steps
        self.epsilon: float = epsilon
        self.step: int = 0

    def select_arm(self) -> int:
        """
        Select an arm to use based on the epsilon-first strategy.

        This method uses pure exploration for a defined number of initial steps and then follows
        an epsilon-greedy strategy thereafter.

        Returns:
            int: The index of the selected arm.
        """
        self.step += 1  # Count every selection so the exploration phase can end

        if self.step <= self.exploration_steps or random.random() < self.epsilon:
            # Explore: select a random arm during the exploration phase or with probability epsilon
            return random.randint(0, len(self.arms) - 1)

        # Exploit: select the arm with maximum estimated value
        max_q = max(self.q_values)
        candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
        return random.choice(candidates)

__init__(arms, exploration_steps=100, epsilon=0.1)

Initialize an EpsilonFirstBandit with a specified number of arms, exploration steps, and exploration probability.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
    exploration_steps (int, default 100): The number of initial steps to purely explore.
    epsilon (float, default 0.1): The probability of choosing a random arm once the exploration phase has ended.
Source code in lightrl/bandits.py
def __init__(
    self, arms: List[Any], exploration_steps: int = 100, epsilon: float = 0.1
) -> None:
    """
    Initialize an EpsilonFirstBandit with a specified number of arms, exploration steps, and exploration probability.

    Args:
        arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
        exploration_steps (int, optional): The number of initial steps to purely explore. Defaults to 100.
        epsilon (float, optional): The probability of choosing a random arm once the initial
                                   exploration phase has ended. Defaults to 0.1.
    """
    super().__init__(arms)
    self.exploration_steps: int = exploration_steps
    self.epsilon: float = epsilon
    self.step: int = 0

select_arm()

Select an arm to use based on the epsilon-first strategy.

This method uses pure exploration for a defined number of initial steps and then follows an epsilon-greedy strategy thereafter.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
def select_arm(self) -> int:
    """
    Select an arm to use based on the epsilon-first strategy.

    This method uses pure exploration for a defined number of initial steps and then follows
    an epsilon-greedy strategy thereafter.

    Returns:
        int: The index of the selected arm.
    """
    self.step += 1  # Count every selection so the exploration phase can end

    if self.step <= self.exploration_steps or random.random() < self.epsilon:
        # Explore: select a random arm during the exploration phase or with probability epsilon
        return random.randint(0, len(self.arms) - 1)

    # Exploit: select the arm with maximum estimated value
    max_q = max(self.q_values)
    candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
    return random.choice(candidates)
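
Construction mirrors EpsilonGreedyBandit; the only difference is the initial pure-exploration phase. A brief sketch (the arm values are illustrative):

from lightrl.bandits import EpsilonFirstBandit

bandit = EpsilonFirstBandit(arms=[10, 50, 100], exploration_steps=50, epsilon=0.05)
# The first 50 selections are intended to be uniformly random; after that the
# bandit behaves like an epsilon-greedy bandit with epsilon=0.05.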

EpsilonDecreasingBandit: Uses an epsilon that decays over time, reducing exploration as the reward estimates improve.

Bases: Bandit

Source code in lightrl/bandits.py
class EpsilonDecreasingBandit(Bandit):
    def __init__(
        self,
        arms: List[Any],
        initial_epsilon: float = 1.0,
        limit_epsilon: float = 0.1,
        half_decay_steps: int = 100,
    ) -> None:
        """
        Initialize an EpsilonDecreasingBandit with a specified number of arms and epsilon parameters.

        Args:
            arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
            initial_epsilon (float, optional): The initial exploration probability. Defaults to 1.0.
            limit_epsilon (float, optional): The minimum limit for the exploration probability. Defaults to 0.1.
            half_decay_steps (int, optional): The number of steps at which the exploration probability is reduced
                                              to half of the difference between `initial_epsilon` and `limit_epsilon`.
                                              Defaults to 100.
        """
        super().__init__(arms)
        self.epsilon: float = initial_epsilon
        self.initial_epsilon: float = initial_epsilon
        self.limit_epsilon: float = limit_epsilon
        self.half_decay_steps: int = half_decay_steps
        self.step: int = 0

    def select_arm(self) -> int:
        """
        Select an arm to use based on the epsilon-decreasing strategy.

        This method adjusts the exploration probability over time and selects an arm accordingly.

        Returns:
            int: The index of the selected arm.
        """
        self.step += 1
        self.update_epsilon()

        if random.random() < self.epsilon:
            # Explore: select a random arm
            return random.randint(0, len(self.arms) - 1)
        # Exploit: select the arm with maximum estimated value
        max_q = max(self.q_values)
        candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
        return random.choice(candidates)

    def update_epsilon(self) -> None:
        """
        Update the exploration probability `epsilon` based on the current step.

        The exploration probability decays towards the limit probability over time, according to a half-life decay model.
        """
        self.epsilon = self.limit_epsilon + (
            self.initial_epsilon - self.limit_epsilon
        ) * (0.5 ** (self.step / self.half_decay_steps))

__init__(arms, initial_epsilon=1.0, limit_epsilon=0.1, half_decay_steps=100)

Initialize an EpsilonDecreasingBandit with a specified number of arms and epsilon parameters.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
    initial_epsilon (float, default 1.0): The initial exploration probability.
    limit_epsilon (float, default 0.1): The minimum limit for the exploration probability.
    half_decay_steps (int, default 100): The number of steps after which the gap between initial_epsilon and limit_epsilon is halved.
Source code in lightrl/bandits.py
def __init__(
    self,
    arms: List[Any],
    initial_epsilon: float = 1.0,
    limit_epsilon: float = 0.1,
    half_decay_steps: int = 100,
) -> None:
    """
    Initialize an EpsilonDecreasingBandit with a specified number of arms and epsilon parameters.

    Args:
        arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
        initial_epsilon (float, optional): The initial exploration probability. Defaults to 1.0.
        limit_epsilon (float, optional): The minimum limit for the exploration probability. Defaults to 0.1.
        half_decay_steps (int, optional): The number of steps at which the exploration probability is reduced
                                          to half of the difference between `initial_epsilon` and `limit_epsilon`.
                                          Defaults to 100.
    """
    super().__init__(arms)
    self.epsilon: float = initial_epsilon
    self.initial_epsilon: float = initial_epsilon
    self.limit_epsilon: float = limit_epsilon
    self.half_decay_steps: int = half_decay_steps
    self.step: int = 0

select_arm()

Select an arm to use based on the epsilon-decreasing strategy.

This method adjusts the exploration probability over time and selects an arm accordingly.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
def select_arm(self) -> int:
    """
    Select an arm to use based on the epsilon-decreasing strategy.

    This method adjusts the exploration probability over time and selects an arm accordingly.

    Returns:
        int: The index of the selected arm.
    """
    self.step += 1
    self.update_epsilon()

    if random.random() < self.epsilon:
        # Explore: select a random arm
        return random.randint(0, len(self.arms) - 1)
    # Exploit: select the arm with maximum estimated value
    max_q = max(self.q_values)
    candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
    return random.choice(candidates)

update_epsilon()

Update the exploration probability epsilon based on the current step.

The exploration probability decays towards the limit probability over time, according to a half-life decay model.

Source code in lightrl/bandits.py
def update_epsilon(self) -> None:
    """
    Update the exploration probability `epsilon` based on the current step.

    The exploration probability decays towards the limit probability over time, according to a half-life decay model.
    """
    self.epsilon = self.limit_epsilon + (
        self.initial_epsilon - self.limit_epsilon
    ) * (0.5 ** (self.step / self.half_decay_steps))
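
With the defaults, epsilon follows a half-life decay towards limit_epsilon: after half_decay_steps selections the gap between initial_epsilon and limit_epsilon is halved, after twice as many it is quartered, and so on. A small check with the default parameters (the arm values are illustrative):

from lightrl.bandits import EpsilonDecreasingBandit

bandit = EpsilonDecreasingBandit(arms=[10, 50, 100])  # initial=1.0, limit=0.1, half_decay_steps=100
for _ in range(100):
    bandit.select_arm()
print(round(bandit.epsilon, 3))  # 0.55 = 0.1 + 0.9 * 0.5 after 100 steps
for _ in range(100):
    bandit.select_arm()
print(round(bandit.epsilon, 3))  # 0.325 = 0.1 + 0.9 * 0.25 after 200 steps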

Other Bandit Strategies

UCB1Bandit: Implements the UCB1 algorithm, selecting at each step the arm with the highest upper confidence bound on its estimated reward.

Bases: Bandit

Source code in lightrl/bandits.py
class UCB1Bandit(Bandit):
    def __init__(self, arms: List[Any]) -> None:
        """
        Initialize a UCB1Bandit with a specified number of arms.

        Args:
            arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
        """
        super().__init__(arms)
        self.total_count: int = 0  # Total number of times any arm has been selected

    def select_arm(self) -> int:
        """
        Select an arm to use based on the Upper Confidence Bound (UCB1) strategy.

        This method selects an arm that maximizes the UCB estimate, accounting for exploration and exploitation.

        Returns:
            int: The index of the selected arm.
        """
        for arm_index, count in enumerate(self.counts):
            if count == 0:
                # If an arm has not been selected yet, select it
                return arm_index

        # Calculate UCB values for each arm and choose the arm with the highest UCB value
        ucb_values = [
            self.q_values[i]
            + math.sqrt((2 * math.log(self.total_count)) / self.counts[i])
            for i in range(len(self.arms))
        ]
        return ucb_values.index(max(ucb_values))

    def update(self, arm_index: int, reward: float) -> None:
        """
        Update the value estimates for a given arm based on the reward received and increment the total count.

        Args:
            arm_index (int): Index of the arm that was selected.
            reward (float): Reward received after selecting the arm. Must be in the range [0, 1].

        Raises:
            ValueError: If the reward is not within the range [0, 1].
        """
        if not (0 <= reward <= 1):
            raise ValueError("Reward must be in the range [0, 1].")
        self.total_count += 1
        super().update(arm_index, reward)

__init__(arms)

Initialize a UCB1Bandit with a specified number of arms.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
Source code in lightrl/bandits.py
def __init__(self, arms: List[Any]) -> None:
    """
    Initialize a UCB1Bandit with a specified number of arms.

    Args:
        arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
    """
    super().__init__(arms)
    self.total_count: int = 0  # Total number of times any arm has been selected

select_arm()

Select an arm to use based on the Upper Confidence Bound (UCB1) strategy.

This method selects an arm that maximizes the UCB estimate, accounting for exploration and exploitation.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
def select_arm(self) -> int:
    """
    Select an arm to use based on the Upper Confidence Bound (UCB1) strategy.

    This method selects an arm that maximizes the UCB estimate, accounting for exploration and exploitation.

    Returns:
        int: The index of the selected arm.
    """
    for arm_index, count in enumerate(self.counts):
        if count == 0:
            # If an arm has not been selected yet, select it
            return arm_index

    # Calculate UCB values for each arm and choose the arm with the highest UCB value
    ucb_values = [
        self.q_values[i]
        + math.sqrt((2 * math.log(self.total_count)) / self.counts[i])
        for i in range(len(self.arms))
    ]
    return ucb_values.index(max(ucb_values))

update(arm_index, reward)

Update the value estimates for a given arm based on the reward received and increment the total count.

Parameters:

    arm_index (int, required): Index of the arm that was selected.
    reward (float, required): Reward received after selecting the arm. Must be in the range [0, 1].

Raises:

    ValueError: If the reward is not within the range [0, 1].

Source code in lightrl/bandits.py
def update(self, arm_index: int, reward: float) -> None:
    """
    Update the value estimates for a given arm based on the reward received and increment the total count.

    Args:
        arm_index (int): Index of the arm that was selected.
        reward (float): Reward received after selecting the arm. Must be in the range [0, 1].

    Raises:
        ValueError: If the reward is not within the range [0, 1].
    """
    if not (0 <= reward <= 1):
        raise ValueError("Reward must be in the range [0, 1].")
    self.total_count += 1
    super().update(arm_index, reward)
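
UCB1 scores each arm with its estimated value plus an exploration bonus, sqrt(2 * ln(total_count) / counts[i]), and requires rewards bounded in [0, 1]. A usage sketch with a hypothetical bounded reward:

import random
from lightrl.bandits import UCB1Bandit

bandit = UCB1Bandit(arms=[10, 50, 100])
for _ in range(500):
    idx = bandit.select_arm()
    reward = random.random()    # hypothetical reward in [0, 1]
    bandit.update(idx, reward)  # raises ValueError if the reward falls outside [0, 1]
bandit.report()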

GreedyBanditWithHistory: A greedy variant that estimates each arm's value from a bounded window of its most recent rewards.

Bases: Bandit

Source code in lightrl/bandits.py
class GreedyBanditWithHistory(Bandit):
    def __init__(self, arms: List[Any], history_length: int = 100) -> None:
        """
        Initialize a GreedyBanditWithHistory with a specified number of arms and history length.

        Args:
            arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
            history_length (int, optional): The maximum length of history to maintain for each arm's rewards.
                                            Defaults to 100.
        """
        super().__init__(arms)
        self.history_length: int = history_length
        self.history: List[List[float]] = [
            [] for _ in range(len(arms))
        ]  # History of rewards for each arm

    def select_arm(self) -> int:
        """
        Select an arm to use based on the greedy strategy with bounded history.

        This method ensures that each arm's history reaches the defined length before purely exploiting.

        Returns:
            int: The index of the selected arm.
        """
        if any(len(history) < self.history_length for history in self.history):
            # If any arm has not reached the history length, select one of these arms for exploration
            candidates = [
                i
                for i, history in enumerate(self.history)
                if len(history) < self.history_length
            ]
            return random.choice(candidates)

        # Once history length is reached for all arms, exploit the arm with maximum estimated value
        max_q = max(self.q_values)
        candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
        return random.choice(candidates)

    def update(self, arm_index: int, reward: float) -> None:
        """
        Update the value estimates for a given arm based on the reward received and update its history.

        Args:
            arm_index (int): Index of the arm that was selected.
            reward (float): Reward received after selecting the arm.
        """
        if len(self.history[arm_index]) >= self.history_length:
            # Maintain bounded history by removing the oldest reward if limit is exceeded
            self.history[arm_index].pop(0)
        self.history[arm_index].append(reward)

        # Update the count and Q-value for the arm based on its history
        self.counts[arm_index] = len(self.history[arm_index])
        self.q_values[arm_index] = sum(self.history[arm_index]) / self.counts[arm_index]

__init__(arms, history_length=100)

Initialize a GreedyBanditWithHistory with a specified number of arms and history length.

Parameters:

    arms (List[Any], required): A list representing different arms or tasks that the Bandit can choose from.
    history_length (int, default 100): The maximum length of history to maintain for each arm's rewards.
Source code in lightrl/bandits.py
def __init__(self, arms: List[Any], history_length: int = 100) -> None:
    """
    Initialize a GreedyBanditWithHistory with a specified number of arms and history length.

    Args:
        arms (List[Any]): A list representing different arms or tasks that the Bandit can choose from.
        history_length (int, optional): The maximum length of history to maintain for each arm's rewards.
                                        Defaults to 100.
    """
    super().__init__(arms)
    self.history_length: int = history_length
    self.history: List[List[float]] = [
        [] for _ in range(len(arms))
    ]  # History of rewards for each arm

select_arm()

Select an arm to use based on the greedy strategy with bounded history.

This method ensures that each arm's history reaches the defined length before purely exploiting.

Returns:

    int: The index of the selected arm.

Source code in lightrl/bandits.py
def select_arm(self) -> int:
    """
    Select an arm to use based on the greedy strategy with bounded history.

    This method ensures that each arm's history reaches the defined length before purely exploiting.

    Returns:
        int: The index of the selected arm.
    """
    if any(len(history) < self.history_length for history in self.history):
        # If any arm has not reached the history length, select one of these arms for exploration
        candidates = [
            i
            for i, history in enumerate(self.history)
            if len(history) < self.history_length
        ]
        return random.choice(candidates)

    # Once history length is reached for all arms, exploit the arm with maximum estimated value
    max_q = max(self.q_values)
    candidates = [i for i, q in enumerate(self.q_values) if q == max_q]
    return random.choice(candidates)

update(arm_index, reward)

Update the value estimates for a given arm based on the reward received and update its history.

Parameters:

    arm_index (int, required): Index of the arm that was selected.
    reward (float, required): Reward received after selecting the arm.
Source code in lightrl/bandits.py
def update(self, arm_index: int, reward: float) -> None:
    """
    Update the value estimates for a given arm based on the reward received and update its history.

    Args:
        arm_index (int): Index of the arm that was selected.
        reward (float): Reward received after selecting the arm.
    """
    if len(self.history[arm_index]) >= self.history_length:
        # Maintain bounded history by removing the oldest reward if limit is exceeded
        self.history[arm_index].pop(0)
    self.history[arm_index].append(reward)

    # Update the count and Q-value for the arm based on its history
    self.counts[arm_index] = len(self.history[arm_index])
    self.q_values[arm_index] = sum(self.history[arm_index]) / self.counts[arm_index]
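
Because each Q-value is the mean of a bounded window of the most recent rewards, this bandit can adapt when reward distributions drift over time. A sketch with a hypothetical non-stationary reward:

import random
from lightrl.bandits import GreedyBanditWithHistory

bandit = GreedyBanditWithHistory(arms=[10, 50, 100], history_length=20)
for step in range(2000):
    idx = bandit.select_arm()
    best = 100 if step < 1000 else 10   # the best arm changes halfway through
    reward = 1.0 if bandit.arms[idx] == best else random.random() * 0.5
    bandit.update(idx, reward)
bandit.report()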

Runners Module

two_state_time_dependent_process: Drives a bandit through a two-state loop, alternating between an ALIVE state and a WAITING state to probe rewards (successful tasks per second, multiplied by reward_factor). While in the WAITING state, a reduced workload can be processed by passing alternative arguments via waiting_args.

Execute a two-state time-dependent process with a bandit decision-maker.

This function simulates a process which alternates between an "ALIVE" state and a "WAITING" state based on the performance of a given task in relation to a failure threshold. It updates the bandit model with rewards calculated from successful tasks.

Parameters:

    bandit (required): An object with methods select_arm, update, and report, representing a multi-armed bandit.
    fun (Callable[..., Tuple[float, float]], required): A function called with the current arm's arguments. Should return a tuple containing the number of successful and failed tasks.
    failure_threshold (float, default 0.1): The fraction of failed tasks that triggers a switch to the "WAITING" state.
    default_wait_time (float, default 5): The base wait time in seconds between task executions in the "ALIVE" state.
    extra_wait_time (float, default 10): Additional wait time in seconds added in the "WAITING" state.
    waiting_args (Optional[Union[Tuple, List]], default None): Arguments used when calling fun in the "WAITING" state.
    max_steps (int, default 500): Maximum number of iterations/steps to perform.
    verbose (bool, default False): If True, prints additional detailed logs and progress via tqdm.
    reward_factor (float, default 1e-06): A scaling factor to adjust the magnitude of the computed reward.

Raises:

    ValueError: If waiting_args is not provided or is not of the expected types.

Source code in lightrl/runners.py
def two_state_time_dependent_process(
    bandit,
    fun: Callable[..., Tuple[float, float]],
    failure_threshold: float = 0.1,
    default_wait_time: float = 5,
    extra_wait_time: float = 10,
    waiting_args: Optional[Union[Tuple, List]] = None,
    max_steps: int = 500,
    verbose: bool = False,
    reward_factor: float = 1e-6,
) -> None:
    """Execute a two-state time-dependent process with a bandit decision-maker.

    This function simulates a process which alternates between an "ALIVE" state
    and a "WAITING" state based on the performance of a given task in relation
    to a failure threshold. It updates the bandit model with rewards calculated
    from successful tasks.

    Args:
        bandit: An object with methods `select_arm`, `update`, and `report`, representing
                a multi-armed bandit.
        fun: A function to be called with the current arm's arguments. Should return a tuple
             containing the number of successful and failed tasks.
        failure_threshold: A float to determine what fraction of tasks fails that triggers
                           a switch to the "WAITING" state.
        default_wait_time: The base wait time in seconds between task executions in the "ALIVE" state.
        extra_wait_time: Additional wait time in seconds to be added in the "WAITING" state.
        waiting_args: Arguments to be used when calling `fun` in the "WAITING" state.
        max_steps: Maximum number of iterations/steps to be performed.
        verbose: If True, prints additional detailed logs and progress via tqdm.
        reward_factor: A scaling factor to adjust the magnitude of the reward computed.

    Raises:
        ValueError: If `waiting_args` is not provided or if it is not of expected types.
    """

    if waiting_args is None:
        raise ValueError("waiting_args must be provided")
    else:
        if not (isinstance(waiting_args, tuple) or isinstance(waiting_args, list)):
            waiting_args = (waiting_args,)

    state = "ALIVE"
    last_alive_successes: float = 0.0
    last_arm_index: Optional[int] = None
    waiting_steps: int = 0
    waiting_time: float = 0.0

    iterator = range(max_steps)
    if verbose:
        iterator = tqdm(range(max_steps))

    for _ in iterator:
        if verbose:
            bandit.report()

        if state == "ALIVE":
            current_arm_index = bandit.select_arm()

            fun_args = bandit.arms[current_arm_index]
            if not (isinstance(fun_args, tuple) or isinstance(fun_args, list)):
                fun_args = (fun_args,)
            successful_tasks, failed_tasks = fun(*fun_args)
            fail_fraction = failed_tasks / (successful_tasks + failed_tasks)

            time.sleep(default_wait_time)
            waiting_time += default_wait_time

            if fail_fraction >= failure_threshold:
                last_alive_successes = successful_tasks
                last_arm_index = current_arm_index
                state = "WAITING"
                waiting_steps = 0
            else:
                reward = successful_tasks / waiting_time * reward_factor
                bandit.update(current_arm_index, reward)
                waiting_time = 0.0

        else:
            successful_tasks, failed_tasks = fun(*waiting_args)
            fail_fraction = failed_tasks / (successful_tasks + failed_tasks)
            waiting_steps += 1

            time.sleep(default_wait_time + extra_wait_time)
            waiting_time += default_wait_time + extra_wait_time

            if fail_fraction < failure_threshold:
                reward = last_alive_successes / waiting_time * reward_factor
                bandit.update(last_arm_index, reward)
                waiting_time = 0.0
                state = "ALIVE"

    if verbose:
        bandit.report()
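
The sketch below shows one way to drive the runner with an epsilon-greedy bandit. process_batch is a hypothetical stand-in for your workload function, and the wait times, step count, and reward_factor are shortened and rescaled purely for demonstration:

import random
from lightrl.bandits import EpsilonGreedyBandit
from lightrl.runners import two_state_time_dependent_process

def process_batch(num_tasks):
    # Hypothetical workload: returns (successful_tasks, failed_tasks).
    failed = sum(random.random() < 0.05 for _ in range(num_tasks))
    return num_tasks - failed, failed

bandit = EpsilonGreedyBandit(arms=[100, 500, 1000], epsilon=0.1)
two_state_time_dependent_process(
    bandit,
    fun=process_batch,
    failure_threshold=0.2,
    default_wait_time=1,
    extra_wait_time=2,
    waiting_args=(50,),   # smaller batch while in the WAITING state
    max_steps=20,
    verbose=True,
    reward_factor=1e-3,
)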

If you have any questions or require further assistance, feel free to open an issue.