Multi-armed Bandits

Introduction

import numpy as np
import matplotlib.pyplot as plt

# Set the random seed for reproducibility
np.random.seed(0)

# Set the figure size
plt.figure(figsize=(10, 6))

# Generate the data and plot the violin plot
data = np.random.randn(2000, 10) + np.random.randn(10)
plt.violinplot(dataset=data)

# Set the labels and title with increased font sizes
plt.xlabel("Bandit", fontsize=14)
plt.ylabel("Reward", fontsize=14)
plt.title("Violin plot for 10-armed bandits", fontsize=16)

plt.tick_params(axis="both", which="major", labelsize=12)
plt.xticks(np.arange(1, 11))  # the ten violins sit at positions 1 through 10

# Save the figure
plt.savefig("bandits_violin_plot.svg", format="svg")
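
Each of the ten violins shows the reward distribution of one arm: the arm means are drawn from a standard normal distribution, and every pull adds unit-variance Gaussian noise around that mean.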

Notation
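
In what follows, we use the standard k-armed bandit notation, consistent with the Q and N arrays in the code below: $A_t$ is the action selected at time step $t$, $R_t$ is the reward received, $q_*(a)$ is the true value (expected reward) of action $a$, and $Q_t(a)$ is our estimate of $q_*(a)$ at time $t$.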

Expected Reward and Estimated Reward
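
Under this notation, the true value of an action is its expected reward,

$$
q_*(a) = \mathbb{E}\left[R_t \mid A_t = a\right],
$$

and a natural way to estimate it is the sample average of the rewards received so far when that action was selected:

$$
Q_t(a) = \frac{\sum_{i=1}^{t-1} R_i \cdot \mathbf{1}[A_i = a]}{\sum_{i=1}^{t-1} \mathbf{1}[A_i = a]}.
$$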

As long as we obtain sufficient samples, the sample average will converge to the true expected reward. However, this method requires storing all past rewards and actions, which becomes increasingly expensive as the number of actions and time steps grows.

Incremental Implementation

It is easy to prove that the incremental implementation is equivalent to the sample average implementation. The proof is as follows:
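
Let $Q_n$ denote the sample-average estimate after a given action has been selected $n-1$ times, and let $R_n$ be the $n$-th reward received for that action. Then

$$
Q_{n+1} = \frac{1}{n}\sum_{i=1}^{n} R_i
= \frac{1}{n}\Big(R_n + \sum_{i=1}^{n-1} R_i\Big)
= \frac{1}{n}\big(R_n + (n-1)\,Q_n\big)
= Q_n + \frac{1}{n}\big(R_n - Q_n\big).
$$

The new estimate is the old estimate plus a step of size $1/n$ toward the latest reward, which is exactly the update Q[action] += (reward - Q[action]) / N[action] used in the code below, so only the current estimate and the selection count need to be stored.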

Now, we can use the incremental implementation to estimate the expected reward for each action. The next question is how to choose the action in each period.

Exploration and Exploitation
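
If we always select the action with the highest current estimate (exploitation), we may never collect enough samples to discover that another action is actually better; if we keep trying other actions (exploration), the estimates improve but short-term reward is sacrificed. A simple way to balance the two, and the one used in the implementations below, is the $\varepsilon$-greedy rule: with probability $1-\varepsilon$ select the greedy action $\arg\max_a Q_t(a)$ (breaking ties at random), and with probability $\varepsilon$ select an action uniformly at random.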

A Simple Bandit Algorithm

We have discussed the methods to estimate the expected reward and to select the action. Now, we can combine these methods to create a simple bandit algorithm. The pseudocode is as follows:
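
The sketch below follows the $\varepsilon$-greedy loop implemented by the bandit_algorithm class in the next section.

Initialize, for a = 1, ..., k:
    Q(a) <- 0
    N(a) <- 0
Loop for t = 1, ..., T:
    With probability epsilon:  A <- a uniformly random action
    Otherwise:                 A <- argmax_a Q(a)   (ties broken at random)
    R <- bandit(A)
    N(A) <- N(A) + 1
    Q(A) <- Q(A) + (R - Q(A)) / N(A)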

Implementation: n-armed Bandits

import numpy as np
import matplotlib.pyplot as plt

class bandit_algorithm:
    def __init__(self, bandit, epsilon, steps):
        self.bandit = bandit
        self.epsilon = epsilon
        self.steps = steps
        self.k = bandit.k
        self.Q = np.zeros(self.k)
        self.N = np.zeros(self.k)
        self.reward = np.zeros(self.steps)

    def learn(self):
        for t in range(self.steps):
            # epsilon greedy
            if np.random.rand() < self.epsilon:
                action = np.random.randint(self.k)
            else:
                # choose action with maximum Q, if multiple, choose randomly
                action = np.random.choice(np.where(self.Q == np.max(self.Q))[0])
            # get reward
            reward = self.bandit.bandit(action)
            # update Q
            self.N[action] += 1
            self.Q[action] += (reward - self.Q[action]) / self.N[action]
            # update reward
            self.reward[t] = reward

class bandit:
    def __init__(self, k):
        self.k = k
        self.mu = np.random.randn(k)
        self.sigma = np.ones(k)

    def bandit(self, action):
        return np.random.normal(self.mu[action], self.sigma[action])
    
if __name__ == '__main__':
    
    # set random seed for reproducibility
    np.random.seed(0)

    k = 10
    steps = 1000
    epsilon_list = [0, 0.1, 0.01]

    # mean reward
    number_of_runs = 2000
    rewards = np.zeros((len(epsilon_list), number_of_runs, steps))

    for i, epsilon in enumerate(epsilon_list):
        for j in range(number_of_runs):
            bandit_instance = bandit(k)
            simple_bandit = bandit_algorithm(bandit_instance, epsilon, steps)
            simple_bandit.learn()
            rewards[i, j, :] = simple_bandit.reward

    # plot
    plt.figure(figsize=(10, 6))
    for i in range(len(epsilon_list)):
        plt.plot(
            np.mean(rewards[i, :, :], axis=0),
            label="epsilon = {}".format(epsilon_list[i]),
        )
    plt.xlabel("Steps", fontsize=14)
    plt.ylabel("Average Reward", fontsize=14)
    plt.title("Average Reward vs Steps", fontsize=16)
    plt.tick_params(axis="both", which="major", labelsize=12)
    plt.legend(fontsize=12)
    plt.savefig("simple_bandit.svg")

Application: The Newsvendor Problem

The newsvendor problem is a classic inventory-management problem in which a decision-maker must decide how many copies of a newspaper to order each morning. Demand for the newspaper follows a probability distribution. An overage cost is incurred when demand falls short of the order quantity (unsold copies), and an underage cost is incurred when demand exceeds the order quantity (lost sales).

Note that the classic newsvendor problem assumes that the demand distribution is known, so the optimal order quantity can be found by minimizing the expected cost.
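
Specifically, with overage cost $h$ per unsold unit and underage cost $p$ per unit of unmet demand, the expected cost is minimized at the critical fractile

$$
q^* = F^{-1}\!\left(\frac{p}{p + h}\right),
$$

where $F$ is the cumulative distribution function of demand. For the parameters used in the script below ($h = 0.18$, $p = 0.7$, demand drawn from $\mathcal{N}(5, 1)$ and rounded to an integer), the critical ratio is $0.7 / 0.88 \approx 0.795$, and the smallest integer order quantity reaching that fractile is $6$; this is the value stored as optimal = 6 in the code.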

Here, we assume instead that the demand distribution is unknown and that demand is only observed after the order has been placed. The objective is to minimize the total cost over the planning horizon. By treating the discrete order quantities as actions and the negative cost as the reward, we can model the newsvendor problem as a multi-armed bandit problem.

import numpy as np
import matplotlib.pyplot as plt


class newsvendor_as_bandits:
    def __init__(self, k, h, p, mu, sigma):
        self.k = k
        self.h = h
        self.p = p
        self.mu = mu
        self.sigma = sigma

    def bandit(self, action):
        # sample demand and round it to the nearest integer
        demand = round(np.random.normal(self.mu, self.sigma))
        # calculate cost
        cost = self.h * max(action - demand, 0) + self.p * max(demand - action, 0)
        return -cost


class bandit_algorithm:
    def __init__(self, bandit, epsilon, steps):
        self.bandit = bandit
        self.epsilon = epsilon
        self.steps = steps
        self.k = bandit.k
        self.Q = np.zeros(self.k)
        self.N = np.zeros(self.k)
        self.reward = np.zeros(self.steps)

    def learn(self):
        for t in range(self.steps):
            # epsilon greedy
            if np.random.rand() < self.epsilon:
                action = np.random.randint(self.k)
            else:
                # choose action with maximum Q, if multiple, choose randomly
                action = np.random.choice(np.where(self.Q == np.max(self.Q))[0])
            # get reward
            reward = self.bandit.bandit(action)
            # update Q
            self.N[action] += 1
            self.Q[action] += (reward - self.Q[action]) / self.N[action]
            # update reward
            self.reward[t] = reward


if __name__ == "__main__":

    # set random seed for reproducibility
    np.random.seed(0)

    # parameters
    k = 10
    h = 0.18
    p = 0.7
    mu = 5
    sigma = 1
    optimal = 6  # cost-minimizing order quantity from the critical-fractile formula (for reference)
    epsilon_list = [0.1]
    steps = 2000

    # mean reward
    number_of_runs = 10
    rewards = np.zeros((len(epsilon_list), number_of_runs, steps))

    # newsvendor problem
    newsvendor = newsvendor_as_bandits(k, h, p, mu, sigma)

    for i in range(len(epsilon_list)):
        for j in range(number_of_runs):
            # initialize bandit algorithm
            bandit = bandit_algorithm(newsvendor, epsilon_list[i], steps)
            # learn
            bandit.learn()
            # store results
            rewards[i, j, :] = bandit.reward
            # print optimal action and Q value
            print(
                "optimal action = {}, Q = {}".format(
                    np.argmax(bandit.Q), bandit.Q[np.argmax(bandit.Q)]
                )
            )

    # plot
    plt.figure(figsize=(10, 6))
    for i in range(len(epsilon_list)):
        plt.plot(
            np.mean(rewards[i, :, :], axis=0),
            label="epsilon = {}".format(epsilon_list[i]),
        )
    plt.xlabel("Steps", fontsize=14)
    plt.ylabel("Average Reward", fontsize=14)
    plt.title("Average Reward vs Steps", fontsize=16)
    plt.tick_params(axis="both", which="major", labelsize=12)
    plt.legend(fontsize=12)
    plt.savefig("newsvendor_average_reward.svg", format="svg")

Summary

  • The incremental implementation can be used to update the estimated value without storing all the rewards and actions.
