"""
Original code taken from:
https://github.com/Farama-Foundation/Gymnasium/blob/f26cbe13e9ac20d43486032b7e9dd4b8f2c563dc/gymnasium/envs/classic_control/cartpole.py
MIT License:
https://github.com/Farama-Foundation/Gymnasium/blob/f26cbe13e9ac20d43486032b7e9dd4b8f2c563dc/LICENSE
"""
import logging
import math
from typing import Optional
import numpy as np
from gymnasium import spaces
from gymnasium.envs.classic_control import utils
from gymnasium.envs.classic_control.cartpole import CartPoleEnv
from numpy.typing import NDArray
__all__ = ["InvertedPendulumEnv"]
logger = logging.getLogger(__name__)
class InvertedPendulumEnv(CartPoleEnv):
"""
    Description
    ===========
    The inverted pendulum problem is a classic problem in control theory.
    The system consists of a pole attached at one end to a cart, with the
    other end free. The pole can rotate around its attachment point and the
    cart can move horizontally. By default the pole starts upright with a
    small random perturbation, and the goal is to move the cart so that the
    pole stays upright.

    **Note**: This environment is a modified version of the CartPole
    environment. It allows configuring the most relevant parameters of the
    system (e.g. cart mass, pole mass, pole length) and it uses a continuous
    action space instead of a discrete one.
Action Space
============
    The action is an `ndarray` with shape `(1,)` representing the force applied to the cart.

    +-----+---------------------------+-------------+-------------+
    | Num | Action                    | Control Min | Control Max |
    +=====+===========================+=============+=============+
    | 0   | Force applied on the cart | -force_max  | force_max   |
    +-----+---------------------------+-------------+-------------+

    where `force_max` defaults to 30. Actions outside this range are clipped
    before being applied.
Observation Space
=================
    The observation is an `ndarray` with shape `(4,)` where the elements correspond to the following:

    +-----+-----------------------------------------------+------+-----+
    | Num | Observation                                   | Min  | Max |
    +=====+===============================================+======+=====+
    | 0   | position of the cart along the linear surface | -3   | 3   |
    +-----+-----------------------------------------------+------+-----+
    | 1   | linear velocity of the cart                   | -Inf | Inf |
    +-----+-----------------------------------------------+------+-----+
    | 2   | vertical angle of the pole on the cart        | -24° | 24° |
    +-----+-----------------------------------------------+------+-----+
    | 3   | angular velocity of the pole on the cart      | -Inf | Inf |
    +-----+-----------------------------------------------+------+-----+

    The position and angle limits shown are the default termination
    thresholds (`x_threshold` and `theta_threshold`). In the observation
    itself the angle is expressed in radians, and the observation space
    bounds are set to twice the thresholds so that a terminating observation
    is still within bounds.
Rewards
=======
    The goal is to keep the inverted pendulum upright (within the angle
    limit) for as long as possible; a reward of +1 is therefore awarded for
    every timestep that the pole remains upright.
Starting State
==============
    All observations start from the state (0.0, 0.0, 0.0, 0.0), with uniform
    noise in the range [-0.01, 0.01] added to each element for stochasticity.
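
    The noise range can be overridden through the `options` argument of
    `reset`; a minimal sketch, assuming the standard Gymnasium
    classic-control `low`/`high` option keys read by
    `utils.maybe_parse_reset_bounds`:

    .. code-block:: python

        env = InvertedPendulumEnv()
        # Widen the initial-state noise from the default [-0.01, 0.01].
        obs, info = env.reset(seed=0, options={"low": -0.05, "high": 0.05})
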
    Episode End
    ===========
    The episode ends when either of the following happens:

    1. Termination: The absolute value of the cart position exceeds
       `x_threshold` (which defaults to 3).
    2. Termination: The absolute value of the vertical angle of the pole
       exceeds `theta_threshold` (which defaults to 24 degrees).
    :param masspole: mass of the pole.
    :param masscart: mass of the cart.
    :param length: length of the pole.
    :param x_threshold: threshold for the cart position (in either direction).
    :param theta_threshold: threshold for the pole angle, in degrees.
    :param force_max: maximum absolute value of the force applied to the cart;
        actions are clipped to [-force_max, force_max].
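
    Example
    =======
    A minimal usage sketch; the parameter values below are illustrative
    choices, not recommended settings:

    .. code-block:: python

        env = InvertedPendulumEnv(masscart=2.0, length=0.8, force_max=20.0)
        obs, info = env.reset(seed=42)
        action = env.action_space.sample()  # random force in [-20, 20]
        obs, reward, terminated, truncated, info = env.step(action)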
"""
def __init__(
self,
render_mode: Optional[str] = None,
*,
masspole: float = 0.1,
masscart: float = 1.0,
length: float = 1.0,
x_threshold: float = 3,
theta_threshold: float = 24,
force_max: float = 30.0,
) -> None:
super().__init__()
self.gravity = 9.81
self.masscart = masscart
self.masspole = masspole
self.total_mass = self.masspole + self.masscart
self.length = length
self.half_length = self.length / 2
self.polemass_length = self.masspole * self.half_length
self.dt = 0.02
self.kinematics_integrator = "euler"
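        # "euler" selects explicit Euler integration in step(); any other
        # value falls back to semi-implicit Euler.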
# Angle at which to fail the episode
self.theta_threshold_radians = math.radians(theta_threshold)
# Cart position at which to fail the episode
self.x_threshold = x_threshold
# Maximum absolute value of force applied on Cart
self.force_max = force_max
# Angle limit set to 2 * theta_threshold_radians so failing observation
# is still within bounds.
high = np.array(
[
self.x_threshold * 2,
np.finfo(np.float32).max,
self.theta_threshold_radians * 2,
np.finfo(np.float32).max,
],
dtype=np.float32,
)
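        # With scalar bounds and no explicit shape, spaces.Box infers the
        # shape (1,), so the action is a single force value.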
self.action_space = spaces.Box(
-self.force_max, self.force_max, dtype=np.float32
)
self.observation_space = spaces.Box(-high, high, dtype=np.float32)
self.render_mode = render_mode
self.screen_width = 600
self.screen_height = 400
self.screen = None
self.clock = None
self.isopen = True
self.state = None
self.steps_beyond_terminated = None
self.init_state = np.array([0.0, 0.0, 0.0, 0.0])
self.state = self.init_state.copy()
    def step(self, action: NDArray) -> tuple[NDArray, float, bool, bool, dict]:
if self.state is None:
raise RuntimeError("Call reset before using step method.")
x, x_dot, theta, theta_dot = self.state
force = np.clip(action, -self.force_max, self.force_max).item()
costheta = math.cos(theta)
sintheta = math.sin(theta)
# For the interested reader:
# https://coneural.org/florian/papers/05_cart_pole.pdf
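        # Frictionless cart-pole dynamics: `temp` is the normalized force
        # term, `thetaacc` the angular acceleration of the pole, and `xacc`
        # the linear acceleration of the cart.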
temp = (
force + self.polemass_length * theta_dot**2 * sintheta
) / self.total_mass
thetaacc = (self.gravity * sintheta - costheta * temp) / (
self.half_length
* (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
)
xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
if self.kinematics_integrator == "euler":
x = x + self.dt * x_dot
x_dot = x_dot + self.dt * xacc
theta = theta + self.dt * theta_dot
theta_dot = theta_dot + self.dt * thetaacc
        else:  # semi-implicit Euler
x_dot = x_dot + self.dt * xacc
x = x + self.dt * x_dot
theta_dot = theta_dot + self.dt * thetaacc
theta = theta + self.dt * theta_dot
self.state = (x, x_dot, theta, theta_dot)
terminated = bool(
x < -self.x_threshold
or x > self.x_threshold
or theta < -self.theta_threshold_radians
or theta > self.theta_threshold_radians
)
if not terminated:
reward = 1.0
elif self.steps_beyond_terminated is None:
# Pole just fell!
self.steps_beyond_terminated = 0
reward = 1.0
else:
if self.steps_beyond_terminated == 0:
logger.warning(
"You are calling 'step()' even though this "
"environment has already returned terminated = True. You "
"should always call 'reset()' once you receive 'terminated = "
"True' -- any further steps are undefined behavior."
)
self.steps_beyond_terminated += 1
reward = 0.0
if self.render_mode == "human":
self.render()
return np.asarray(self.state, dtype=np.float32), reward, terminated, False, {}
def reset(
self,
*,
seed: Optional[int] = None,
options: Optional[dict] = None,
) -> tuple[NDArray, dict]:
super().reset(seed=seed)
        # Note that if you use custom reset bounds, it may lead to
        # out-of-bound state/observations.
        low, high = utils.maybe_parse_reset_bounds(
            options, -0.01, 0.01  # default low and high
        )
self.state = self.init_state + self.np_random.uniform(
low=low, high=high, size=(4,)
)
self.steps_beyond_terminated = None
if self.render_mode == "human":
self.render()
return np.array(self.state, dtype=np.float32), {}
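

if __name__ == "__main__":
    # Minimal smoke-test sketch: roll out one episode under a random policy.
    # The random policy is illustrative only; it will not balance the pole.
    env = InvertedPendulumEnv()
    obs, info = env.reset(seed=0)
    total_reward = 0.0
    terminated = truncated = False
    while not (terminated or truncated):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
    print(f"Episode return under a random policy: {total_reward}")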