Source code for training_classical_control.inverted_pendulum

"""
Original code taken from:
https://github.com/Farama-Foundation/Gymnasium/blob/f26cbe13e9ac20d43486032b7e9dd4b8f2c563dc/gymnasium/envs/classic_control/cartpole.py

MIT License:
https://github.com/Farama-Foundation/Gymnasium/blob/f26cbe13e9ac20d43486032b7e9dd4b8f2c563dc/LICENSE
"""
import logging
import math
from typing import Optional

import numpy as np
from gymnasium import spaces
from gymnasium.envs.classic_control import utils
from gymnasium.envs.classic_control.cartpole import CartPoleEnv
from numpy.typing import NDArray

__all__ = ["InvertedPendulumEnv"]


logger = logging.getLogger(__name__)


class InvertedPendulumEnv(CartPoleEnv):
    """
    Description
    -----------

    The inverted pendulum problem is based on the classic problem in control
    theory. The system consists of an inverted pole attached at one end to a
    cart, and the other end being free. The pole can rotate around its fixed
    point and the cart can move horizontally. The pole starts by default in a
    random upright position and the goal is to move the cart to keep it
    upright.

    **Note** This environment is a modified version of the CartPole
    environment. It allows configuring most relevant parameters of the system
    (e.g. cart mass, pole mass, pole length) and it uses a continuous action
    space instead of a discrete one.

    Action Space
    ============

    The action is a `ndarray` with shape `(1,)` representing the force applied
    to the cart. Values outside ``[-force_max, force_max]`` are clipped
    (``force_max`` defaults to 30).

    +-----+---------------------------+----------------+---------------+
    | Num | Action                    | Control Min    | Control Max   |
    +=====+===========================+================+===============+
    | 0   | Force applied on the cart | ``-force_max`` | ``force_max`` |
    +-----+---------------------------+----------------+---------------+

    Observation Space
    =================

    The observation is a `ndarray` with shape `(4,)` where the elements
    correspond to the following (bounds shown for the default parameters):

    +-----+-----------------------------------------------+-------+------+
    | Num | Observation                                   | Min   | Max  |
    +=====+===============================================+=======+======+
    | 0   | position of the cart along the linear surface | -6    | 6    |
    +-----+-----------------------------------------------+-------+------+
    | 1   | linear velocity of the cart                   | -Inf  | Inf  |
    +-----+-----------------------------------------------+-------+------+
    | 2   | vertical angle of the pole on the cart (rad)  | -0.84 | 0.84 |
    +-----+-----------------------------------------------+-------+------+
    | 3   | angular velocity of the pole on the cart      | -Inf  | Inf  |
    +-----+-----------------------------------------------+-------+------+

    **Note** The bounds of the position and the angle are twice the
    termination thresholds (``x_threshold`` and ``theta_threshold``, the
    latter converted to radians), so that the observation which ends the
    episode is still inside the observation space.

    Rewards
    =======

    The goal is to make the inverted pendulum remain upright (within a certain
    angle limit) as long as possible - as such a reward of +1 is awarded for
    each timestep that the pole is upright.

    Starting State
    ==============

    All observations start in state (0.0, 0.0, 0.0, 0.0) with a uniform noise
    in the range of [-0.01, 0.01] added to the values for stochasticity.

    Episode End
    -----------

    The episode ends when any of the following happens:

    1. Termination: Any of the state space values is no longer finite.
    2. Termination: The absolute value of the cart position is greater than
       a threshold value (which defaults to 3).
    3. Termination: The absolute value of the vertical angle between the pole
       and the cart is greater than a threshold value (which defaults to
       24 degrees).

    :param render_mode: rendering mode, stored on the environment.
    :param masspole: mass of the pole.
    :param masscart: mass of the cart.
    :param length: length of the pole.
    :param x_threshold: threshold for cart position.
    :param theta_threshold: threshold for pole angle, in degrees.
    :param force_max: maximum absolute value for force applied to Cart.
    """

    def __init__(
        self,
        render_mode: Optional[str] = None,
        *,
        masspole: float = 0.1,
        masscart: float = 1.0,
        length: float = 1.0,
        x_threshold: float = 3,
        theta_threshold: float = 24,
        force_max: float = 30.0,
    ) -> None:
        super().__init__()

        # Physical parameters of the system.
        self.gravity = 9.81
        self.masscart = masscart
        self.masspole = masspole
        self.total_mass = self.masspole + self.masscart
        self.length = length
        self.half_length = self.length / 2
        self.polemass_length = self.masspole * self.half_length
        self.dt = 0.02
        self.kinematics_integrator = "euler"

        # Angle at which to fail the episode
        self.theta_threshold_radians = math.radians(theta_threshold)
        # Cart position at which to fail the episode
        self.x_threshold = x_threshold
        # Maximum absolute value of force applied on Cart
        self.force_max = force_max

        # Position and angle limits are set to twice their thresholds so that
        # the failing observation is still within bounds.
        high = np.array(
            [
                self.x_threshold * 2,
                np.finfo(np.float32).max,
                self.theta_threshold_radians * 2,
                np.finfo(np.float32).max,
            ],
            dtype=np.float32,
        )

        self.action_space = spaces.Box(
            -self.force_max, self.force_max, dtype=np.float32
        )
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        self.render_mode = render_mode

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.steps_beyond_terminated = None

        # The state starts at the noise-free initial state; ``reset`` adds
        # uniform noise on top of ``init_state``.
        self.init_state = np.array([0.0, 0.0, 0.0, 0.0])
        self.state = self.init_state.copy()

    def step(self, action: float) -> tuple[NDArray, float, bool, bool, dict]:
        """Advance the simulation by one time step of length ``dt``.

        :param action: force to apply on the cart; either a scalar or an
            array holding a single element. It is clipped to
            ``[-force_max, force_max]``.
        :return: tuple ``(observation, reward, terminated, truncated, info)``
            where ``truncated`` is always ``False`` and ``info`` is always an
            empty dict.
        :raises RuntimeError: if the environment has no state.
        """
        if self.state is None:
            raise RuntimeError("Call reset before using step method.")

        x, x_dot, theta, theta_dot = self.state
        force = np.clip(action, -self.force_max, self.force_max).item()
        costheta = math.cos(theta)
        sintheta = math.sin(theta)

        # For the interested reader:
        # https://coneural.org/florian/papers/05_cart_pole.pdf
        temp = (
            force + self.polemass_length * theta_dot**2 * sintheta
        ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.half_length
            * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
        )
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        if self.kinematics_integrator == "euler":
            x = x + self.dt * x_dot
            x_dot = x_dot + self.dt * xacc
            theta = theta + self.dt * theta_dot
            theta_dot = theta_dot + self.dt * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.dt * xacc
            x = x + self.dt * x_dot
            theta_dot = theta_dot + self.dt * thetaacc
            theta = theta + self.dt * theta_dot

        self.state = (x, x_dot, theta, theta_dot)

        # A NaN never satisfies the threshold comparisons below, so the
        # finiteness of the state has to be checked explicitly.
        state_is_finite = all(math.isfinite(value) for value in self.state)
        terminated = bool(
            not state_is_finite
            or x < -self.x_threshold
            or x > self.x_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )

        if not terminated:
            reward = 1.0
        elif self.steps_beyond_terminated is None:
            # Pole just fell!
            self.steps_beyond_terminated = 0
            reward = 1.0
        else:
            if self.steps_beyond_terminated == 0:
                logger.warning(
                    "You are calling 'step()' even though this "
                    "environment has already returned terminated = True. You "
                    "should always call 'reset()' once you receive 'terminated = "
                    "True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_terminated += 1
            reward = 0.0

        if self.render_mode == "human":
            self.render()
        return np.asarray(self.state, dtype=np.float32), reward, terminated, False, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ) -> tuple[NDArray, dict]:
        """Reset the environment to ``init_state`` plus uniform noise.

        :param seed: seed forwarded to the parent ``reset``.
        :param options: optional dict from which custom bounds of the uniform
            noise are parsed; the defaults are -0.01 and 0.01.
        :return: tuple ``(observation, info)`` where ``info`` is always an
            empty dict.
        """
        # NOTE(review): this calls CartPoleEnv.reset, which presumably also
        # samples a state and renders on its own -- confirm whether only the
        # seeding of the base environment is wanted here.
        super().reset(seed=seed)
        # Note that if you use custom reset bounds, it may lead to out-of-bound
        # state/observations.
        low, high = utils.maybe_parse_reset_bounds(
            options,
            -0.01,  # default low
            0.01,  # default high
        )
        self.state = self.init_state + self.np_random.uniform(
            low=low, high=high, size=(4,)
        )
        self.steps_beyond_terminated = None

        if self.render_mode == "human":
            self.render()
        return np.array(self.state, dtype=np.float32), {}