for Robot Artificial Intelligence

18. Random Search with OpenAI Gym


Random Search

  • just add a little code to do random search for a linear model
    • state.dot(params) > 0 -> do action 1
    • state.dot(params) < 0 -> do action 0
For # of times I want to adjust the weights:
  new_weights = random
  For # of episodes I want to play to decide whether to update the weights:
    Play episode
  if avg episode length > best so far:
    weights = new_weights
Play a final set of episodes to see how well my best weights do again

Random search

# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import gym
import numpy as np
import matplotlib.pyplot as plt


def get_action(s, w):
  return 1 if s.dot(w) > 0 else 0
# linear policy: take action 1 when the weighted sum of the state is positive, else action 0


def play_one_episode(env, params):
  observation = env.reset()
  done = False
  t = 0

  while not done and t < 10000:
    env.render()  # a window will pop up so we can watch the episode as it is being played
    t += 1
    action = get_action(observation, params)
    observation, reward, done, info = env.step(action)
    if done:
      break

  return t


def play_multiple_episodes(env, T, params):
  episode_lengths = np.empty(T)

  for i in range(T):
    episode_lengths[i] = play_one_episode(env, params)

  avg_length = episode_lengths.mean()
  print("avg length:", avg_length)
  return avg_length


def random_search(env):
  episode_lengths = []
  best = 0
  params = None
  for t in range(100):
    new_params = np.random.random(4)*2 - 1
    avg_length = play_multiple_episodes(env, 100, new_params)
    episode_lengths.append(avg_length)

    if avg_length > best:
      params = new_params
      best = avg_length
  return episode_lengths, params


if __name__ == '__main__':
  env = gym.make('CartPole-v0')
  episode_lengths, params = random_search(env)
  plt.plot(episode_lengths)
  plt.show()

  # play a final set of episodes
  print("***Final run with final weights***")
  play_multiple_episodes(env, 100, params)

import numpy as np
np.empty(100)
# np.empty only allocates the array without initializing it, so the printed
# "values" are whatever garbage happened to be in memory, e.g.:
# array([ 6.34918891e+151,  5.40618860e+257,  1.16543025e+166, ... ])
# play_multiple_episodes overwrites every entry before reading it, so this is
# safe (and slightly faster than np.zeros).

How to save a video

  • important, because it allows us to watch the agent play and see what it has learned through human eyes
  • states, actions, and rewards are abstract -> this is not a bad thing, because it gives us a very powerful framework that makes all of this possible
  • but it leaves some questions unanswered:
  • has the agent learned to play as a human would?
  • does it play better?
  • does it use unconventional moves?
  • Main changes (choose_action() below is a placeholder for whatever policy you use):
import gym
from gym import wrappers

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, 'my_unique_folder')

observation = env.reset()
done = False
while not done:
  action = choose_action()  # placeholder policy
  observation, reward, done, info = env.step(action)
  if done:
    break

Random search and saving a video

# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt


def get_action(s, w):
  return 1 if s.dot(w) > 0 else 0


def play_one_episode(env, params):
  observation = env.reset()
  done = False
  t = 0

  while not done and t < 10000:
    t += 1
    action = get_action(observation, params)
    observation, reward, done, info = env.step(action)
    if done:
      break

  return t


def play_multiple_episodes(env, T, params):
  episode_lengths = np.empty(T)

  for i in range(T):
    episode_lengths[i] = play_one_episode(env, params)
    # episode_lengths[i] records how many steps episode i lasted before termination

  avg_length = episode_lengths.mean()
  print("avg length:", avg_length)
  return avg_length


def random_search(env):
  episode_lengths = []
  best = 0
  params = None
  for t in range(100):
    new_params = np.random.random(4)*2 - 1  # 4 random weights in [-1, 1), one per state variable
    avg_length = play_multiple_episodes(env, 100, new_params)
    episode_lengths.append(avg_length)

    if avg_length > best:
      params = new_params
      best = avg_length
  return episode_lengths, params


if __name__ == '__main__':
  env = gym.make('CartPole-v0')
  episode_lengths, params = random_search(env)
  plt.plot(episode_lengths)
  plt.show()

  # play a final set of episodes
  env = wrappers.Monitor(env, 'my_awesome_dir')
  print("***Final run with final weights***:", play_one_episode(env, params))

Reference:

Artificial Intelligence Reinforcement Learning

Advance AI : Deep-Reinforcement Learning

Cutting-Edge Deep-Reinforcement Learning


3. Forward Kinematics


Forward Kinematics

Forward Kinematics

#!/usr/bin/env python

import numpy

import geometry_msgs.msg
import rospy
from sensor_msgs.msg import JointState
import tf
import tf.msg
from urdf_parser_py.urdf import URDF

""" Starting from a computed transform T, creates a message that can be
communicated over the ROS wire. In addition to the transform itself, the message
also specifies who is related by this transform, the parent and the child."""
def convert_to_message(T, child, parent):
    t = geometry_msgs.msg.TransformStamped()
    t.header.frame_id = parent
    t.header.stamp = rospy.Time.now()
    t.child_frame_id = child
    translation = tf.transformations.translation_from_matrix(T)
    rotation = tf.transformations.quaternion_from_matrix(T)
    t.transform.translation.x = translation[0]
    t.transform.translation.y = translation[1]
    t.transform.translation.z = translation[2]
    t.transform.rotation.x = rotation[0]
    t.transform.rotation.y = rotation[1]
    t.transform.rotation.z = rotation[2]
    t.transform.rotation.w = rotation[3]        
    return t

#Our main class for computing Forward Kinematics
class ForwardKinematics(object):


    #Initialization
    def __init__(self):
        """Announces that it will publish forward kinematics results, in the form of tfMessages.
        "tf" stands for "transform library", it's ROS's way of communicating around information
        about where things are in the world"""
        self.pub_tf = rospy.Publisher("/tf", tf.msg.tfMessage, queue_size=1)

        #Loads the robot model, which contains the robot's kinematics information
        self.robot = URDF.from_parameter_server()

        #Subscribes to information about what the current joint values are.
        rospy.Subscriber("/joint_states", JointState, self.callback)


    """This function is called every time the robot publishes its joint values. We must use
    the information we get to compute forward kinematics.

    We will iterate through the entire chain, and publish the transform for each link we find.
    """
    def callback(self, joint_values):
        # First, we must extract information about the kinematics of the robot from its URDF.
        # We will start at the root and add links and joints to lists
        link_name = self.robot.get_root()
        # start with empty lists; the loop below fills them with the names of the
        # links and the joint objects it discovers (compute_transforms expects joint objects)
        link_names = []
        joints = []
        while True:
            # Find the joint connected at the end of this link, or its "child"
            # Make sure this link has a child
            if link_name not in self.robot.child_map:
                break
            # Make sure it has a single child (we don't deal with forks)
            if len(self.robot.child_map[link_name]) != 1:
                rospy.logerr("Forked kinematic chain!");
                break
            # Get the name of the child joint, as well as the link it connects to
            (joint_name, next_link_name) = self.robot.child_map[link_name][0]
            # Get the actual joint based on its name
            if joint_name not in self.robot.joint_map:
                rospy.logerr("Joint not found in map")
                break
            joints.append(self.robot.joint_map[joint_name])
            link_names.append(next_link_name)

            # Move to the next link
            link_name = next_link_name

        # Compute all the transforms based on the information we have
        all_transforms = self.compute_transforms(link_names, joints, joint_values)

        # Publish all the transforms
        self.pub_tf.publish(all_transforms)


    """ This is the function that performs the main forward kinematics computation. It accepts as
    parameters all the information needed about the joints and links of the robot, as well as the current
    values of all the joints, and must compute and return the transforms from the world frame to all the
    links, ready to be published through tf.

    Parameters are as follows:
    - link_names: a list with all the names of the robot's links, ordered from proximal to distal.
    These are also the names of the link's respective coordinate frame. In other words, the transform
    from the world to link i should be published with "world_link" as the parent frame and link_names[i]
    as the child frame.    

    - joints: a list of all the joints of the robot, in the same order as the links listed above. Each
    entry in this list is an object which contains the following fields:
     * joint.origin.xyz: the translation from the frame of the previous joint to this one
     * joint.origin.rpy: the rotation from the frame of the previous joint to this one,
       in ROLL-PITCH-YAW XYZ convention
     * joint.type: either 'fixed' or 'revolute'. A fixed joint does not move; it is meant to
       contain a static transform.
     * joint.name: the name of the current joint in the robot description
     * joint.axis: (only if type is 'revolute') the axis of rotation of the joint

     - joint_values contains information about the current joint values in the robot. It contains
     information about *all* the joints, and the ordering can vary, so we must find the relevant value
     for a particular joint you are considering. We can use the following fields:
      * joint_values.name: a list of the names of *all* the joints in the robot
      * joint_values.position: a list of the current values of *all* the joints in the robot, in the same
        order as the names in the list above.
     To find the value of the joint we care about, we must find its name in the "name" list, then take
     the value found at the same index in the "position" list.

    The function must return one tf message. The "transforms" field of this message must list *all* the
    transforms from the world coordinate frame to the links of the robot. In other words, when you are done,
    all_transforms.transforms must contain a list in which you must place all the transforms from the
    "world_link" coordinate frame to each of the coordinate frames listed in "link_names". You can use the
    "convert_to_message" function (defined above) for a convenient way to create a tf message from a
    transformation matrix.
    """

    def compute_transforms(self, link_names, joints, joint_values):
        all_transforms = tf.msg.tfMessage()
        # We start with the identity
        T = tf.transformations.identity_matrix()

        # YOUR CODE GOES HERE
        for i in range(len(link_names)):
            link_name = link_names[i]
            joint_name = joints[i].name
            if joint_name == 'world_link_lwr_arm_base_joint':
                #joint_index = joint_values.name.index(joint_name)
                T = tf.transformations.concatenate_matrices(T,tf.transformations.translation_matrix(joints[i].origin.xyz))
                joint_rpy = joints[i].origin.rpy
                T = tf.transformations.concatenate_matrices(T,tf.transformations.quaternion_matrix(tf.transformations.quaternion_from_euler(joint_rpy[0],joint_rpy[1],joint_rpy[2])))
                #all_transforms.transforms.append(convert_to_message(T,link_name,'world_link'))
                all_transforms.transforms.append(convert_to_message(T,link_name,'world'))
                continue    

            joint_index = joint_values.name.index(joint_name)
            T = tf.transformations.concatenate_matrices(T,tf.transformations.translation_matrix(joints[i].origin.xyz))
            joint_rpy = joints[i].origin.rpy
            T = tf.transformations.concatenate_matrices(T,tf.transformations.quaternion_matrix(tf.transformations.quaternion_from_euler(joint_rpy[0],joint_rpy[1],joint_rpy[2])))
            joint_pose = numpy.array(joints[i].axis)*joint_values.position[joint_index]
            T = tf.transformations.concatenate_matrices(T,tf.transformations.quaternion_matrix(tf.transformations.quaternion_from_euler(joint_pose[0],joint_pose[1],joint_pose[2])))
            #all_transforms.transforms.append(convert_to_message(T,link_name,'world_link'))
            all_transforms.transforms.append(convert_to_message(T,link_name,'world'))

        return all_transforms

if __name__ == '__main__':
    rospy.init_node('fwk', anonymous=True)
    fwk = ForwardKinematics()
    rospy.spin()
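
In equation form, the transform accumulated above for link i is the running product

  {}^{0}T_{i} = {}^{0}T_{i-1} \cdot \mathrm{Trans}(xyz_i) \cdot \mathrm{Rot}_{rpy}(rpy_i) \cdot \mathrm{Rot}(\hat{a}_i,\, q_i)

where xyz_i and rpy_i come from joint.origin, \hat{a}_i is joint.axis, and q_i is the current value of the joint read from joint_values. The code builds the last factor from Euler angles equal to axis * q_i, which coincides with a rotation of q_i about the joint axis as long as the axis is a single coordinate axis (as is typical in these URDFs).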



17. OpenAI Gym Tutorial


OpenAI Gym Tutorial

  • Tutorial on the basics of OpenAI Gym
  • install gym: pip install gym
  • what we’ll do:
    • Connect to an environment
    • Play an episode with purely random actions
  • Purpose: Familiarize ourselves with the API

Import Gym

  • First things first:
    import gym
    

Get our environment

env = gym.make('CartPole-v0')
  • Full list: https://gym.openai.com/envs
  • has short blurbs (brief descriptions):
    • what the task is
    • whether it’s been solved
    • leaderboard

Start an episode

# put ourselves in the start state
# it also returns the state
env.reset()
# out: array([-0.04533, -0.032314, -0.0146921, 0.04151])

What is the state?

  • what do these numbers mean?
  • https://github.com/openai/gym/wiki/CartPole-v0
  • Documentation is somewhat sparse

In the console

box = env.observation_space
# In : box
# Out: Box(4,)
# In : box.<TAB>
# box.contains  box.high  box.sample  box.to_jsonable  box.from_jsonable  box.low  box.shape
  • The observation_space defines the structure of the observations our environment will return. Learning agents usually need to know this before they start running, in order to set up the policy function. Some general-purpose learning agents can handle a wide range of observation types: Discrete, Box, or pixels (usually a Box(0, 255, [height, width, 3]) for RGB pixels), as in the sketch below.
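
A minimal sketch of that branching, assuming the standard gym.spaces classes:

import gym
from gym import spaces
import numpy as np

env = gym.make('CartPole-v0')
space = env.observation_space

if isinstance(space, spaces.Box):
  # continuous observations: flatten the shape to get the input dimension
  input_dim = int(np.prod(space.shape))
elif isinstance(space, spaces.Discrete):
  # discrete observations: one-hot encode over space.n possible values
  input_dim = space.n
else:
  raise NotImplementedError("unsupported observation space: %r" % space)

print("input dimension for the agent:", input_dim)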

Action Space

env.action_space
# In : env.action_space
# Out : Discrete(2)
# In : env.action_space.<TAB>
# env.action_space.contains, env.action_space.n, env.action_space.to_jsonable, env.action_space.from_jsonable, env.action_space.sample

play an episode

observation, reward, done, info = env.step(action)
  • Typically we ignore info, since it can't be used in a leaderboard submission (although it's possible it can help during training)

Finish an episode

done = False
while not done:
  observation, reward, done, _ = env.step(env.action_space.sample())
  • will end quickly since random actions can’t keep the pole up for long

Exercise

  • determine how many steps, on average, are taken when actions are randomly sampled (see the sketch below)
  • this can serve as a benchmark to compare later algorithms against
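A minimal sketch of this exercise, reusing the same Gym API as the walkthrough script below (the exact average varies from run to run):

import gym
import numpy as np

env = gym.make('CartPole-v0')
num_episodes = 100
lengths = np.empty(num_episodes)

for i in range(num_episodes):
  env.reset()
  done = False
  t = 0
  while not done:
    observation, reward, done, _ = env.step(env.action_space.sample())
    t += 1
  lengths[i] = t

print("average episode length under random actions:", lengths.mean())
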
# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
import gym
# Wiki:
# https://github.com/openai/gym/wiki/CartPole-v0
# Environment page:
# https://gym.openai.com/envs/CartPole-v0

# get the environment
env = gym.make('CartPole-v0')

# put yourself in the start state
# it also returns the state
env.reset()
# Out[50]: array([-0.04533731, -0.03231478, -0.01469216,  0.04151   ])

# what do the state variables mean?
# Num  Observation           Min       Max
# 0    Cart Position         -2.4      2.4
# 1    Cart Velocity         -Inf      Inf
# 2    Pole Angle            ~-41.8°   ~41.8°
# 3    Pole Velocity At Tip  -Inf      Inf

box = env.observation_space

# In [53]: box
# Out[53]: Box(4,)

# In [54]: box.
# box.contains       box.high           box.sample         box.to_jsonable
# box.from_jsonable  box.low            box.shape

env.action_space

# In [71]: env.action_space
# Out[71]: Discrete(2)

# In [72]: env.action_space.
# env.action_space.contains       env.action_space.n              env.action_space.to_jsonable
# env.action_space.from_jsonable  env.action_space.sample

# pick an action
action = env.action_space.sample()

# do an action
observation, reward, done, info = env.step(action)


# run through an episode
done = False
while not done:
  observation, reward, done, _ = env.step(env.action_space.sample())
print(box.contains) # <bound method Box.contains of Box(4,)>
num_states = 10**env.observation_space.shape[0]
num_actions = env.action_space.n
import numpy as np
a = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))
len(a)  # 10000
observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
observation_examples.shape
#(10000, 4)
observation_examples

array([[-9.7157693e-01, -7.4095410e+37, -4.1871965e-01,  6.7507521e+37],
       [-3.7798457e+00, -2.5458868e+38,  2.8346911e-01,  3.4012554e+38],
       [ 2.6570508e+00, -3.3872902e+38, -4.0225080e-01,  4.4559389e+37],
       ...,
       [-2.4882526e+00,  2.3085303e+38, -4.1779616e-01, -1.0087459e+38],
       [-3.6637935e-01,  8.6948986e+37, -2.4446332e-01, -3.9084766e+37],
       [ 2.8921950e+00,  3.1869669e+38, -2.1086818e-02, -8.7358295e+37]],
      dtype=float32)
a = np.random.random((20000, 4))*2 - 1  # uniform in [-1, 1)
a.shape  # (20000, 4)
a
array([[-0.24646925, -0.71345292, -0.13534508,  0.56783749],
       [ 0.94458905,  0.51363389,  0.16503288, -0.42810724],
       [ 0.09917716, -0.44244101, -0.46836587, -0.04689816],
       ...,
       [-0.59025553, -0.39560999,  0.35734313,  0.43326763],
       [ 0.03911455, -0.6548753 , -0.11969308,  0.91571025],
       [-0.10430864,  0.15491529, -0.73753709, -0.05311987]])

Reference:

Artificial Intelligence Reinforcement Learning

Advance AI : Deep-Reinforcement Learning

Cutting-Edge Deep-Reinforcement Learning


3. Appendix: Elementary results and notations


Appendix: Elementary results and notations

1. Derivatives

  • The gradient gives the slope of the tangent to the graph of the function: it is the best linear approximation of f at a point, and it points in the direction of greatest rate of increase.
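
Concretely, for f : \mathbb{R}^n \to \mathbb{R},

  \nabla f(x) = \left( \frac{\partial f}{\partial x_1}, \dots, \frac{\partial f}{\partial x_n} \right)^{T}, \qquad f(x + d) \approx f(x) + \nabla f(x)^{T} d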

2. Hessian

  • Let f be twice differentiable.

  • The Hessian gives a quadratic approximation of f at a point.
  • Gradient and Hessian are local properties that help us recognize local solutions and determine a direction to move toward the next point.
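
In symbols, the Hessian and the quadratic approximation it provides are

  \left[ \nabla^2 f(x) \right]_{ij} = \frac{\partial^2 f}{\partial x_i \partial x_j}, \qquad f(x + d) \approx f(x) + \nabla f(x)^{T} d + \frac{1}{2}\, d^{T} \nabla^2 f(x)\, d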

3. Taylor Series Expansion

  • α is the learning rate (step size)
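
The expansion here is presumably taken along a step \alpha d with learning rate \alpha; a standard statement is

  f(x + \alpha d) = f(x) + \alpha\, \nabla f(x)^{T} d + \frac{\alpha^2}{2}\, d^{T} \nabla^2 f(x)\, d + O(\alpha^3)

With the descent direction d = -\nabla f(x), the first-order term is -\alpha \lVert \nabla f(x) \rVert^2 < 0, so a sufficiently small learning rate decreases f.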

Reference

Optimization method - Stanford University


2. TensorFlow warmup


TensorFlow warmup

# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range

import numpy as np
import tensorflow as tf
import q_learning


class SGDRegressor:
  def __init__(self, D):
    print("Hello TensorFlow!")
    lr = 0.1

    # create inputs, targets, params
    # matmul doesn't like when w is 1-D
    # so we make it 2-D and then flatten the prediction
    self.w = tf.Variable(tf.random_normal(shape=(D, 1)), name='w')
    self.X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    self.Y = tf.placeholder(tf.float32, shape=(None,), name='Y')

    # make prediction and cost
    Y_hat = tf.reshape( tf.matmul(self.X, self.w), [-1] )
    delta = self.Y - Y_hat
    cost = tf.reduce_sum(delta * delta)

    # ops we want to call later
    self.train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    self.predict_op = Y_hat

    # start the session and initialize params
    init = tf.global_variables_initializer()
    self.session = tf.InteractiveSession()
    self.session.run(init)

  def partial_fit(self, X, Y):
    self.session.run(self.train_op, feed_dict={self.X: X, self.Y: Y})

  def predict(self, X):
    return self.session.run(self.predict_op, feed_dict={self.X: X})


if __name__ == '__main__':
  q_learning.SGDRegressor = SGDRegressor
  q_learning.main()

result

Hello TensorFlow!
Hello TensorFlow!
episode: 0 total reward: 34.0 eps: 1.0 avg reward (last 100): 34.0
episode: 100 total reward: 199.0 eps: 0.09950371902099892 avg reward (last 100): 105.13861386138613
episode: 200 total reward: 176.0 eps: 0.07053456158585983 avg reward (last 100): 179.85148514851485
episode: 300 total reward: 178.0 eps: 0.0576390417704235 avg reward (last 100): 170.88118811881188
episode: 400 total reward: 199.0 eps: 0.04993761694389223 avg reward (last 100): 191.97029702970298
avg reward for last 100 episodes: 198.79
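
The q_learning module imported above is the course's Q-learning script and is not reproduced here. As a standalone sanity check, the SGDRegressor class can also be exercised on synthetic linear data; a minimal sketch, assuming the same TensorFlow 1.x session API as the listing above:

import numpy as np

D = 4
w_true = np.random.randn(D)
X = (np.random.random((200, D)) * 2 - 1).astype(np.float32)  # inputs in [-1, 1)
Y = X.dot(w_true).astype(np.float32)                         # noiseless linear targets

model = SGDRegressor(D)
print("MSE before:", np.mean((model.predict(X) - Y)**2))

# feed one sample at a time; small batches keep the summed-squared-error
# gradient stable at lr = 0.1
for epoch in range(50):
  for i in range(len(X)):
    model.partial_fit(X[i:i+1], Y[i:i+1])

print("MSE after: ", np.mean((model.predict(X) - Y)**2))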
