docs/rl/ppo/experiment.html
This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game with OpenAI Gym. It runs the game environments on multiple processes to sample efficiently.
from typing import Dict

import numpy as np
import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical

from labml import monit, tracker, logger, experiment
from labml.configs import FloatDynamicHyperParam, IntDynamicHyperParam
from labml_nn.rl.game import Worker
from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
from labml_nn.rl.ppo.gae import GAE
Select device
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
The model outputs both the policy (action probabilities) and the value function.
class Model(nn.Module):
    def __init__(self):
        super().__init__()
The first convolution layer takes an 84x84 frame and produces a 20x20 frame
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
The second convolution layer takes a 20x20 frame and produces a 9x9 frame
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
The third convolution layer takes a 9x9 frame and produces a 7x7 frame
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
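As a quick check, these frame sizes follow the usual (no padding) convolution output formula $\lfloor (n - k)/s \rfloor + 1$:
$\left\lfloor \tfrac{84 - 8}{4} \right\rfloor + 1 = 20, \qquad \left\lfloor \tfrac{20 - 4}{2} \right\rfloor + 1 = 9, \qquad \left\lfloor \tfrac{9 - 3}{1} \right\rfloor + 1 = 7$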
A fully connected layer takes the flattened frame from the third convolution layer and outputs 512 features
        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
A fully connected layer to get logits for π
        self.pi_logits = nn.Linear(in_features=512, out_features=4)
A fully connected layer to get the value function
        self.value = nn.Linear(in_features=512, out_features=1)
        self.activation = nn.ReLU()
    def forward(self, obs: torch.Tensor):
        h = self.activation(self.conv1(obs))
        h = self.activation(self.conv2(h))
        h = self.activation(self.conv3(h))
        h = h.reshape((-1, 7 * 7 * 64))

        h = self.activation(self.lin(h))

        pi = Categorical(logits=self.pi_logits(h))
        value = self.value(h).reshape(-1)

        return pi, value
Scale observations from [0, 255] to [0, 1]
def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.
The trainer samples from the workers and trains the model with PPO.
class Trainer:
    def __init__(self, *,
                 updates: int, epochs: IntDynamicHyperParam,
                 n_workers: int, worker_steps: int, batches: int,
                 value_loss_coef: FloatDynamicHyperParam,
                 entropy_bonus_coef: FloatDynamicHyperParam,
                 clip_range: FloatDynamicHyperParam,
                 learning_rate: FloatDynamicHyperParam,
                 ):
number of updates
        self.updates = updates
number of epochs to train the model with sampled data
        self.epochs = epochs
number of worker processes
        self.n_workers = n_workers
number of steps to run on each process for a single update
        self.worker_steps = worker_steps
number of mini batches
        self.batches = batches
total number of samples for a single update
        self.batch_size = self.n_workers * self.worker_steps
size of a mini batch
        self.mini_batch_size = self.batch_size // self.batches
        assert (self.batch_size % self.batches == 0)
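For example, with the default configuration in main below (8 workers, 128 worker steps, 4 mini batches), batch_size = 8 × 128 = 1024 samples per update and mini_batch_size = 1024 / 4 = 256, so the assertion holds.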
Value loss coefficient
        self.value_loss_coef = value_loss_coef
Entropy bonus coefficient
        self.entropy_bonus_coef = entropy_bonus_coef
Clipping range
        self.clip_range = clip_range
Learning rate
        self.learning_rate = learning_rate
create workers
        self.workers = [Worker(47 + i) for i in range(self.n_workers)]
initialize tensors for observations
        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
        for worker in self.workers:
            worker.child.send(("reset", None))
        for i, worker in enumerate(self.workers):
            self.obs[i] = worker.child.recv()
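Worker comes from labml_nn.rl.game and is not shown here. Purely as a hedged sketch of the message protocol the trainer relies on - assuming a Gym Breakout environment, and ignoring the frame pre-processing (grayscale, 84x84 resize, 4-frame stacking) the real class performs - a child-process worker might look roughly like this:

import multiprocessing
import multiprocessing.connection


def worker_process(remote: multiprocessing.connection.Connection, seed: int):
    # Hypothetical sketch only; the real Worker also pre-processes frames
    # into the (4, 84, 84) observations the model expects.
    import gym
    env = gym.make('BreakoutNoFrameskip-v4')
    env.seed(seed)
    episode_reward, episode_length = 0., 0

    while True:
        cmd, data = remote.recv()
        if cmd == "reset":
            episode_reward, episode_length = 0., 0
            remote.send(env.reset())
        elif cmd == "step":
            obs, reward, done, _ = env.step(data)
            episode_reward += reward
            episode_length += 1
            # episode info is sent only when an episode finishes,
            # matching the `if info:` check in Trainer.sample
            info = {'reward': episode_reward, 'length': episode_length} if done else None
            if done:
                obs = env.reset()
                episode_reward, episode_length = 0., 0
            remote.send((obs, reward, done, info))
        elif cmd == "close":
            remote.close()
            break


class Worker:
    # Parent-side handle; `child` is the pipe end the trainer uses above.
    def __init__(self, seed: int):
        self.child, parent = multiprocessing.Pipe()
        self.process = multiprocessing.Process(target=worker_process, args=(parent, seed))
        self.process.start()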
model
        self.model = Model().to(device)
optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)
GAE with γ=0.99 and λ=0.95
        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)
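GAE computes the advantages from the sampled rewards, values, and episode-end flags. As a reminder, generalized advantage estimation accumulates discounted TD errors,
$\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \delta_t + \gamma\lambda\,\hat{A}_{t+1}$
computed backwards over the worker_steps samples, with the bootstrap term dropped at steps where an episode ended.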
PPO Loss
        self.ppo_loss = ClippedPPOLoss()
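ClippedPPOLoss is the clipped surrogate objective from the PPO paper. With the probability ratio $r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{OLD}}(a_t \mid s_t)}$, computed from the log-probabilities passed to it in _calc_loss below, the policy loss is the negative of
$\mathcal{L}^{CLIP}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\bar{A}_t,\ \mathrm{clip}\big(r_t(\theta),\ 1 - \epsilon,\ 1 + \epsilon\big)\,\bar{A}_t\big)\Big]$
where $\epsilon$ is the clipping range.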
Value Loss
        self.value_loss = ClippedValueFunctionLoss()
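ClippedValueFunctionLoss applies the same clipping idea to the value function: the new value estimate is kept within the clipping range of the value estimate from the sampling policy, and the larger of the two squared errors against the return target is used (up to a constant factor),
$V^{clip}(s_t) = V_{OLD}(s_t) + \mathrm{clip}\big(V_\theta(s_t) - V_{OLD}(s_t),\ -\epsilon,\ +\epsilon\big)$
$\mathcal{L}^{VF}(\theta) = \mathbb{E}_t\Big[\max\big((V_\theta(s_t) - R_t)^2,\ (V^{clip}(s_t) - R_t)^2\big)\Big]$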
Sample data with the current policy from the worker processes.
    def sample(self) -> Dict[str, torch.Tensor]:
        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
        done = np.zeros((self.n_workers, self.worker_steps), dtype=bool)
        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)

        with torch.no_grad():
sample worker_steps from each worker
            for t in range(self.worker_steps):
self.obs keeps track of the last observation from each worker, which is the input for the model to sample the next action
                obs[:, t] = self.obs
sample actions from $\pi_{\theta_{OLD}}$ for each worker; this returns arrays of size n_workers
                pi, v = self.model(obs_to_torch(self.obs))
                values[:, t] = v.cpu().numpy()
                a = pi.sample()
                actions[:, t] = a.cpu().numpy()
                log_pis[:, t] = pi.log_prob(a).cpu().numpy()
run sampled actions on each worker
                for w, worker in enumerate(self.workers):
                    worker.child.send(("step", actions[w, t]))

                for w, worker in enumerate(self.workers):
get results after executing the actions
                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()
collect episode info, which is available when an episode finishes; this includes the total reward and the length of the episode; look at Game to see how it works.
                    if info:
                        tracker.add('reward', info['reward'])
                        tracker.add('length', info['length'])
Get the value after the final step
            _, v = self.model(obs_to_torch(self.obs))
            values[:, self.worker_steps] = v.cpu().numpy()
calculate advantages
        advantages = self.gae(done, rewards, values)
        samples = {
            'obs': obs,
            'actions': actions,
            'values': values[:, :-1],
            'log_pis': log_pis,
            'advantages': advantages
        }
samples are currently in a [workers, time_step] table; we flatten it for training
        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat
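For example, with 8 workers and 128 worker steps, 'obs' goes from shape (8, 128, 4, 84, 84) to (1024, 4, 84, 84), and per-step scalars such as 'actions' go from (8, 128) to (1024,), so each row is one training sample.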
Train the model based on the collected samples.
    def train(self, samples: Dict[str, torch.Tensor]):
It learns faster with a higher number of epochs, but becomes a little unstable; that is, the average episode reward does not monotonically increase over time. Maybe reducing the clipping range would solve it.
        for _ in range(self.epochs()):
shuffle for each epoch
            indexes = torch.randperm(self.batch_size)
for each mini batch
            for start in range(0, self.batch_size, self.mini_batch_size):
get mini batch
                end = start + self.mini_batch_size
                mini_batch_indexes = indexes[start:end]
                mini_batch = {}
                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_indexes]
train
                loss = self._calc_loss(mini_batch)
Set learning rate
                for pg in self.optimizer.param_groups:
                    pg['lr'] = self.learning_rate()
Zero out the previously calculated gradients
                self.optimizer.zero_grad()
Calculate gradients
                loss.backward()
Clip gradients
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
Update parameters based on gradients
                self.optimizer.step()
Normalize the advantage function.
    @staticmethod
    def _normalize(adv: torch.Tensor):
        return (adv - adv.mean()) / (adv.std() + 1e-8)
Calculate the total loss.
    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:
$R_t$ returns sampled from $\pi_{\theta_{OLD}}$
        sampled_return = samples['values'] + samples['advantages']
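Since GAE estimates $\hat{A}_t \approx R_t - V_{OLD}(s_t)$, adding the sampled value estimates back gives $R_t = V_{OLD}(s_t) + \hat{A}_t$, which is the return target used for the value function loss below.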
$\bar{A}_t = \frac{\hat{A}_t - \mu(\hat{A}_t)}{\sigma(\hat{A}_t)}$, where $\hat{A}_t$ is the advantages sampled from $\pi_{\theta_{OLD}}$. Refer to the sample method of the Trainer class above for the calculation of $\hat{A}_t$.
        sampled_normalized_advantage = self._normalize(samples['advantages'])
Sampled observations are fed into the model to get $\pi_\theta(a_t \mid s_t)$ and $V^{\pi_\theta}(s_t)$; we treat observations as states
        pi, value = self.model(samples['obs'])
$\log \pi_\theta(a_t \mid s_t)$, where $a_t$ are the actions sampled from $\pi_{\theta_{OLD}}$
        log_pi = pi.log_prob(samples['actions'])
Calculate policy loss
        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())
Calculate Entropy Bonus
$\mathcal{L}^{EB}(\theta) = \mathbb{E}\big[S[\pi_\theta](s_t)\big]$
        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()
Calculate value function loss
        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())
$\mathcal{L}^{CLIP+VF+EB}(\theta) = \mathcal{L}^{CLIP}(\theta) + c_1\,\mathcal{L}^{VF}(\theta) - c_2\,\mathcal{L}^{EB}(\theta)$
        loss = (policy_loss
                + self.value_loss_coef() * value_loss
                - self.entropy_bonus_coef() * entropy_bonus)
approximate KL divergence between $\pi_{\theta_{OLD}}$ and $\pi_\theta$, for monitoring
        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
Add to tracker
        tracker.add({'policy_reward': -policy_loss,
                     'value_loss': value_loss,
                     'entropy_bonus': entropy_bonus,
                     'kl_div': approx_kl_divergence,
                     'clip_fraction': self.ppo_loss.clip_fraction})

        return loss
Run the training loop.
    def run_training_loop(self):
keep track of the last 100 episodes' reward and length
        tracker.set_queue('reward', 100, True)
        tracker.set_queue('length', 100, True)

        for update in monit.loop(self.updates):
sample with current policy
            samples = self.sample()
train the model
            self.train(samples)
Save tracked indicators.
            tracker.save()
Add a new line to the screen periodically
            if (update + 1) % 1_000 == 0:
                logger.log()
Stop the workers
    def destroy(self):
        for worker in self.workers:
            worker.child.send(("close", None))
def main():
Create the experiment
    experiment.create(name='ppo')
Configurations
    configs = {
Number of updates
        'updates': 10000,
⚙️ Number of epochs to train the model with sampled data. You can change this while the experiment is running.
        'epochs': IntDynamicHyperParam(8),
Number of worker processes
        'n_workers': 8,
Number of steps to run on each process for a single update
        'worker_steps': 128,
Number of mini batches
        'batches': 4,
⚙️ Value loss coefficient. You can change this while the experiment is running.
        'value_loss_coef': FloatDynamicHyperParam(0.5),
⚙️ Entropy bonus coefficient. You can change this while the experiment is running.
        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),
⚙️ Clip range. You can change this while the experiment is running.
        'clip_range': FloatDynamicHyperParam(0.1),
⚙️ Learning rate. You can change this while the experiment is running.
        'learning_rate': FloatDynamicHyperParam(1e-3, (0, 1e-3)),
    }

    experiment.configs(configs)
Initialize the trainer
    m = Trainer(**configs)
Run and monitor the experiment
    with experiment.start():
        m.run_training_loop()
Stop the workers
    m.destroy()
if __name__ == "__main__":
    main()