Back to Annotated Deep Learning Paper Implementations

Evaluation

docs/neox/evaluation/index.html

latest — 8.4 KB
Original Source

home / neox / evaluation

[View code on Github](https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/evaluation/__init__.py)

#

Evaluation

This is the code to test the model on EleutherAI/lm-evaluation-harness.

15importmath16fromtypingimportList1718importtorch19importtorch.nn.functionalasF20fromlm\_evalimporttasks,evaluator,utils21fromlm\_eval.baseimportBaseLM22fromtokenizersimportTokenizer23fromtorchimportnn24fromtqdmimporttqdm2526fromlabmlimportmonit27fromlabml\_nn.neox.tokenizerimportget\_tokenizer

#

Evaluation Harness Adapter

This is based on the adapter from EleutherAI/gpt-neox

class EvalHarnessAdapter(BaseLM):
    """
    ## Evaluation Harness Adapter

    Adapter that lets [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
    drive an autoregressive model. Based on the adapter from
    [EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox).

    Subclasses must implement `_model_call` to run the actual model.
    """

    def __init__(self, tokenizer: Tokenizer, vocab_size: int, batch_size: int):
        """
        :param tokenizer: is the Huggingface [Tokenizer](https://huggingface.co/tokenizers)
        :param vocab_size: is the size of the vocabulary
            (this differs from the tokenizer vocab size since neox adds some extra
            to make the embedding layer model parallel)
        :param batch_size: is the batch size
        """
        super().__init__()
        self.tokenizer = tokenizer
        # Token id used to mark end-of-text
        self._eot_token_id = self.tokenizer.token_to_id("<|endoftext|>")
        self._vocab_size = vocab_size
        self._batch_size = batch_size

    @property
    def device(self):
        # The harness should never need a device from this adapter
        raise RuntimeError()

    @property
    def vocab_size(self):
        """Size of the vocabulary"""
        return self._vocab_size

    @property
    def eot_token_id(self):
        """End-of-text token"""
        return self._eot_token_id

    @property
    def max_length(self):
        """Maximum sequence length"""
        return 2048

    @property
    def max_gen_toks(self):
        """Maximum number of tokens to generate"""
        return 128

    @property
    def batch_size(self):
        """Batch size"""
        return self._batch_size

    def tok_encode(self, string: str):
        """Encode a given text"""
        return self.tokenizer.encode(string).ids

    def tok_decode(self, tokens: List[int]):
        """Decode text from token ids"""
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps: torch.Tensor):
        # Implemented by subclasses that wrap an actual model
        raise NotImplementedError

    def _model_generate(self, context, max_length, eos_token_id):
        # Generation is not supported by this adapter
        raise RuntimeError()

    def greedy_until(self, requests):
        # Greedy generation is not supported by this adapter
        raise RuntimeError()

    @torch.no_grad()
    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        """
        ### Get log-likelihoods of the next tokens

        :param requests: List of requests containing the context and the
            expected continuation.
        :param disable_tqdm: If `True`, disable tqdm progress bar.
        """
        # Collected results
        results = []

        # Sort requests so the longest sequences come first; sequences of
        # similar length then end up in the same batch.
        def _collate(req):
            joined = req[1] + req[2]
            return -len(joined), tuple(joined)

        reord = utils.Reorderer(requests, _collate)

        # Process `batch_size` requests at a time
        ordered = tqdm(reord.get_reordered(), disable=disable_tqdm)
        for batch in utils.chunks(ordered, self.batch_size):
            # Padded input tensors for this batch
            batch_inputs = []
            # Continuation token lists for this batch
            batch_continuations = []
            # Unpadded input lengths
            batch_lengths = []
            # Common padded length for the batch; fixed by the first
            # (longest) sequence since the batch is length-sorted.
            padded_length = None

            for _, context_enc, continuation_enc in batch:
                # Context followed by continuation
                tokens = context_enc + continuation_enc
                # Truncate from the left if too long, then drop the final
                # token (the model predicts it; it is never an input).
                tokens = tokens[-(self.max_length + 1):][:-1]
                seq = torch.tensor(tokens, dtype=torch.long)
                seq_len = seq.shape[0]

                # Round the longest sequence up to a multiple of 32
                if padded_length is None:
                    padded_length = int(math.ceil(seq_len / 32)) * 32

                # Right-pad with zeros to the common length
                padding = torch.zeros(padded_length - seq_len, dtype=torch.long)
                seq = torch.cat([seq, padding], dim=0)

                batch_inputs.append(seq)
                batch_continuations.append(continuation_enc)
                batch_lengths.append(seq_len)

            # Run the model on the whole batch
            batch_logits = self._model_call(torch.stack(batch_inputs))
            # Log-probabilities over the vocabulary
            log_probs = F.log_softmax(batch_logits, dim=-1)

            # Score each request in the batch
            for logits, seq_len, cont_toks in zip(log_probs, batch_lengths, batch_continuations):
                n_cont = len(cont_toks)
                # Positions that predict the continuation tokens
                logits = logits[seq_len - n_cont: seq_len]
                # Greedy predictions at those positions
                greedy_tokens = logits.argmax(dim=-1)
                # Target continuation tokens
                cont_toks = torch.tensor(cont_toks, dtype=torch.long).to(logits.device)
                # Whether greedy decoding reproduces the continuation exactly
                max_equal = (greedy_tokens == cont_toks).all()
                # Log-likelihoods of the target tokens
                logits = torch.gather(logits, 1, cont_toks[:, None])
                # Total log-likelihood and exact-match flag
                results.append((float(logits.sum()), bool(max_equal)))

        # Restore the caller's request order
        return reord.get_original(results)

    @torch.no_grad()
    def run_eval(self, name: str, eval_tasks: List[str]):
        """
        ### Run given evaluations

        :param name: is a name to record in the results' config
        :param eval_tasks: is the list of task names to evaluate on
        """
        # Run the lm-evaluation-harness evaluator
        results = evaluator.evaluate(lm=self, task_dict=tasks.get_task_dict(eval_tasks))

        # Attach configuration details
        results["config"] = {
            "name": name,
        }

        return results

#

NeoX Evaluation Harness Adapter

This extends the adapter above with an actual model and device, based on the adapter from EleutherAI/gpt-neox

class NoeXEvalHarnessAdapter(EvalHarnessAdapter):
    """
    ## Evaluation Harness Adapter

    This is based on the adapter from
    [EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox).
    It wraps a concrete model and moves inputs to its device.
    """

    def __init__(self, model: nn.Module, tokenizer: Tokenizer, vocab_size: int,
                 batch_size: int, device: torch.device):
        """
        :param model: is model
        :param tokenizer: is the Huggingface [Tokenizer](https://huggingface.co/tokenizers)
        :param vocab_size: is the size of the vocabulary
            (this differs from the tokenizer vocab size since neox adds some extra
            to make the embedding layer model parallel)
        :param batch_size: is the batch size
        :param device: is the device of the model
        """
        super().__init__(tokenizer, vocab_size, batch_size)
        self.model = model
        # Device the inputs must be moved to before calling the model
        self._device = device

    def _model_call(self, inps: torch.Tensor):
        """Call the model after moving the inputs to the model's device"""
        return self.model(inps.to(self._device))

#

Run evaluation harness with a given model

def run_eval_harness(model: nn.Module, name: str, eval_tasks: List[str],
                     device: torch.device, batch_size: int = 8):
    """
    ## Run evaluation harness with a given model

    :param model: is the model to evaluate
    :param name: is a name recorded in the results
    :param eval_tasks: is the list of tasks; a default set is used when empty
    :param device: is the device of the model
    :param batch_size: is the batch size
    """
    # Load the tokenizer
    with monit.section('Load tokenizer'):
        tokenizer = get_tokenizer()

    # Fall back to the default task set when none is specified
    if not eval_tasks:
        eval_tasks = [
            "anli_r1",
            "anli_r2",
            "anli_r3",
            "hellaswag",
            "lambada",
            "piqa",
            "winogrande",
            "wsc",
            "mathqa",
        ]

    # Create the adapter. 50_432 is the neox embedding size, which is larger
    # than the tokenizer vocabulary to allow model-parallel embeddings.
    adapter = NoeXEvalHarnessAdapter(model, tokenizer, 50_432, batch_size, device)

    # Run the evaluations
    return adapter.run_eval(name, eval_tasks)

labml.ai