DeepNorm Experiment

import copy

import torch
import torch.nn as nn

from labml import experiment
from labml.configs import option
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
from labml_nn.transformers import MultiHeadAttention
from labml_nn.transformers.feed_forward import FeedForward

Auto-Regressive model

This is an autoregressive transformer model that uses DeepNorm.

class AutoregressiveTransformer(nn.Module):

  • n_tokens is the number of tokens in the vocabulary
  • d_model is the embedding size
  • n_layers is the number of transformer layers
  • layer is the layer. We use n_layers copies of this for the transformer.

    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):

        super().__init__()

Transformer with n_layers layers

        self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])

Token embedding layer

        self.emb = nn.Embedding(n_tokens, d_model)

Readout layer

        self.readout = nn.Linear(d_model, n_tokens)

  • x is the input token tensor of shape [seq_len, batch_size]

    def forward(self, x: torch.Tensor):

Get the token embeddings

        x = self.emb(x)

Transformer encoder

        x = self.transformer(x)

Get logits

        x = self.readout(x)

Return results

        return x, None
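
A minimal forward-pass sketch of this model, assuming labml-nn is installed; the sizes below (a 65-character vocabulary, 4 layers) are illustrative, not the experiment's configuration:

layer = DeepNormTransformerLayer(d_model=64,
                                 deep_norm_alpha=(2. * 4) ** (1. / 4.),
                                 deep_norm_beta=(8. * 4) ** -(1. / 4.),
                                 feed_forward=FeedForward(d_model=64, d_ff=256),
                                 self_attn=MultiHeadAttention(4, 64, dropout_prob=0.0))
model = AutoregressiveTransformer(n_tokens=65, d_model=64, n_layers=4, layer=layer)
# Token indexes of shape [seq_len, batch_size]
tokens = torch.randint(0, 65, (256, 16))
# Logits of shape [seq_len, batch_size, n_tokens]; the second return value is an unused state
logits, _ = model(tokens)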

Configurations

This inherits from NLPAutoRegressionConfigs

class Configs(NLPAutoRegressionConfigs):

Model

    model: AutoregressiveTransformer

Number of layers

    n_layers: int = 32

α and β for DeepNorm

    deep_norm_alpha: float
    deep_norm_beta: float

Number of heads in the attention

    n_heads: int = 4

Embedding size

    d_model: int = 64

Size of each attention head

    d_k: int = 16

Calculate α

α = (2M)^(1/4), where M is the number of layers

@option(Configs.deep_norm_alpha)
def _deep_norm_alpha(c: Configs):

    return (2. * c.n_layers) ** (1. / 4.)
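
For example, with the M = 50 layers used in main() below, α = (2 · 50)^(1/4) = 100^(1/4) ≈ 3.16.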

Calculate β

β = (8M)^(-1/4)

@option(Configs.deep_norm_beta)
def _deep_norm_beta(c: Configs):

    return (8. * c.n_layers) ** -(1. / 4.)
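
Similarly, with M = 50 layers, β = (8 · 50)^(-1/4) = 400^(-1/4) ≈ 0.22; in DeepNorm, β is the gain used to scale the initialization of the residual-branch weights.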

Initialize the model

@option(Configs.model)
def _model(c: Configs):

    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
                                  DeepNormTransformerLayer(d_model=c.d_model,
                                                           deep_norm_alpha=c.deep_norm_alpha,
                                                           deep_norm_beta=c.deep_norm_beta,
                                                           feed_forward=FeedForward(d_model=c.d_model,
                                                                                    d_ff=c.d_model * 4),
                                                           self_attn=MultiHeadAttention(c.n_heads, c.d_model,
                                                                                        dropout_prob=0.0)))

    return m.to(c.device)

Create and run the experiment

def main():

Create experiment

    experiment.create(name="deep_norm", writers={'screen', 'web_api'})

Create configs

    conf = Configs()

Override configurations

    experiment.configs(conf, {

Use character level tokenizer

        'tokenizer': 'character',

Prompt separator is blank

        'prompt_separator': '',

Starting prompt for sampling

        'prompt': 'It is ',

Use Tiny Shakespeare dataset

        'text': 'tiny_shakespeare',

Use a context size of 256

        'seq_len': 256,

Train for 32 epochs

        'epochs': 32,

Batch size 16

        'batch_size': 16,

Switch between training and validation 10 times per epoch

        'inner_iterations': 10,

Number of layers

        'n_layers': 50,

Adam optimizer with no warmup

        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 1.25e-4,
    })

Set model(s) for saving and loading

    experiment.add_pytorch_models({'model': conf.model})

Start the experiment

    with experiment.start():

Run training

        conf.run()

if __name__ == '__main__':
    main()
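
To try it, assuming the labml-nn package is installed (for example via pip install labml-nn), the script can be run as a module, e.g. python -m labml_nn.normalization.deep_norm.experiment; the exact invocation depends on how the code is laid out locally.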
