언어모델 실습

AI TECH

언어모델 실습

prefer_all 2022. 11. 24. 12:59

Tokenizing

1. 어절 단위 : 띄어쓰기 단위 word

# Word-level tokenization: cut the sentence at every single space character.
text = "이순신은 조선 중기의 무신이다."
tokenized_text = text.split(" ")
print(tokenized_text)
# ['이순신은', '조선', '중기의', '무신이다.']

# Right-pad the token list with the literal string "padding" until it holds
# max_seq_length entries (nothing is appended when it is already that long).
max_seq_length = 10
tokenized_text.extend("padding" for _ in range(max_seq_length - len(tokenized_text)))
print(tokenized_text)
# ['이순신은', '조선', '중기의', '무신이다.', 'padding', 'padding', 'padding', 'padding', 'padding', 'padding']

token의 개수가 부족할 때는 padding 처리, 개수가 많을 때는 자르기

여러 문장이 들어오면 batch 단위로 자르기

class Tokenizer:
    """Whitespace tokenizer with optional fixed-length padding and truncation.

    All settings are plain instance attributes so callers can change them
    after construction:

    * ``tokenizer_type_list`` - tokenization schemes ``tokenize`` accepts.
    * ``pad_token`` - string appended when padding is enabled.
    * ``max_seq_length`` - every result is cut to at most this many tokens.
    * ``padding`` - when true, short results are padded up to
      ``max_seq_length``.
    """

    def __init__(self):
        self.tokenizer_type_list = ["word"]
        self.pad_token = "<pad>"
        self.max_seq_length = 10
        self.padding = False

    def tokenize(self, text, tokenizer_type):
        """Split ``text`` into tokens, then pad and truncate.

        Args:
            text: the sentence to tokenize.
            tokenizer_type: one of ``self.tokenizer_type_list``.

        Returns:
            A list of at most ``self.max_seq_length`` tokens. With
            ``self.padding`` enabled the list has exactly that length.

        Raises:
            AssertionError: if ``tokenizer_type`` is not supported.
        """
        assert tokenizer_type in self.tokenizer_type_list, "정의되지 않은 tokenizer_type입니다."
        if tokenizer_type == "word":
            tokenized_text = text.split(" ")
        if self.padding:
            # A negative repeat count yields an empty list, so over-long
            # inputs are left alone here and handled by the slice below.
            tokenized_text += [self.pad_token] * (self.max_seq_length - len(tokenized_text))
        return tokenized_text[:self.max_seq_length]

    def batch_tokenize(self, texts, tokenizer_type):
        """Tokenize every sentence in ``texts``.

        A new list is returned and ``texts`` is left untouched. Overwriting
        the caller's list in place would replace its strings with token
        lists, so a second call on the same list would fail, and immutable
        inputs such as tuples could not be used at all.

        Args:
            texts: iterable of sentences.
            tokenizer_type: one of ``self.tokenizer_type_list``.

        Returns:
            A list with one token list per input sentence, in input order.
        """
        return [self.tokenize(text, tokenizer_type) for text in texts]

my_tokenizer = Tokenizer()
my_tokenizer.pad_token = "[PAD]"
my_tokenizer.max_seq_length = 10
my_tokenizer.padding = True

print(my_tokenizer.tokenize("이순신은 조선 중기의 무신이다.", "word"))
# ['이순신은', '조선', '중기의', '무신이다.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
print(my_tokenizer.batch_tokenize(["이순신은 조선 중기의 무신이다.", "그는 임진왜란을 승리로 이끌었다."], "word"))
# [['이순신은', '조선', '중기의', '무신이다.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['그는', '임진왜란을', '승리로', '이끌었다.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']]

2. 형태소 단위 : mecab 라이브러리 사용 morph

from konlpy.tag import Mecab

mecab = Mecab()
print(mecab.pos("아버지가방에들어가신다."))
# [('아버지', 'NNG'), ('가', 'JKS'), ('방', 'NNG'), ('에', 'JKB'), ('들어가', 'VV'), ('신다', 'EP+EF'), ('.', 'SF')]

text = "이순신은 조선 중기의 무신이다."
# 이순신 -> PS
# 조선 -> DT TI
# 중기 -> TI
# 무신 -> OC
# 이순신 - 직업 - 무신
# 이순신 - 출생지 - 조선

tokenized_text = [lemma[0] for lemma in mecab.pos(text)]
print(tokenized_text)
# ['이순신', '은', '조선', '중기', '의', '무신', '이', '다', '.']

3. 음절 단위: 한 글자씩 syllable

text = "이순신은 조선 중기의 무신이다."
tokenized_text = list(text)    # list() on a string yields one element per character, spaces and punctuation included
print(tokenized_text)  
# ['이', '순', '신', '은', ' ', '조', '선', ' ', '중', '기', '의', ' ', '무', '신', '이', '다', '.']

4. 자소 단위: 하나의 문자도 초성, 중성, 종성 최대 3개의 자소로 분리

hgtk 라이브러리 사용 jaso

import hgtk
text = "이순신은 조선 중기의 무신이다."
tokenized_text = list(hgtk.text.decompose(text))
print(tokenized_text)
# ㅇ ㅣ ㅅ ㅜ ㄴ ㅅ ㅣ ...

5. WordPiece

학습한 tokenizer를 저장할 디렉터리를 새로 만든다

!mkdir wordPieceTokenizer

from tokenizers import BertWordPieceTokenizer

# Initialize an empty tokenizer
wp_tokenizer = BertWordPieceTokenizer(
    clean_text=True,    # NOTE(review): per the tokenizers docs this removes control characters and normalizes whitespace - confirm
    handle_chinese_chars=True,
    strip_accents=False,    # leave accents untouched; NOTE(review): the earlier "[YepHamza] -> [Yep, Hamza]" example does not look like accent stripping - confirm
    lowercase=False,
)

# And then train
wp_tokenizer.train(
    files="my_data/wiki_20190620_small.txt",
    vocab_size=10000,
    min_frequency=2, # pieces seen fewer than 2 times are not added to the vocab (per the tokenizers docs - confirm)
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##"
)

# Save the files into the wordPieceTokenizer directory, using "my_tokenizer" as the file-name prefix
wp_tokenizer.save_model("wordPieceTokenizer", "my_tokenizer")

text = "이순신은 조선 중기의 무신이다."
tokenized_text = wp_tokenizer.encode(text)
print(tokenized_text)
print(tokenized_text.tokens)
print(tokenized_text.ids)

class Tokenizer:
    """Multi-scheme tokenizer with optional fixed-length padding and truncation.

    Supported ``tokenizer_type`` values are listed in
    ``self.tokenizer_type_list``. The "morph", "jaso" and "wordPiece" schemes
    use the module-level ``mecab``, ``hgtk`` and ``wp_tokenizer`` objects,
    which must exist before those schemes are requested.

    Settings (``pad_token``, ``max_seq_length``, ``padding``) are plain
    instance attributes so callers can change them after construction.
    """

    def __init__(self):
        self.tokenizer_type_list = ["word", "morph", "syllable", "jaso", "wordPiece"]
        self.pad_token = "<pad>"
        self.max_seq_length = 10
        self.padding = False

    def tokenize(self, text, tokenizer_type):
        """Split ``text`` with the chosen scheme, then pad and truncate.

        Args:
            text: the sentence to tokenize.
            tokenizer_type: one of ``self.tokenizer_type_list``.

        Returns:
            A list of at most ``self.max_seq_length`` tokens. With
            ``self.padding`` enabled the list has exactly that length.

        Raises:
            AssertionError: if ``tokenizer_type`` is not supported.
        """
        assert tokenizer_type in self.tokenizer_type_list, "정의되지 않은 tokenizer_type입니다."
        if tokenizer_type == "word":
            # Split on every single space.
            tokenized_text = text.split(" ")
        elif tokenizer_type == "morph":
            # mecab.pos yields (surface, tag) pairs; keep only the surface form.
            tokenized_text = [lemma[0] for lemma in mecab.pos(text)]
        elif tokenizer_type == "syllable":
            # One token per character, spaces included.
            tokenized_text = list(text)
        elif tokenizer_type == "jaso":
            # One token per character of the decomposed string.
            tokenized_text = list(hgtk.text.decompose(text))
        elif tokenizer_type == "wordPiece":
            tokenized_text = wp_tokenizer.encode(text).tokens
        if self.padding:
            # A negative repeat count yields an empty list, so over-long
            # inputs are left alone here and handled by the slice below.
            tokenized_text += [self.pad_token] * (self.max_seq_length - len(tokenized_text))
        return tokenized_text[:self.max_seq_length]

    def batch_tokenize(self, texts, tokenizer_type):
        """Tokenize every sentence in ``texts``.

        A new list is returned and ``texts`` is left untouched. Overwriting
        the caller's list in place would replace its strings with token
        lists, so a second call on the same list would fail, and immutable
        inputs such as tuples could not be used at all.

        Args:
            texts: iterable of sentences.
            tokenizer_type: one of ``self.tokenizer_type_list``.

        Returns:
            A list with one token list per input sentence, in input order.
        """
        return [self.tokenize(text, tokenizer_type) for text in texts]

my_tokenizer = Tokenizer()
my_tokenizer.pad_token = "[PAD]"
my_tokenizer.max_seq_length = 10
my_tokenizer.padding = True

print(my_tokenizer.tokenize("이순신은 조선 중기의 무신이다.", "word"))
# ["word", "morph", "syllable", "jaso", "wordPiece"]

BERT 실습

(3강: BERT 언어모델 소개)

1. Tokenizer 에 대한 이해

- [PAD] 추가하기, special token

Tokenizer 선언

from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Store the model we want to use
# Multi-lingual bert model은 BertTokenizerFast class로 되어있습니다
MODEL_NAME = "bert-base-multilingual-cased" 

# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(type(tokenizer)) # 출력값: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
print(tokenizer.vocab_size) # 출력값: 119547

예제로 Tokenizer 살펴보기

text = "이순신은 조선 중기의 무신이다."

tokenized_input_text = tokenizer(text, return_tensors="pt")
for key, value in tokenized_input_text.items():
    print("{}:\n\t{}".format(key, value))
    
'''
출력값:
input_ids:
	tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
'''

tokenized_text = tokenizer.tokenize(text)
print(tokenized_text) # 출력값: ['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.']
input_ids = tokenizer.encode(text)
print(input_ids) # 출력값: [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids) # 출력값: [CLS] 이순신은 조선 중기의 무신이다. [SEP]

tokenized_text = tokenizer.tokenize(
    text,
    add_special_tokens=False,
    max_length=5,
    truncation=True
    ) # 최대 길이는 5개이고, 그 이상은 truncate (자른다)
print(tokenized_text) # ['이', '##순', '##신', '##은', '조선']

input_ids = tokenizer.encode(
    text,
    add_special_tokens=False,
    max_length=5,
    truncation=True
    ) # 음절이나 어절 단위로 잘리는 규칙이 있는 게 아니라, 입력 문장의 토큰을 기준으로 잘림
print(input_ids) # [9638, 119064, 25387, 10892, 59906]
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids) # 이순신은 조선

Tokenizer의 padding 추가하기 [PAD]

print(tokenizer.pad_token) # [PAD]
print(tokenizer.pad_token_id) # 0

tokenized_text = tokenizer.tokenize(
    text,
    add_special_tokens=False,
    max_length=20,
    padding="max_length"
    )
print(tokenized_text)
# ['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']

input_ids = tokenizer.encode(
    text,
    add_special_tokens=False,
    max_length=20,
    padding="max_length"
    )
print(input_ids)
# [9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 0, 0, 0, 0, 0, 0, 0, 0, 0]

decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)
# 이순신은 조선 중기의 무신이다. [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Tokenizer에 새로운 Token을 추가하기

tokenizer.add_tokens로 새 토큰을 추가하면, decode 결과에서 [UNK] 토큰이 줄어든다.

text = "깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여"

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)
'''
token 추가 이전:
['[UNK]', '리', '##뿔', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '[UNK]', '냐', '##왜', '##쇼', '[UNK]', '[UNK]']
[100, 9238, 119021, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 100, 9002, 119164, 119060, 100, 100]
[UNK] 리뿔이 뜨럽거 므리커럭이 [UNK] 냐왜쇼 [UNK] [UNK]
'''

added_token_num = tokenizer.add_tokens(["깟뻬뜨랑", "케쇽", "우뤼갸", "쳥쇼", "섀료"])

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)
'''
token 추가 이후:
['깟뻬뜨랑', '리', '##뿔', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '케쇽', '냐', '##왜', '##쇼', '우뤼갸', '쳥쇼', '섀료', '다', '##혀', '##뚜', '##여']
[119547, 9238, 119021, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 119548, 9002, 119164, 119060, 119549, 119550, 119551, 9056, 80579, 118841, 29935]
깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼 섀료 다혀뚜여
'''

Special token 추가하기

tokenizer.add_special_tokens 입력을 dictionary 형태로

text = "[SHKIM]이순신은 조선 중기의 무신이다.[/SHKIM]"

added_token_num += tokenizer.add_special_tokens({"additional_special_tokens":["[SHKIM]", "[/SHKIM]"]})
tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)
decoded_ids = tokenizer.decode(input_ids,skip_special_tokens=True)
print(decoded_ids)
'''
['[SHKIM]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[/SHKIM]']
[119552, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 119553]
[SHKIM] 이순신은 조선 중기의 무신이다. [/SHKIM]
이순신은 조선 중기의 무신이다.
'''

위의 두 가지 예제로 7개의 token이 새롭게 추가되었다. => 모델 리사이즈할 때 필요한 정보이다

print(added_token_num) # 7

자연어의 task에 따라서 다양하게 사용할 수 있다.

# 1. Single segment input
single_seg_input = tokenizer("이순신은 조선 중기의 무신이다.")

print("Single segment token (str): {}".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))
print("Single segment token (int): {}".format(single_seg_input['input_ids']))
print("Single segment type       : {}".format(single_seg_input['token_type_ids']))
'''
Single segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]']
Single segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]
Single segment type       : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
'''

# 2. Multiple segment input
multi_seg_input = tokenizer("이순신은 조선 중기의 무신이다.", "그는 임진왜란을 승리로 이끌었다.")
# Segments are concatenated in the input to the model, with a [SEP] token closing each segment
print("Multi segment token (str): {}".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))
print("Multi segment token (int): {}".format(multi_seg_input['input_ids']))
print("Multi segment type       : {}".format(multi_seg_input['token_type_ids']))
'''
Multi segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '.', '[SEP]']
Multi segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 119, 102]
Multi segment type       : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
'''

# 3. 배열로 입력 시 출력도 배열로 저장된다
# Padding highlight
tokens = tokenizer(
    ["이순신은 조선 중기의 무신이다.", "그는 임진왜란을 승리로 이끌었다."], 
    padding=True  # First sentence will have some PADDED tokens to match second sequence length
)

for i in range(2):
    print("Tokens (int)      : {}".format(tokens['input_ids'][i]))
    print("Tokens (str)      : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]]))
    print("Tokens (attn_mask): {}".format(tokens['attention_mask'][i]))
    print()
'''
Tokens (int)      : [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102, 0]
Tokens (str)      : ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]', '[PAD]']
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

Tokens (int)      : [101, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 119, 102]
Tokens (str)      : ['[CLS]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '.', '[SEP]']
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
'''

2. BERT 모델을 통한 [MASK] 토큰 예측하기 pipeline

from transformers import pipeline

nlp_fill = pipeline('fill-mask', model=MODEL_NAME)
nlp_fill("이순신은 [MASK] 중기의 무신이다.")
'''
[{'score': 0.8747126460075378,
  'sequence': '[CLS] 이순신은 조선 중기의 무신이다. [SEP]',
  'token': 59906,
  'token_str': '조선'},
 {'score': 0.06436426192522049,
  'sequence': '[CLS] 이순신은 청 중기의 무신이다. [SEP]',
  'token': 9751,
  'token_str': '청'}]
'''

from transformers import BertForMaskedLM, AutoTokenizer
from transformers import pipeline

# Store the model we want to use
MODEL_NAME = "bert-base-multilingual-cased"

# Load the masked-LM head under its own name. Rebinding `model` here would
# replace the plain encoder loaded earlier, and the cells that follow read
# `outputs.last_hidden_state` / `outputs.pooler_output`, which the masked-LM
# model's output does not provide.
mlm_model = BertForMaskedLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Ask the pipeline for the three best candidates for the [MASK] position.
nlp_fill = pipeline('fill-mask', top_k=3, model=mlm_model, tokenizer=tokenizer)
nlp_fill('Barack Hussein Obama graduated from [MASK] University.')

모델의 출력 결과도 획득할 수 있다.

tokens_pt = tokenizer("이순신은 조선 중기의 무신이다.", return_tensors="pt")

outputs = model(**tokens_pt)
last_hidden_state = outputs.last_hidden_state
pooler_output = outputs.pooler_output

print("\nToken wise output: {}, Pooled output: {}".format(last_hidden_state.shape, pooler_output.shape))
# Token wise output: 입력된 문장이 13개의 토큰임. 그 13개의 토큰에 대한 벡터값
# Pooled output: [CLS] 토큰의 벡터만 얻어낼 수 있음. ([CLS] token to 768 dimension)
# Token wise output: torch.Size([1, 13, 768]), Pooled output: torch.Size([1, 768])

만약에 vocab을 새롭게 추가했다면, 반드시 model의 embedding layer 사이즈를 늘려주어야 한다.

print(model.get_input_embeddings()) # Embedding(119547, 768, padding_idx=0)
model.resize_token_embeddings(tokenizer.vocab_size + added_token_num)
print(model.get_input_embeddings()) # Embedding(119554, 768)

[CLS] 토큰을 활용해 문장의 유사도를 측정할 수 있다.

from torch import nn

sent1 = tokenizer("오늘 하루 어떻게 보냈나요?", return_tensors="pt")
sent2 = tokenizer("오늘은 어떤 하루를 보내셨나요?", return_tensors="pt")
sent3 = tokenizer("이순신은 조선 중기의 무신이다.", return_tensors="pt")

outputs = model(**sent1)
sent_1_pooler_output = outputs.pooler_output

outputs = model(**sent2)
sent_2_pooler_output = outputs.pooler_output

outputs = model(**sent3)
sent_3_pooler_output = outputs.pooler_output

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
print(cos(sent_1_pooler_output, sent_2_pooler_output)) # tensor([0.9757])
print(cos(sent_2_pooler_output, sent_3_pooler_output)) # tensor([0.6075])

3. Pretrained BERT의 [CLS] 토큰을 이용한 챗봇 만들기

사전 준비물
1. Pretrain된 BERT 모델.
2. 질의응답 Dataset.

진행 과정
1. 사용자의 질문(query)를 입력 받는다.
2. query를 pretrained BERT의 입력으로 넣어, query 문장에 해당하는 [CLS] token hidden을 얻는다.
3. 사전에 준비된 질의응답 Dataset에 존재하는 모든 질문들을 pretrained BERT의 입력으로 넣어, 질문들에 해당하는 [CLS] token hidden을 얻는다.
4. query의 [CLS] token hidden과 질문들의 [CLS] token hidden간의 코사인 유사도를 구한다.
5. 가장 높은 코사인 유사도를 가진 질문의 답변을 반환시켜준다.
6. 위 과정 반복.

0. Pretrained BERT 불러오기

import torch
from transformers import AutoModel, AutoTokenizer
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

0. 질의응답 Dataset 만들기

chatbot_Question = ['기차 타고 여행 가고 싶어','꿈이 이루어질까?','내년에는 더 행복해질려고 이렇게 힘든가봅니다', '간만에 휴식 중', '오늘도 힘차게!'] # 질문
chatbot_Answer = ['꿈꾸던 여행이네요.','현실을 꿈처럼 만들어봐요.','더 행복해질 거예요.', '휴식도 필요하죠', '아자아자 화이팅!!'] # 답변

1. [CLS] 토큰을 얻기 위한 함수 정의

def get_cls_token(sent_A):
    """Return the final-layer hidden vector at sequence position 0 for ``sent_A``.

    Relies on the module-level ``tokenizer`` and ``model``. The input is
    encoded with special tokens added and truncated to at most 128 tokens.

    Args:
        sent_A: the sentence to encode.

    Returns:
        A NumPy array sliced from ``last_hidden_state[:, 0, :]``.
    """
    model.eval()
    encoded = tokenizer(
        sent_A,
        return_tensors="pt",
        truncation=True,
        add_special_tokens=True,
        max_length=128,
    )
    # Forward exactly these three fields to the model, by keyword.
    model_inputs = {
        field: encoded[field]
        for field in ("input_ids", "attention_mask", "token_type_ids")
    }
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**model_inputs)
    first_position = outputs.last_hidden_state[:, 0, :]
    return first_position.detach().cpu().numpy()

2. query 문장의 [CLS] token hidden 확인하기

query = '아 여행가고 싶다~'
query_cls_hidden = get_cls_token(query)
print(query_cls_hidden)
print(query_cls_hidden.shape)

챗봇 데이터 셋 질문의 [CLS] token hidden 확인하기

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Encode every stored question and stack the results. Each get_cls_token
# call is expected to return a (1, hidden) array (single sentence per call -
# TODO confirm), so axis 1 is squeezed away to leave one row per question.
dataset_cls_hidden = np.array(
    [get_cls_token(question) for question in chatbot_Question]
).squeeze(axis=1)
print(dataset_cls_hidden)   # position-0 ([CLS]) vectors for the dataset questions
print(dataset_cls_hidden.shape)

3. 코사인 유사도 구하고, 챗봇 데이터 셋 중 가장 유사도가 높은 질문 선택 및 답변

cos_sim = cosine_similarity(query_cls_hidden, dataset_cls_hidden)   # 데이터셋의 0번째 질문과 가장 유사하군요!
print(cos_sim) # [[0.85016316 0.7788855  0.73615134 0.7798742  0.72420174]]

top_question = np.argmax(cos_sim)

print('나의 질문: ', query)
print('저장된 답변: ', chatbot_Answer[top_question])
'''
나의 질문:  아 여행가고 싶다~
저장된 답변:  꿈꾸던 여행이네요.
'''

4. BERT pre-training

- 데이터 셋

1. 생성한 tokenizer을 불러온다

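- 맨 위 wp_tokenizer 참고 (이 시점의 tokenizer 변수는 bert-base-multilingual-cased 를 가리키므로, 학습에 쓰기 전에 wordPieceTokenizer 디렉터리에 저장한 vocab으로 tokenizer를 다시 불러와야 한다)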

2. config를 통해 BERT 조절

from transformers import BertConfig, BertForPreTraining

config = BertConfig(    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig
    vocab_size=20000,
    # hidden_size=512,
    # num_hidden_layers=12,    # layer num
    # num_attention_heads=8,    # transformer attention head number
    # intermediate_size=3072,   # transformer 내에 있는 feed-forward network의 dimension size
    # hidden_act="gelu",
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    max_position_embeddings=128,    # embedding size 최대 몇 token까지 input으로 사용할 것인지 지정
    # type_vocab_size=2,    # token type ids의 범위 (BERT는 segmentA, segmentB로 2종류)
    # pad_token_id=0,
    # position_embedding_type="absolute"
)

model = BertForPreTraining(config=config)
model.num_parameters() #101720098

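3. corpus를 dataset으로 구성하기 (아래 클래스를 실행하려면 os, time, pickle, random, torch, typing.List, torch.utils.data.Dataset, filelock.FileLock, transformers.PreTrainedTokenizer 를 import 하고 logger 를 먼저 정의해야 한다)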

class TextDatasetForNextSentencePrediction(Dataset):
    """
    Dataset of sentence-pair examples for next-sentence-prediction training.

    Reads a text corpus, groups its lines into documents separated by blank
    lines, and turns every document into examples holding ``input_ids``,
    ``token_type_ids`` and ``next_sentence_label`` tensors. The examples are
    pickled next to the corpus file and reloaded on later runs unless
    ``overwrite_cache`` is set.

    This will be superseded by a framework-agnostic approach soon.

    NOTE(review): relies on ``Dataset``, ``PreTrainedTokenizer``, ``FileLock``,
    ``logger``, ``List``, ``os``, ``time``, ``pickle``, ``random`` and
    ``torch`` being available at module level; not all of these imports are
    visible here - confirm before running.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):
        """Load cached examples or build them from ``file_path``.

        Args:
            tokenizer: used to tokenize lines, convert tokens to ids and add
                special tokens.
            file_path: corpus file; must exist.
            block_size: requested sequence length before special tokens are
                accounted for.
            overwrite_cache: rebuild the examples even if a cache file exists.
            short_seq_probability: chance that a document uses a randomly
                shortened target length.
            nsp_probability: chance that segment B is taken from a random
                document instead of being the actual continuation.

        Raises:
            AssertionError: if ``file_path`` is not an existing file.
        """
        # This first part sets up caching of the training data.
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        # Reserve room for the special tokens added to a sentence pair.
        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
        self.short_seq_probability = short_seq_probability
        self.nsp_probability = nsp_probability

        # The cache file lives beside the corpus; its name encodes the
        # tokenizer class, the requested block size and the corpus file name.
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_nsp_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        self.tokenizer = tokenizer

        lock_path = cached_features_file + ".lock"

        # Hold the lock while the cache is read or written.
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                # NOTE: unpickling can execute arbitrary code; only load cache
                # files that this class wrote itself.
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            # No usable cache: build the examples from the raw corpus.
            else:
                logger.info(f"Creating features from dataset file at {directory}")
                # Dataset construction proper starts here.
                self.documents = [[]] # one inner list of tokenized lines per document
                with open(file_path, encoding="utf-8") as f:
                    while True: # read the corpus one line at a time
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip() # drop surrounding whitespace

                        # A blank line closes the current document and starts a
                        # new one, so the data is stored paragraph by paragraph.
                        if not line and len(self.documents[-1]) != 0:
                            self.documents.append([])
                        tokens = tokenizer.tokenize(line)
                        tokens = tokenizer.convert_tokens_to_ids(tokens)
                        if tokens:
                            self.documents[-1].append(tokens)
                # The whole corpus has been read into self.documents.
                logger.info(f"Creating examples from {len(self.documents)} documents.")
                self.examples = []
                # Convert every document into training examples.
                for doc_index, document in enumerate(self.documents):
                    self.create_examples_from_document(document, doc_index) # appends to self.examples

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
        """Create examples for a single document and append them to ``self.examples``.

        Args:
            document: the document's lines, each a list of token ids.
            doc_index: index of ``document`` in ``self.documents``; used to
                avoid picking the same document as the random segment B.
        """
        # Leave room for the special tokens added to a sentence pair.
        # NOTE(review): self.block_size was already reduced by this same
        # amount in __init__, so the count is subtracted twice here - confirm
        # whether that is intended.
        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

        # We *usually* want to fill up the entire sequence since we are padding
        # to `block_size` anyways, so short sequences are generally wasted
        # computation. However, we *sometimes*
        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
        # sequences to minimize the mismatch between pretraining and fine-tuning.
        # The `target_seq_length` is just a rough target however, whereas
        # `block_size` is a hard limit.

        # Normally the target is the full token budget. With probability
        # short_seq_probability a random target between 2 and max_num_tokens
        # is used instead, so the model also sees shorter inputs.
        target_seq_length = max_num_tokens
        if random.random() < self.short_seq_probability:
            target_seq_length = random.randint(2, max_num_tokens)

        current_chunk = []  # a buffer stored current working segments
        current_length = 0
        i = 0

        # Examples are built one document at a time. A pair is not always
        # sentence_1[SEP]sentence_2: to fill the budget, each side may hold
        # several sentences, e.g. sentence_1+sentence_2[SEP]sentence_3+sentence_4.
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    # With two or more sentences in the chunk, pick the A/B
                    # split point at random.
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)
                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])
                    # Now build segment B, the part after the first [SEP].
                    tokens_b = []
                    # With probability nsp_probability (or always, when the
                    # chunk holds a single sentence) B comes from a random
                    # document; otherwise B is the actual continuation.
                    if len(current_chunk) == 1 or random.random() < self.nsp_probability:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # This should rarely go for more than one iteration for large
                        # corpora. However, just to be careful, we try to make sure that
                        # the random document is not the same as the document
                        # we're processing.
                        for _ in range(10):
                            random_document_index = random.randint(0, len(self.documents) - 1)
                            if random_document_index != doc_index:
                                break
                        # Start at a random sentence of the chosen document and
                        # take sentences until the target length is reached.
                        random_document = self.documents[random_document_index]
                        random_start = random.randint(0, len(random_document) - 1)
                        for j in range(random_start, len(random_document)):
                            tokens_b.extend(random_document[j])
                            if len(tokens_b) >= target_b_length:
                                break
                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    # If the pair is longer than max_num_tokens it is
                    # truncated: one token at a time is removed from whichever
                    # sequence is currently longer, until the pair fits.
                    def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
                        """Truncate a pair of sequences in place to a maximum total length."""
                        while True:
                            total_length = len(tokens_a) + len(tokens_b)
                            if total_length <= max_num_tokens:
                                break
                            trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
                            assert len(trunc_tokens) >= 1
                            # We want to sometimes truncate from the front and sometimes from the
                            # back to add more randomness and avoid biases.
                            if random.random() < 0.5:
                                del trunc_tokens[0]
                            else:
                                trunc_tokens.pop()

                    truncate_seq_pair(tokens_a, tokens_b, max_num_tokens) # shortens both lists in place

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    # add special tokens
                    input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
                    # add token type ids, 0 for sentence a, 1 for sentence b
                    token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)

                    # The example now holds the joined pair, its segment ids and
                    # the NSP label: 1 when segment B was taken from a random
                    # document, 0 when it is the actual continuation.
                    # No masking is applied here.
                    example = {
                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                        "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
                    }

                    self.examples.append(example)

                current_chunk = []
                current_length = 0

            i += 1

    def __len__(self):
        """Return the number of built examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict stored at index ``i``."""
        return self.examples[i]

DataCollatorForLanguageModeling

# DataCollatorForLanguageModeling is used below but was never imported.
from transformers import DataCollatorForLanguageModeling

# Build the next-sentence-prediction dataset from the corpus file.
# NOTE(review): `tokenizer` should be the tokenizer trained on this corpus,
# and its vocabulary must fit within config.vocab_size of the model being
# pre-trained - confirm which tokenizer is bound at this point.
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path='/content/my_data/wiki_20190620_small.txt',
    block_size=128,  # requested sequence length, before special tokens are subtracted
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)

# The collator applies the [MASK] tokens, so masking is not implemented by hand.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

이제 만들어진 학습데이터를 확인해보자

for example in dataset.examples[0:1]:
    print(example)
# In the sample output below, id 2 is the [CLS] token and id 3 is [SEP].
# token_type_ids switches from 0 to 1 where the second segment begins.
# next_sentence_label == 1 means segment B was picked at random, i.e. it is NOT the real next sentence; 0 means it is the actual continuation.
'''
{'input_ids': tensor([    2,  9438,  2494,  2429,  2780,  1969,  5380,  3120,  1941,  2408,
           16,  5498, 10310, 16251,   551,  1078,   820,  1238,  1146,   931,
        16498, 12286,  1088,  3668,    16,  6531,  8935,  1034,  2678,  1907,
           16,   174,   985,  4022,  1018,  8598,   726,  1217,    93,  7744,
           93, 10414,  1014,   591, 14329,  1886, 18889,    16,  6438,  1969,
         4022,   277,  3364,   656,  1266,  2106,  1934, 17666,    93,   437,
         1035,  2138,     1,  2024,  4087, 17984,    16,  2063,   494,  2737,
            5,     3,   665,  1990,   610,   575,  1888,    16,  4884,  1895,
           63,  3271,  1042,  1895,    63,  3326,  8483, 15168,  2069,    14,
         4649, 13901,  1904, 16394,  8558,  2190,  1057, 10586,    16,  5465,
         1895,    63,  1889,  1176,  4649,  1944,    14,  2310, 16454, 10023,
          704,  2422, 15412,  4248, 11114,  1101, 15317,  1944,  1895,    63,
         8552,  6110,  3098,    16,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]), 'next_sentence_label': tensor(1)}
'''

[MASK]를 부착하는 data collator의 역할을 살펴보자

print(data_collator(dataset.examples))

tokenizer.decode(data_collator(dataset.examples)['input_ids'][0].tolist())
# [CLS] 주니어는 민주당 출신 미국 [MASK]번째 대통령 이다. 지미 카터는 [MASK] 섬 [MASK] ...

- Trainer을 사용해 학습

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=10,
    # per_gpu_train_batch_size is deprecated in transformers;
    # per_device_train_batch_size is its documented replacement.
    per_device_train_batch_size=32,
    save_steps=1000,  # write a checkpoint every 1000 steps
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
# Persist the final weights where the later from_pretrained('model_output') call looks for them.
trainer.save_model('./model_output')

- Pretrained BERT 사용하기

from transformers import BertForMaskedLM, pipeline

my_model = BertForMaskedLM.from_pretrained('model_output')
nlp_fill = pipeline('fill-mask', top_k=2, model=my_model, tokenizer=tokenizer) 
#어떤 모델, tokenizer 쓸 때 명시해줘야함  
# device= 0 # 0번 gpu를 쓴다 이런식으로 명시해줘야함

nlp_fill('이순신은 [MASK] 중기의 무신이다.')
'''
[{'score': 0.030770400539040565,
  'sequence': '[CLS] 이순신은, 중기의 무신이다. [SEP]',
  'token': 14,
  'token_str': ','},
 {'score': 0.03006444126367569,
  'sequence': '[CLS] 이순신은. 중기의 무신이다. [SEP]',
  'token': 16,
  'token_str': '.'}]
'''

GPT

BERT는 자연어 임베딩 모델이라면 GPT는 자연어 생성에 특화된 모델이다.

'AI TECH' 카테고리의 다른 글

Passage Retrieval (0)	2022.12.20
MRC (0)	2022.12.20
Docker (0)	2022.11.11
Linux, Shell (0)	2022.11.10
ML Ops (0)	2022.11.07

현재글언어모델 실습

But my fav is coding