If you have a PC with NVIDIA GPUs, I would appreciate it if you could test this and report how long it takes.
Just replace "fivetech_forums_20231222.sql" with any large text file you have. Thanks!
train.py
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
# --- Fine-tune GPT-2 on a local text corpus ---------------------------------
# Loads the pre-trained "gpt2" checkpoint, continues causal-LM training on the
# text in `file_path`, and saves the resulting model + tokenizer to
# ./fine-tuned-model.

# Load pre-trained GPT-2 model and tokenizer.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token; register one so the data collator can
# batch sequences of unequal length.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# The new token enlarges the vocabulary, so the embedding matrix must grow to
# match — without this, any lookup of the pad id raises an index error.
model.resize_token_embeddings(len(tokenizer))

# Path to the additional training data (plain text).
file_path = "fivetech_forums_20231222.sql"

# Build the dataset directly from the file. TextDataset tokenizes the file
# itself and slices it into fixed-length blocks, so there is no need to read
# or tokenize the text manually beforehand.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=128,  # tokens per training example; adjust for your dataset/GPU
)

# Collator for causal language modeling. GPT-2 is autoregressive, so
# masked-language-modeling must stay off: labels are the inputs shifted
# by one position, not randomly masked tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # must be False for a causal LM such as GPT-2
)

# Training configuration.
training_args = TrainingArguments(
    output_dir="./fine-tuned-model",
    overwrite_output_dir=True,
    num_train_epochs=3,  # adjust the number of epochs based on your dataset
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    logging_dir="./logs",
)

# Initialize the Trainer with the model, data, and configuration above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model.
trainer.train()

# Save the fine-tuned model and its (extended) tokenizer together so they can
# be reloaded later with from_pretrained("./fine-tuned-model").
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")