quickly train gpt-j on some data
#artificial_intelligence #large_language_models #technology
ran on RunPod, which offered the "cheapest" RTX A6000 GPU according to https://cloud-gpus.com/ (at the time of creating the project that site listed RunPod at 0.489 USD/hr; right now it's 0.7 USD/hr, so maybe a scraper should keep that figure up to date)
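If someone does want that scraper, something like the sketch below might be a starting point. It assumes cloud-gpus.com serves its pricing table as static HTML (it may be rendered client-side, in which case this finds nothing), and the regex around the "A6000" text is a guess, not anything the site documents:

```python
# Hypothetical price check for RTX A6000 listings on cloud-gpus.com.
# Assumes prices appear as "$X.XX" near the GPU name in the raw HTML.
import re
import requests

html = requests.get("https://cloud-gpus.com/", timeout=10).text
# grab dollar figures within a few hundred characters of "A6000"
prices = re.findall(r"A6000.{0,300}?\$(\d+\.\d+)", html, flags=re.S)
if prices:
    print(f"cheapest listed A6000: ${min(map(float, prices)):.3f}/hr")
else:
    print("no A6000 price found; the page is probably rendered client-side")
```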
apply a diff that:
- fixes an error while starting up training (the `.clone()` thing, idk why lol)
- uses the hivemind model from huggingface instead of having to download it manually
- you must change the `load_dataset()` call to point at your own files; we generated ours with `csplit -s data.txt 56974`, which spits out `xx00` and `xx01` (a Python equivalent is sketched below this list)
- you must change the `/root/workspace` line, as that's where the model will be saved; at least it calls `os.makedirs()`, lol
- i wish this project could create snapshots of the model during training so it could resume if the machine fails. 5 hours 30 minutes is quite a lot of time (a rough sketch of that is at the end, after the diff)
- fixes the `bitsandbytes` dependency to use the only version my Python version was able to find on PyPI.
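For reference, a minimal Python stand-in for that `csplit` call (same assumption as above: `data.txt` is the raw training text and 56974 is the line where the test split starts; `csplit` puts everything before that line in `xx00` and the rest in `xx01`):

```python
# Rough Python equivalent of `csplit -s data.txt 56974`:
# xx00 gets lines 1..56973 (train), xx01 gets line 56974 onward (test).
split_at = 56974

with open("data.txt", encoding="utf-8") as src:
    lines = src.readlines()

with open("xx00", "w", encoding="utf-8") as train_file:
    train_file.writelines(lines[:split_at - 1])

with open("xx01", "w", encoding="utf-8") as test_file:
    test_file.writelines(lines[split_at - 1:])
```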
```diff
diff --git a/gpt-j-6b-8-bit.py b/gpt-j-6b-8-bit.py
index 3e1394f..c735486 100644
--- a/gpt-j-6b-8-bit.py
+++ b/gpt-j-6b-8-bit.py
@@ -65,7 +65,7 @@ class DequantizeAndLinear(torch.autograd.Function):
         weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
         ctx.save_for_backward(input, weights_quantized, absmax, code)
         ctx._has_bias = bias is not None
-        return F.linear(input, weights_deq, bias)
+        return F.linear(input, weights_deq, bias).clone()
 
     @staticmethod
     @custom_bwd
@@ -184,7 +184,8 @@ tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
 #logger.info("Saved model to {}".format(save_dir))
 
 # ---------------------> Loading saved gpt-j-6B-8bit model <------------------- #
-gpt = GPTJForCausalLM.from_pretrained("./saved_models_gpt-j-6B-8bit/gpt-j-6B",low_cpu_mem_usage=True)
+gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit",low_cpu_mem_usage=True)
+
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 gpt.to(device)
@@ -223,7 +224,7 @@ gpt.gradient_checkpointing_enable()
 #dataset = load_dataset("transformersbook/codeparrot-train", streaming=True)
 
 # custom dataset
-dataset = load_dataset('text', data_files={'train': ['article-1.txt', 'article-2.txt'], 'test': ['article-3.txt', 'article-4.txt']})
+dataset = load_dataset('text', data_files={'train': ['xx00'], 'test': ['xx01']})
 
 optimizer = Adam8bit(gpt.parameters(), lr=1e-5)
 
@@ -233,6 +234,7 @@ start = time.time()
 # Training loop
 with torch.cuda.amp.autocast():
     for row in tqdm(dataset["train"]):
+        print(row)
         if len(row["text"]) <= 1:
             continue
         batch = tokenizer(row["text"], truncation=True, max_length=128, return_tensors='pt')
@@ -250,9 +252,9 @@ logger.info("Finished fine-tuning in {}".format(time.time() - start))
 
 # --------------> Saving fine-tuned model <-----------------#
 try:
-    save_dir = "/home/paperspace/project/finetuned_gpt-j-8_bit/gpt-j-6B"
+    save_dir = "/root/workspace/project/finetuned_gpt-j-8_bit/gpt-j-6B"
     os.makedirs(save_dir)
     gpt.save_pretrained(save_dir)
 except Exception as e:
     #print("Error saving model: ", e)
-    logger.info("Error saving model: {}".format(e))
\ No newline at end of file
+    logger.info("Error saving model: {}".format(e))
diff --git a/requirements.txt b/requirements.txt
index 7fdc5b8..1c847e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 transformers==4.14.1
-bitsandbytes-cuda111==0.26.0
+bitsandbytes-cuda111==0.26.0.post2
 datasets==1.16.1
 loguru
 fastapi
 uvicorn
 torch
 flask
```
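About the snapshot wish from the list above: a minimal sketch of what periodic checkpointing inside the script's training loop could look like. `gpt`, `optimizer`, `dataset` and `tqdm` are the script's own names; `SNAPSHOT_EVERY`, `snapshot_dir` and the `state.pt` layout are made up here and untested:

```python
import os
import torch

SNAPSHOT_EVERY = 5000                       # rows between snapshots (arbitrary)
snapshot_dir = "/root/workspace/snapshots"  # hypothetical location on the pod's volume

with torch.cuda.amp.autocast():
    for i, row in enumerate(tqdm(dataset["train"])):
        if len(row["text"]) <= 1:
            continue
        # ... forward / backward / optimizer.step() exactly as in the script ...

        if i > 0 and i % SNAPSHOT_EVERY == 0:
            path = os.path.join(snapshot_dir, f"step-{i}")
            os.makedirs(path, exist_ok=True)
            gpt.save_pretrained(path)                         # model weights
            torch.save({"step": i, "optimizer": optimizer.state_dict()},
                       os.path.join(path, "state.pt"))        # state needed to resume
```

Resuming would then presumably mean loading the latest `step-*` directory the same way the script loads the hivemind checkpoint, restoring the optimizer state, and skipping the first `step` rows of the dataset.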