quickly train gpt-j on some data

#artificial_intelligence #large_language_models #technology

based on https://betterprogramming.pub/fine-tuning-gpt-j-6b-on-google-colab-or-equivalent-desktop-or-server-gpu-b6dc849cb205

ran on RunPod, which offered the "cheapest" RTX A6000 GPU according to https://cloud-gpus.com/ (at the time of creating the project that site listed RunPod at 0.489 USD/hr; right now it's 0.7 USD/hr, so maybe there should be a scraper to track those price changes over time)
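
a minimal sketch of such a price tracker, assuming cloud-gpus.com serves its pricing table as static HTML (the table layout is a guess, not checked against the real page):

```python
# hypothetical price tracker for https://cloud-gpus.com/ -- the page structure
# (a plain <table> with provider / GPU / price cells) is an assumption
import csv
import datetime

import requests
from bs4 import BeautifulSoup

URL = "https://cloud-gpus.com/"

def fetch_a6000_rows():
    html = requests.get(URL, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    rows = []
    for tr in soup.select("table tr"):
        cells = [td.get_text(strip=True) for td in tr.select("td")]
        # keep only rows that mention the A6000
        if any("A6000" in c for c in cells):
            rows.append(cells)
    return rows

if __name__ == "__main__":
    # append a timestamped snapshot so price changes (0.489 -> 0.7 USD/hr etc.) stay visible
    with open("gpu_prices.csv", "a", newline="") as f:
        writer = csv.writer(f)
        for row in fetch_a6000_rows():
            writer.writerow([datetime.datetime.utcnow().isoformat()] + row)
```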

apply this diff:

diff --git a/gpt-j-6b-8-bit.py b/gpt-j-6b-8-bit.py
index 3e1394f..c735486 100644
--- a/gpt-j-6b-8-bit.py
+++ b/gpt-j-6b-8-bit.py
@@ -65,7 +65,7 @@ class DequantizeAndLinear(torch.autograd.Function):
         weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
         ctx.save_for_backward(input, weights_quantized, absmax, code)
         ctx._has_bias = bias is not None
-        return F.linear(input, weights_deq, bias)
+        return F.linear(input, weights_deq, bias).clone()
 
     @staticmethod
     @custom_bwd
@@ -184,7 +184,8 @@ tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
 #logger.info("Saved model to {}".format(save_dir))
 
 # ---------------------> Loading saved gpt-j-6B-8bit model <------------------- #
-gpt = GPTJForCausalLM.from_pretrained("./saved_models_gpt-j-6B-8bit/gpt-j-6B",low_cpu_mem_usage=True)
+gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit",low_cpu_mem_usage=True)
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 gpt.to(device)
@@ -223,7 +224,7 @@ gpt.gradient_checkpointing_enable()
 #dataset = load_dataset("transformersbook/codeparrot-train", streaming=True)
 
 # custom dataset
-dataset = load_dataset('text', data_files={'train': ['article-1.txt', 'article-2.txt'], 'test': ['article-3.txt', 'article-4.txt']})
+dataset = load_dataset('text', data_files={'train': ['xx00'], 'test': ['xx01']})
 
 optimizer = Adam8bit(gpt.parameters(), lr=1e-5)
 
@@ -233,6 +234,7 @@ start = time.time()
 # Training loop
 with torch.cuda.amp.autocast():
     for row in tqdm(dataset["train"]):
+        print(row)
         if len(row["text"]) <= 1:
             continue
         batch = tokenizer(row["text"], truncation=True, max_length=128, return_tensors='pt')
@@ -250,9 +252,9 @@ logger.info("Finished fine-tuning in {}".format(time.time() - start))
 
 # --------------> Saving fine-tuned model <-----------------#
 try:
-    save_dir = "/home/paperspace/project/finetuned_gpt-j-8_bit/gpt-j-6B"
+    save_dir = "/root/workspace/project/finetuned_gpt-j-8_bit/gpt-j-6B"
     os.makedirs(save_dir)
     gpt.save_pretrained(save_dir)
 except Exception as e:
     #print("Error saving model: ", e)
-    logger.info("Error saving model: {}".format(e))
\ No newline at end of file
+    logger.info("Error saving model: {}".format(e))
diff --git a/requirements.txt b/requirements.txt
index 7fdc5b8..1c847e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 transformers==4.14.1
-bitsandbytes-cuda111==0.26.0
+bitsandbytes-cuda111==0.26.0.post2
 datasets==1.16.1
 loguru
 fastapi
 uvicorn
 torch
 flask
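
the custom dataset line in the diff points at files named xx00 and xx01, which look like chunks of one bigger text file. a minimal sketch of producing them in Python (the source filename articles.txt and the 90/10 split are assumptions, not part of the original project):

```python
# hypothetical prep step: split one big text corpus into the xx00 / xx01 files
# referenced by load_dataset(); "articles.txt" and the 90/10 split are assumptions
from pathlib import Path

lines = Path("articles.txt").read_text(encoding="utf-8").splitlines(keepends=True)
cut = int(len(lines) * 0.9)  # first 90% of lines -> train, rest -> test

Path("xx00").write_text("".join(lines[:cut]), encoding="utf-8")  # train file
Path("xx01").write_text("".join(lines[cut:]), encoding="utf-8")  # test file
```

the `text` loader in datasets yields one example per line, which is why the training loop skips rows where `len(row["text"]) <= 1`.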
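
after the save step at the end of the diff, a quick sanity check is to generate a few tokens with the fine-tuned model. a minimal sketch that could be appended to the end of gpt-j-6b-8-bit.py (it reuses the `gpt`, `tokenizer`, and `device` objects already defined in that script; the prompt is arbitrary):

```python
# hypothetical smoke test appended after fine-tuning in gpt-j-6b-8-bit.py;
# `gpt`, `tokenizer`, and `device` are the objects already defined in that script
import torch

prompt = "The quick brown fox"  # arbitrary prompt, not from the original project
batch = tokenizer(prompt, return_tensors="pt").to(device)

gpt.eval()
with torch.no_grad():
    out = gpt.generate(**batch, max_length=100, do_sample=True, temperature=0.9)

print(tokenizer.decode(out[0], skip_special_tokens=True))
```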