Detecting Adverse Drug Effects with Deep Lake + TinyLlama

Why Deep Lake Eats CSV Files for Breakfast

Pharmaceutical datasets present unique challenges: unstructured clinical notes, social media chatter, and scientific literature create a data-management nightmare. In this chapter, Deep Lake is used to process the ADE Corpus V2 (1,860 documents mixing medical jargon and patient-speak) into a query-ready format through three critical operations:

  1. Reading the corpus from Hugging Face
import pandas as pd

splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df = pd.read_json("hf://datasets/SetFit/ade_corpus_v2_classification/" + splits["train"], lines=True)
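
A quick class-balance check (a small sanity step added here, not in the original listing) shows how many "Related" versus "Not-Related" examples the training split contains:

# Count examples per class in the loaded training split
print(df["label_text"].value_counts())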
  2. Turning the corpus into a Deep Lake dataset
import deeplake
from google.colab import userdata  # Colab secrets store holding the Activeloop token

# Label 0 -> "Not-Related", label 1 -> "Related"
class_names = ["Not-Related", "Related"]

# Column schema for the Deep Lake dataset
schema = {
    "text": "text",
    "label": "int32",
    "label_text": "text"
}

# path_to_deeplake_db is the target Deep Lake dataset path in your Activeloop org
ds = deeplake.create(
    path_to_deeplake_db,
    schema=schema,
    token=userdata.get('ACTIVELOOP_TOKEN')
)

# Bulk-append the whole training split as column-wise lists
ds.append({
    'text': df['text'].tolist(),
    'label': df['label'].tolist(),
    'label_text': df['label_text'].tolist()
})
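
A short read-back (again a sketch rather than part of the original listing) confirms the rows landed in Deep Lake, using the same row-access pattern the inference section relies on later:

# Spot-check the first few ingested rows
for row in ds[:3]:
    print(row["label_text"], "->", row["text"])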
  3. Training a TinyLlama LoRA adapter. With Low-Rank Adaptation (LoRA), only a small subset of parameters is modified; the tokenization details are shown in the sketch below, followed by the training code.
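
The tokenization referenced above is not reproduced in the chapter listing, so the following is a minimal sketch of how tokenizer and small_dataset (both consumed by the Trainer further down) could be built. The checkpoint name and the 1,000-example cap are hypothetical choices here; the prompt format mirrors the "Input: ... Label: ..." template the inference function uses later.

from datasets import Dataset
from transformers import AutoTokenizer

# Hypothetical checkpoint choice; any TinyLlama causal-LM checkpoint works
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Llama-family tokenizers ship without a pad token

def to_prompt(example):
    # Render each record in the same "Input: ... Label: ..." format used at inference time
    return {"prompt": f"Input: {example['text']}\nLabel: {example['label_text']}{tokenizer.eos_token}"}

def tokenize(example):
    return tokenizer(example["prompt"], truncation=True, max_length=256)

# A small, prompt-formatted, tokenized slice of the ADE training data
small_dataset = (
    Dataset.from_pandas(df[["text", "label_text"]].head(1000).reset_index(drop=True))
    .map(to_prompt)
    .map(tokenize, remove_columns=["text", "label_text", "prompt"])
)
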
import torch
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Load the TinyLlama base model quantized to 4-bit so it fits in modest GPU memory
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Attach low-rank adapters to the attention projection layers only
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(base_model, lora_config)
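
# Not in the original listing: PEFT can report how small the trainable subset
# actually is, which makes the "only a small subset of parameters" point concrete
model.print_trainable_parameters()
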
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=25,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_dataset,
    tokenizer=tokenizer,
    # mlm=False -> standard causal-LM (next-token prediction) objective
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
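
Training checkpoints land in ./tinyllama-finetuned via save_strategy="epoch"; if you also want the adapter by itself, a minimal save step (the directory name below is a hypothetical choice, not from the original) looks like this:

# Persist only the lightweight LoRA adapter weights plus the tokenizer
model.save_pretrained("./tinyllama-ade-lora")
tokenizer.save_pretrained("./tinyllama-ade-lora")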

Once the LoRA adapter has finished training, it is time to run inference and write the results back to Deep Lake (for documentation purposes):

# Inference function
def classify_text(input_text):
    prompt = f"Input: {input_text}\nLabel:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=4,
            do_sample=False,  # deterministic output
            eos_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the label part
    return decoded.split("Label:")[-1].strip()

inputs_set = ["Acute pain present in middle-aged man taking albuterol.",
              "In this report, we will discuss orthopedic surgery.",
              "Young college student taking acetaminophen presents with severe rashes on left leg.",
              "Four children present in school and taking math play basketball at recess.",
              "Four children taking ibuprofen present with acute pain in shins and thighs."]
for test_input in inputs_set:
    predicted_label = classify_text(test_input)
    print(f"Input: {test_input}\nPredicted label: {predicted_label}\n")
ds.add_column("pred_label_text", "text")
ds.add_column("pred_label_int", "int32")

pred_texts, pred_ints = [], []

for row in ds[:10]:
    text = row["text"]
    pt = classify_text(text)
    pred_texts.append(pt)
    pred_ints.append(1 if pt.startswith("Related") else 0)

ds["pred_label_text"][0:10] = pred_texts
ds["pred_label_int"][0:10] = pred_ints

ds.commit("Annotated first 10 samples with model predictions")
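
As a closing sanity check (a sketch, not part of the original chapter code), the freshly committed predictions can be compared against the gold labels for the same ten rows:

# Agreement between the stored gold labels and the model's predictions
matches = 0
for row in ds[:10]:
    matches += int(row["label"] == row["pred_label_int"])
print(f"Model matches the gold label on {matches}/10 annotated samples")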