# A coding implementation on Microsoft's Phi-4-mini for quantized inference,
# reasoning, tool use, RAG, and LoRA fine-tuning.


import subprocess, sys, os, shutil, glob


def pip_install(args):
    """Quietly install the given pip requirement strings.

    Runs ``pip install -q`` in the current interpreter and raises
    ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    cmd = [sys.executable, "-m", "pip", "install", "-q", *args]
    subprocess.run(cmd, check=True)


# Install the hub client first, pinned below 1.0 — presumably to stay ahead of
# the huggingface_hub 1.x API break; confirm against the stack installed below.
pip_install(["huggingface_hub>=0.26,<1.0"])


# Core ML stack, version-pinned so the pieces resolve together:
# transformers (model/tokenizer APIs), accelerate (device placement),
# bitsandbytes (4-bit quantization), peft (LoRA), datasets (fine-tuning data),
# sentence-transformers + faiss-cpu (embeddings and vector index for RAG).
pip_install([
   "-U",
   "transformers>=4.49,<4.57",
   "accelerate>=0.33.0",
   "bitsandbytes>=0.43.0",
   "peft>=0.11.0",
   "datasets>=2.20.0,<3.0",
   "sentence-transformers>=3.0.0,<4.0",
   "faiss-cpu",
])


# Delete any cached "remote code" modules for Phi-4 checkpoints so only the
# freshly installed transformers code path is used (best-effort: errors are
# ignored if the cache directory does not exist).
cache_pattern = os.path.expanduser(
    "~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4*")
for stale_dir in glob.glob(cache_pattern):
    shutil.rmtree(stale_dir, ignore_errors=True)


# Evict already-imported copies of the ML libraries from sys.modules so the
# versions installed above take effect on the next import, rather than any
# pre-upgrade modules that may still be loaded in this session.
_stale = [name for name in sys.modules
          if name.startswith(("transformers", "huggingface_hub", "tokenizers",
                              "accelerate", "peft", "datasets",
                              "sentence_transformers"))]
for _name in _stale:
    del sys.modules[_name]


import json, re, textwrap, warnings, torch
# Suppress all warnings to keep the notebook output readable.
warnings.filterwarnings("ignore")


# Modeling/training APIs. Phi-4-mini uses the built-in phi3 architecture
# (see the load message below), so no trust_remote_code is needed.
from transformers import (
   AutoModelForCausalLM,
   AutoTokenizer,
   BitsAndBytesConfig,
   TextStreamer,
   TrainingArguments,
   Trainer,
   DataCollatorForLanguageModeling,
)
import transformers
# Report the resolved version so runs are reproducible/debuggable.
print(f"Using transformers {transformers.__version__}")


# Hugging Face model ID for the instruct-tuned Phi-4-mini checkpoint.
PHI_MODEL_ID = "microsoft/Phi-4-mini-instruct"


# Fail fast with a clear message when no CUDA device is present — the 4-bit
# bitsandbytes load below requires a GPU. An explicit `raise` is used instead
# of `assert` because asserts are stripped when Python runs with -O.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No GPU detected. In Colab: Runtime > Change runtime type > T4 GPU."
    )
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
print(f"Loading Phi model (native phi3 arch, no remote code): {PHI_MODEL_ID}\n")


# Quantization recipe: 4-bit NF4 weights with double quantization (quantizes
# the quantization constants themselves for extra memory savings), while
# computation runs in bfloat16.
bnb_cfg = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16,
   bnb_4bit_use_double_quant=True,
)


phi_tokenizer = AutoTokenizer.from_pretrained(PHI_MODEL_ID)
# Some checkpoints ship without a pad token; fall back to EOS so generation
# (which passes pad_token_id below) has a valid id to pad with.
if phi_tokenizer.pad_token_id is None:
   phi_tokenizer.pad_token = phi_tokenizer.eos_token


# Load the model in 4-bit per bnb_cfg; device_map="auto" lets accelerate place
# layers on the available GPU.
# NOTE(review): `torch_dtype` is deprecated in favor of `dtype` in newer
# transformers releases — still accepted under the <4.57 pin; confirm if pins change.
phi_model = AutoModelForCausalLM.from_pretrained(
   PHI_MODEL_ID,
   quantization_config=bnb_cfg,
   device_map="auto",
   torch_dtype=torch.bfloat16,
)
# Enable the KV cache for faster autoregressive decoding at inference time.
phi_model.config.use_cache = True


# Report load status: GPU memory in use, detected architecture, and an
# approximate parameter count (quantized modules may under-report numel).
mem_gb = torch.cuda.memory_allocated() / 1e9
param_count_b = sum(p.numel() for p in phi_model.parameters()) / 1e9
print(f"\n✓ Phi-4-mini loaded in 4-bit. GPU memory: {mem_gb:.2f} GB")
print(f"  Architecture: {phi_model.config.model_type}   "
      f"(using built-in {type(phi_model).__name__})")
print(f"  Parameters: ~{param_count_b:.2f}B")


def ask_phi(messages, *, tools=None, max_new_tokens=512,
            temperature=0.3, stream=False):
    """Single entry point for all Phi-4-mini inference calls below.

    Args:
        messages: chat messages in the ``[{"role": ..., "content": ...}]``
            format expected by the tokenizer's chat template.
        tools: optional tool/function schemas forwarded to the chat template.
        max_new_tokens: generation budget for the reply.
        temperature: values <= 0 select greedy decoding; positive values
            enable nucleus sampling with top_p=0.9.
        stream: when True, tokens are printed live as they are generated.

    Returns:
        The decoded completion text (prompt tokens stripped), whitespace-trimmed.
    """
    # return_dict=True also yields an attention_mask; passing it to generate()
    # removes the mask ambiguity that arises because pad_token == eos_token.
    inputs = phi_tokenizer.apply_chat_template(
        messages,
        tools=tools,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(phi_model.device)

    streamer = (TextStreamer(phi_tokenizer, skip_prompt=True,
                             skip_special_tokens=True)
                if stream else None)

    do_sample = temperature > 0
    # Only pass sampling knobs when sampling is on; supplying temperature/top_p
    # with do_sample=False triggers "unused generation flag" warnings.
    sampling_kwargs = (
        {"temperature": temperature, "top_p": 0.9} if do_sample else {}
    )

    with torch.inference_mode():
        out = phi_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=phi_tokenizer.pad_token_id,
            eos_token_id=phi_tokenizer.eos_token_id,
            streamer=streamer,
            **sampling_kwargs,
        )
    # Slice off the prompt so only newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    return phi_tokenizer.decode(
        out[0][prompt_len:], skip_special_tokens=True
    ).strip()


def banner(title):
    """Print *title* framed between two 78-character '=' rules."""
    rule = "=" * 78
    print(f"\n{rule}\n  {title}\n{rule}")