Design a Complete Multimodal RLVR Pipeline with Open-MM-RL, Vision-Language Prompting, Reward Scoring, and GRPO Export

# Regexes tried in order by extract_final(); capture group 1 is the candidate
# answer. Most specific first: a LaTeX \boxed{...}, then "final answer:", then
# a bare "answer:" (":" or "=" accepted). The \boxed pattern only matches
# brace-free content, so \boxed{\frac{1}{2}} is NOT captured by it.
EXTRACT_PATS = [
    r"\\boxed\{([^{}]+)\}",
    r"final\s+answer\s*[:=]\s*([^\n]+)",
    r"answer\s*[:=]\s*([^\n]+)",
]
def extract_final(text):
    """Pull the final-answer string out of a model response.

    Each regex in ``EXTRACT_PATS`` is tried in order, case-insensitively. The
    first one that matches wins: its captured group is returned with
    surrounding whitespace removed and then any leading/trailing ``.``, ``,``
    or ``;`` stripped. When no pattern matches, the last non-blank line of the
    response is returned instead.

    Args:
        text: The response to scan. Falsy values (``None``, ``""``, ``0``)
            yield ``""``; any other value is coerced with ``str()``.

    Returns:
        The extracted answer, or ``""`` when there is nothing to extract.
    """
    if not text:
        return ""
    # Coerce once so the regex search and the last-line fallback operate on the
    # same string; re.search() raises TypeError when handed a non-str object.
    text = str(text)
    for pattern in EXTRACT_PATS:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return match.group(1).strip().strip(".,;")
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    return lines[-1] if lines else ""
def latex_to_sympy(s):
    r"""Rewrite a small subset of LaTeX math as a plain expression string.

    This is a purely textual, best-effort conversion; the result is meant to be
    handed to a parser such as ``float()`` or ``sympify`` and is not guaranteed
    to be parsable. Steps, in order:

    1. Strip whitespace, surrounding ``$`` and a leading ``\[`` / trailing
       ``\]`` delimiter.
    2. ``\cdot`` and ``\times`` become ``*``; the spacing commands ``\,``,
       ``\;`` and ``\!`` are dropped.
    3. ``\pi`` becomes ``pi``; when it directly follows a digit, ``)`` or ``}``
       (optionally separated by whitespace) an explicit ``*`` is inserted, so
       ``7396\pi`` yields ``7396*pi`` rather than the unparsable ``7396pi``.
    4. ``\frac{a}{b}`` (also ``\dfrac`` / ``\tfrac``) becomes ``((a)/(b))`` and
       ``\sqrt{a}`` becomes ``sqrt(a)``. The rewrite repeats until nothing
       changes so nested fractions/roots are converted inside-out.
    5. ``^`` becomes ``**``, every remaining ``\command`` is deleted, and
       leftover braces become parentheses.

    Args:
        s: LaTeX-ish text. Falsy values are treated as the empty string; other
            values are coerced with ``str()``.

    Returns:
        The rewritten expression string.
    """
    s = str(s or "").strip().strip("$").strip()
    # Remove one leading "\[" (or "\$") and one trailing "\]" (or "\$").
    s = re.sub(r"^\\[\[$]", "", s)
    s = re.sub(r"\\[\]$]$", "", s)
    s = (
        s.replace("\\cdot", "*")
        .replace("\\times", "*")
        .replace("\\,", "")
        .replace("\\;", "")
        .replace("\\!", "")
    )
    # Implicit multiplication: "2\pi", "2 \pi", "{...}\pi" -> "...*pi".
    s = re.sub(r"(?<=[0-9)}])\s*\\pi(?![a-zA-Z])", "*pi", s)
    s = s.replace("\\pi", "pi")
    # The patterns only match brace-free arguments, so a single pass converts
    # just the innermost \frac / \sqrt. Iterate to a fixed point; each
    # substitution removes one command, so the loop always terminates.
    previous = None
    while previous != s:
        previous = s
        s = re.sub(r"\\[dt]?frac\s*\{([^{}]+)\}\s*\{([^{}]+)\}", r"((\1)/(\2))", s)
        s = re.sub(r"\\sqrt\s*\{([^{}]+)\}", r"sqrt(\1)", s)
    s = s.replace("^", "**")
    # Anything still spelled as a LaTeX command (\left, \right, \text, ...) is dropped.
    s = re.sub(r"\\[a-zA-Z]+", "", s)
    s = s.replace("{", "(").replace("}", ")")
    return s
def grade(pred, gold, tol=1e-4):
    """Verifiable reward in [0,1]: exact > numeric > sympy-symbolic > partial.

    The prediction is first reduced to its final answer with
    ``extract_final``; the gold answer is used as given (stringified). The
    checks run in this order and the first that succeeds decides the reward:

    1. Normalized string equality (lower-cased, whitespace removed, leading/
       trailing ``$.,;[]()`` stripped) -> 1.0
    2. Numeric equality: both sides convert to ``float`` and
       ``|pred - gold| / max(1, |gold|) < tol`` -> 1.0
    3. Symbolic equality: ``simplify(pred - gold) == 0`` -> 1.0
    4. Normalized gold is a non-empty substring of normalized pred -> 0.5

    Args:
        pred: Raw model output (any object; stringified). ``None`` scores 0.0.
        gold: Reference answer (any object; stringified). ``None`` scores 0.0.
        tol: Tolerance for the numeric check; relative once ``|gold| > 1``,
            absolute otherwise.

    Returns:
        ``1.0``, ``0.5`` or ``0.0``.
    """
    if pred is None or gold is None:
        return 0.0
    p = extract_final(str(pred)).strip()
    g = str(gold).strip()

    def norm(x):
        # Case/whitespace-insensitive form with wrapping punctuation removed.
        return re.sub(r"\s+", "", x.lower()).strip("$.,;[]()")

    if norm(p) == norm(g):
        return 1.0

    # SECURITY NOTE(review): sympify() parses strings via eval(). `p` comes
    # from model output, i.e. untrusted text; run grading in a sandboxed
    # process (ideally with a timeout) if that is a concern.
    def to_float(x):
        # Plain float() first; fall back to evaluating a symbolic expression
        # (e.g. "((71)/(4))" or "2*pi"). None means "not a number".
        try:
            return float(latex_to_sympy(x))
        except Exception:
            try:
                return float(sp.sympify(latex_to_sympy(x)).evalf())
            except Exception:
                return None

    fp, fg = to_float(p), to_float(g)
    if fp is not None and fg is not None:
        if abs(fp - fg) / max(1.0, abs(fg)) < tol:
            return 1.0
    try:
        ep = sp.sympify(latex_to_sympy(p))
        eg = sp.sympify(latex_to_sympy(g))
        if sp.simplify(ep - eg) == 0:
            return 1.0
    except Exception:
        # Deliberate best-effort: free-text answers routinely fail to parse,
        # which just means the symbolic check does not apply.
        pass
    if norm(g) and norm(g) in norm(p):
        return 0.5
    return 0.0
# Smoke-test grade() on hand-written (prediction, gold, expected reward)
# triples. Nothing is asserted: each row prints the computed reward next to
# the expected one so mismatches can be spotted by eye.
print("\n=== Grader sanity checks ===")
for pred, gold, want in [
    ("The answer is \\boxed{120}", "[120]", 1.0),
    ("After computing: 7396 \\pi", "7396\\pi", 1.0),
    ("Final answer: -71/4", "-\\frac{71}{4}", 1.0),
    ("Therefore the result is 0.0074", "0.0074", 1.0),
    ("Final answer: nucleus accumbens", "Nucleus accumbens", 1.0),
    ("I don't know", "12", 0.0),
]:
    print(f" pred={pred[:38]!r:42s} gold={gold!r:22s} -> r={grade(pred, gold)} (want {want})")
# System instruction placed at the top of every prompt by build_prompt(). It
# asks the model to finish with a single "Final answer: ..." line, which is the
# form the "final answer" pattern in EXTRACT_PATS captures.
SYSTEM = (
    "You are a STEM expert solving multimodal reasoning problems. "
    "You will see a question and one or more figures. "
    "Reason step by step, then end with exactly one line:\n"
    "Final answer: <your answer>"
)
def build_prompt(ex):
    """Assemble the text prompt for one example.

    Layout: the ``SYSTEM`` instruction, a blank line, one ``[Image k]``
    placeholder line per image (numbered from 1), a blank line, the question
    under a ``Question:`` header, and a closing step-by-step cue. Only the
    number of images is used here; the images themselves are not embedded.

    Args:
        ex: Mapping with an ``"images"`` entry (a sized collection, or ``None``
            for no images) and a ``"question"`` entry.

    Returns:
        The prompt string. With zero images the placeholder section is empty
        but the surrounding blank lines are kept.
    """
    images = ex["images"]
    # Treat a missing image list as "no images" instead of crashing in len().
    n_images = 0 if images is None else len(images)
    img_tags = "\n".join(f"[Image {k}]" for k in range(1, n_images + 1))
    return f"{SYSTEM}\n\n{img_tags}\n\nQuestion:\n{ex['question']}\n\nLet's think step by step."
# Preview: print the first 600 characters of the prompt built for ds[0].
print("\n=== Example prompt (truncated) ===")
print(build_prompt(ds[0])[:600], "…\n")

Source link