Spaces:
Sleeping
Sleeping
# paragraph_annotation_tool.py NEW ui
"""
Paragraph-level annotation tool with per-model comments
────────────────────────────────────────────────────────────────────────────
• Upload a CSV containing at least:
      Content_Paragraph,
      <model>_prompt1, <model>_prompt2 … for each model
• Enter your annotator name and click “Start / Resume”.
• Rate each prompt A / B / C, optionally leave a comment for every model,
  navigate Back / Next, download the annotated CSV.

New in this version
───────────────────
• One **comment textbox per model** (shown next to the model’s two ratings).
• Comments are stored in the CSV under columns named ``comment_<model>``.
• Blank comments are saved as the literal string ``"no comment"``.
"""
from __future__ import annotations

import os
import random
import shutil
import tempfile
import time
import uuid
from typing import List

import gradio as gr
import pandas as pd
# ─── CONFIG ───────────────────────────────────────────────────────────────
MAX_MODELS = 10                    # pre-allocate up to this many model slots
CONTENT_COL = "Content_Paragraph"  # column holding the paragraph text
PROMPT1_SUFFIX = "_prompt1"        # <model><PROMPT1_SUFFIX> output column
PROMPT2_SUFFIX = "_prompt2"        # <model><PROMPT2_SUFFIX> output column
COMMENT_PREFIX = "comment_"        # <COMMENT_PREFIX><model> comment column
PERM_COL = "perm_models"           # per-row display order, "|"-joined
RATING_OPTS = ["A", "B", "C"]

# ─── GLOBALS (filled after CSV load) ───────────────────────────────────────
df: pd.DataFrame | None = None         # working copy of the uploaded CSV
csv_path: str | None = None            # where the working copy is persisted
models: List[str] = []                 # model names discovered in the CSV
TOTAL = 0                              # number of rows in ``df``
annotator = ""                         # name entered on Start / Resume
current_start: float | None = None     # time.time() when a row was shown
# ─── CSV HELPERS ───────────────────────────────────────────────────────────
def load_csv(path: str):
    """Read the CSV at *path*, discover model columns, add helper columns.

    A model is any column ``<model>_prompt1`` (not starting with
    ``rating_``) that has a matching ``<model>_prompt2`` column.  Missing
    helper columns are created: ``perm_models``, ``rating_<model>__prompt1``
    / ``__prompt2``, ``comment_<model>`` (default ``"no comment"``),
    ``annotator`` and ``annotation_time``.

    The module globals ``df``, ``models``, ``TOTAL`` and ``csv_path`` are
    only updated once the file has passed every check, so a rejected upload
    leaves the previously loaded session untouched.

    Raises:
        ValueError: if the content column is missing, a ``_prompt1`` column
            has no ``_prompt2`` partner, no model columns exist, or more
            than ``MAX_MODELS`` models are present.
    """
    global df, models, TOTAL, csv_path

    # keep_default_na=False keeps empty cells as "" instead of NaN, which
    # is what the ``== ""`` completeness checks elsewhere rely on.
    frame = pd.read_csv(path, keep_default_na=False)

    if CONTENT_COL not in frame.columns:
        raise ValueError(f"Required column '{CONTENT_COL}' not found")

    found: List[str] = []
    for col in frame.columns:
        if col.endswith(PROMPT1_SUFFIX) and not col.startswith("rating_"):
            m = col[: -len(PROMPT1_SUFFIX)]
            if f"{m}{PROMPT2_SUFFIX}" not in frame.columns:
                raise ValueError(f"Found '{col}' but no '{m}{PROMPT2_SUFFIX}'")
            found.append(m)

    if not found:
        raise ValueError(f"No '*{PROMPT1_SUFFIX}' columns found")
    if len(found) > MAX_MODELS:
        raise ValueError(
            f"CSV has {len(found)} models but MAX_MODELS is {MAX_MODELS}")

    # helper columns
    if PERM_COL not in frame.columns:
        frame[PERM_COL] = ""
    for m in found:
        # rating columns per prompt
        for p in ("prompt1", "prompt2"):
            rc = f"rating_{m}__{p}"
            if rc not in frame.columns:
                frame[rc] = ""
        # comment column per model
        cc = f"{COMMENT_PREFIX}{m}"
        if cc not in frame.columns:
            frame[cc] = "no comment"  # default value
    for col in ("annotator", "annotation_time"):
        if col not in frame.columns:
            frame[col] = "" if col == "annotator" else 0.0

    # Commit only after validation succeeded.  ``models`` is updated in
    # place so existing references to the list stay valid.
    csv_path = path
    df = frame
    TOTAL = len(frame)
    models[:] = found
# ─── BOOK-KEEPING ──────────────────────────────────────────────────────────
def first_incomplete() -> int:
    """Return the index of the first row that still lacks a rating.

    A row is incomplete when any ``rating_<model>__prompt1`` or
    ``rating_<model>__prompt2`` cell equals the empty string.  Returns 0
    when every row is complete, and also when no CSV or no models are
    loaded yet.
    """
    if df is None or not models:
        return 0
    rating_cols = [
        f"rating_{m}__{p}" for m in models for p in ("prompt1", "prompt2")
    ]
    missing = df[rating_cols].eq("").any(axis=1)
    if not missing.any():
        return 0
    # idxmax on a boolean Series yields the label of the first True entry;
    # int() turns it into a plain Python int.
    return int(missing.idxmax())
def get_perm(idx: int) -> List[str]:
    """Return the display order of the models for example *idx*.

    The order is stored in the ``PERM_COL`` cell as a ``"|"``-joined string.
    When the cell is empty, or does not name exactly the currently loaded
    models, a fresh random order is generated, stored in the cell and the
    whole frame is written back to ``csv_path``.
    """
    cell = str(df.at[idx, PERM_COL])
    order = cell.split("|") if cell else []
    # A stale permutation would make callers look up columns of models
    # that are not in this CSV, so treat it like a missing one.
    if sorted(order) != sorted(models):
        order = models.copy()
        random.shuffle(order)
        df.at[idx, PERM_COL] = "|".join(order)
        df.to_csv(csv_path, index=False)
    return order
def build_row(idx: int):
    """Return the fixed-length tuple of widget values for example *idx*.

    Layout of the returned tuple::

        (idx, header, paragraph,
         MAX_MODELS*2 model outputs,
         MAX_MODELS*2 rating values,
         MAX_MODELS   comment values,
         back-button update, next-button update)

    Model slots follow the order given by ``get_perm(idx)``; unused slots
    are padded with ``""`` (outputs, comments) or ``None`` (ratings).
    The Next button is interactive only when every active rating is one of
    ``RATING_OPTS``.  As a side effect ``current_start`` is reset to the
    current time.
    """
    global current_start

    row = df.loc[idx]
    order = get_perm(idx)

    outs, rates, comms = [], [], []
    for m in order:
        outs.append(row[f"{m}{PROMPT1_SUFFIX}"])
        outs.append(row[f"{m}{PROMPT2_SUFFIX}"])
        for p in ("prompt1", "prompt2"):
            stored = row[f"rating_{m}__{p}"]
            # Anything that is not a valid choice (empty cell or a stale
            # value) is shown as "nothing selected".
            rates.append(stored if stored in RATING_OPTS else None)
        note = row[f"{COMMENT_PREFIX}{m}"]
        # "no comment" is the on-disk placeholder for a blank comment.
        comms.append("" if note == "no comment" else note)

    # pad up to MAX_MODELS
    outs += [""] * (MAX_MODELS * 2 - len(outs))
    rates += [None] * (MAX_MODELS * 2 - len(rates))
    comms += [""] * (MAX_MODELS - len(comms))

    ready = all(r in RATING_OPTS for r in rates[: 2 * len(models)])
    current_start = time.time()
    header = f"Example {idx+1}/{TOTAL}"

    return (
        idx, header, row[CONTENT_COL],
        *outs, *rates, *comms,
        gr.update(visible=True),                     # back_btn update
        gr.update(visible=True, interactive=ready),  # next_btn update
    )
def save_row(idx: int, ratings: List[str], comments: List[str]):
    """Persist ratings and comments for example *idx* to the CSV.

    *ratings* holds two values per model and *comments* one value per
    model, both in the display order returned by ``get_perm(idx)``.
    Nothing is written unless there are at least ``2 * len(models)``
    ratings and each of them is one of ``RATING_OPTS``.

    Blank comments are stored as ``"no comment"``.  The row's ``annotator``
    and ``annotation_time`` (seconds since ``current_start``) are updated
    and the whole frame is written to ``csv_path``.
    """
    needed = 2 * len(models)
    # The length check prevents an IndexError below when fewer ratings than
    # model slots were supplied.
    if len(ratings) < needed:
        return
    if not all(r in RATING_OPTS for r in ratings[:needed]):
        return

    elapsed = time.time() - current_start if current_start else 0.0
    order = get_perm(idx)

    for slot, m in enumerate(order):
        df.at[idx, f"rating_{m}__prompt1"] = ratings[2 * slot]
        df.at[idx, f"rating_{m}__prompt2"] = ratings[2 * slot + 1]

    # comments
    for m, c in zip(order, comments):
        clean = (c or "").strip()
        df.at[idx, f"{COMMENT_PREFIX}{m}"] = clean or "no comment"

    df.at[idx, "annotator"] = annotator
    df.at[idx, "annotation_time"] = float(elapsed)
    df.to_csv(csv_path, index=False)
| def _writable_dir() -> str: | |
| """Return /data on Spaces, /tmp elsewhere β whichever is writeable.""" | |
| for d in ("/data", "/tmp"): | |
| try: | |
| os.makedirs(d, exist_ok=True) | |
| with open(os.path.join(d, ".touch"), "w"): | |
| pass | |
| return d | |
| except PermissionError: | |
| continue | |
| raise PermissionError("No writable directory found.") | |
# ─── GRADIO UI ────────────────────────────────────────────────────────────
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation; confirm the layout (in particular which
# widgets live inside ``annotation_area``) against the running app.
with gr.Blocks(title="Paragraph Annotation Tool") as demo:
    # shared state
    idx_state = gr.State(0)        # current example index
    nmodels_state = gr.State(0)    # how many model slots are active

    gr.Markdown("## Paragraph Annotation Tool")
    with gr.Row():
        upload_box = gr.File(label="Upload / Resume CSV", file_types=[".csv"])
        annot_box = gr.Textbox(label="Annotator name")
        start_btn = gr.Button("Start / Resume")
    annotator_label = gr.Markdown(visible=False)

    annotation_area = gr.Column(visible=False)
    with annotation_area:
        idx_box = gr.Number(label="Index", interactive=False)
        hdr_box = gr.Markdown()
        para_box = gr.Textbox(lines=6, interactive=False,
                              label="Content Paragraph")

        # Pre-allocate up to MAX_MODELS slots
        out_boxes, radio_widgets, comment_boxes = [], [], []
        for _ in range(MAX_MODELS):
            with gr.Row():
                # prompts + ratings
                with gr.Column(scale=2):
                    out1 = gr.Textbox(lines=6, interactive=False)
                    rad1 = gr.Radio(RATING_OPTS, label="Rating (P1)", value=None)
                with gr.Column(scale=2):
                    out2 = gr.Textbox(lines=6, interactive=False)
                    rad2 = gr.Radio(RATING_OPTS, label="Rating (P2)", value=None)
                # comment textbox
                with gr.Column(scale=1):
                    com = gr.Textbox(lines=2, label="Comment",
                                     placeholder="Optional…")
            out_boxes.extend((out1, out2))
            radio_widgets.extend((rad1, rad2))
            comment_boxes.append(com)

        back_btn = gr.Button("⟵ Back", visible=False)
        next_btn = gr.Button("Save & Next ⟶", visible=False)
        download_btn = gr.Button("💾 Download CSV", visible=False)

    # Enable NEXT when visible radios are filled (comments are optional)
    def toggle_next(model_cnt: int, *vals):
        """Make the Next button interactive once all active ratings are set."""
        needed = vals[: model_cnt * 2]    # only rating radios
        return gr.update(interactive=all(v in RATING_OPTS for v in needed))

    for r in radio_widgets:
        r.change(toggle_next,
                 inputs=[nmodels_state] + radio_widgets,
                 outputs=next_btn)

    # ── navigation callbacks ──────────────────────────────────────────────
    nav_inputs = [idx_state, nmodels_state, *radio_widgets, *comment_boxes]
    nav_outputs = [idx_state, idx_box, hdr_box, para_box,
                   *out_boxes, *radio_widgets, *comment_boxes,
                   back_btn, next_btn]

    def goto(step: int):
        """Build the click handler that moves *step* rows (-1 back, +1 next)."""
        def _fn(idx: int, model_cnt: int, *vals):
            """Handle Back / Next logic."""
            # structure of *vals*: MAX_MODELS*2 radios, then MAX_MODELS comments
            radio_count = MAX_MODELS * 2
            ratings = list(vals[: model_cnt * 2])
            comments = list(vals[radio_count: radio_count + model_cnt])
            # save current row unless we attempted to go back without
            # finishing ratings
            if step != -1 or all(r in RATING_OPTS for r in ratings):
                save_row(idx, ratings, comments)
            new_idx = max(0, min(idx + step, TOTAL - 1))
            row_vals = build_row(new_idx)
            # build_row yields the index once; it feeds both idx_state and
            # idx_box so the visible "Index" field follows the navigation.
            return (row_vals[0], *row_vals)
        return _fn

    back_btn.click(goto(-1), inputs=nav_inputs, outputs=nav_outputs)
    next_btn.click(goto(1), inputs=nav_inputs, outputs=nav_outputs)

    # CSV download
    def make_download():
        """Write the current frame to a fresh file and return its path."""
        if df is None:
            raise gr.Error("No CSV loaded yet.")
        tmp = os.path.join(_writable_dir(),
                           f"annotations_{uuid.uuid4().hex}.csv")
        df.to_csv(tmp, index=False)
        return tmp

    download_btn.click(make_download, outputs=gr.File())

    # ── Start / Resume ────────────────────────────────────────────────────
    def start_app(csv_file, name):
        """Load the uploaded CSV and show the first incomplete example."""
        global annotator
        clean_name = (name or "").strip()
        if csv_file is None or not clean_name:
            raise gr.Error("Please upload a CSV and enter your name.")

        # The upload may arrive as a path string or as a file-like object
        # exposing ``.name``; accept both.
        src_path = getattr(csv_file, "name", csv_file)
        new_path = os.path.join(_writable_dir(), f"{uuid.uuid4().hex}.csv")
        shutil.copy(src_path, new_path)
        try:
            load_csv(new_path)
        except ValueError as err:
            # Show the validation message to the user instead of a generic
            # failure.
            raise gr.Error(str(err)) from err
        annotator = clean_name

        # visibility flags – one boolean per model slot
        vis_flags = [i < len(models) for i in range(MAX_MODELS)]

        # build first row values
        start_idx = first_incomplete()
        row_vals = build_row(start_idx)
        idx_val, hdr_val, para_val = row_vals[:3]
        outs = row_vals[3: 3 + MAX_MODELS * 2]
        rates = row_vals[3 + MAX_MODELS * 2: 3 + MAX_MODELS * 4]
        comms = row_vals[3 + MAX_MODELS * 4: 3 + MAX_MODELS * 5]
        back_update, next_update = row_vals[-2:]

        # updates for textboxes, radios, comments
        out_updates = [
            gr.update(value=val, visible=vis_flags[i // 2])
            for i, val in enumerate(outs)
        ]
        radio_updates = [
            gr.update(value=val, visible=vis_flags[i // 2])
            for i, val in enumerate(rates)
        ]
        comment_updates = [
            gr.update(value=val, visible=vis_flags[i])
            for i, val in enumerate(comms)
        ]

        return (
            start_idx,                          # idx_state
            len(models),                        # nmodels_state
            gr.update(value=idx_val),           # idx_box
            gr.update(value=hdr_val),           # hdr_box
            gr.update(value=para_val),          # para_box
            *out_updates,
            *radio_updates,
            *comment_updates,
            back_update, next_update,           # nav buttons
            gr.update(visible=True,
                      value=f"**Annotator:** {annotator}"),
            gr.update(visible=True),            # download_btn
            gr.update(visible=True),            # annotation_area
        )

    start_btn.click(
        start_app,
        inputs=[upload_box, annot_box],
        outputs=[
            idx_state, nmodels_state,
            idx_box, hdr_box, para_box,
            *out_boxes, *radio_widgets, *comment_boxes,
            back_btn, next_btn,
            annotator_label,
            download_btn,
            annotation_area,
        ],
    )
# ─── RUN ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Enable request queuing, then start the server.
    demo.queue()
    demo.launch()  # keep share=False on HF Spaces