Add coworking camera-aware scene prompts

2026-06-25 23:07:31 +02:00
parent 9434070877
commit ec5640fa22
4 changed files with 282 additions and 6 deletions
@@ -392,6 +392,14 @@ The translator accepts the Qwen labels such as `front-right quarter view`,
 as the native camera nodes. `suppress_phone_visibility` is enabled by default so
 generic Qwen camera views do not add `phone hidden` or other phone wording.
 For coworking-style locations, the prompt builder also uses the translated
 camera geometry to add a location-aware framing sentence. It currently targets
 `coworking lounge`, `business cafe`, and empty office scenes. The front/side/back
 view, zoom level, and elevation determine which desks, windows, laptop tables,
 glass partitions, counters, or office rows are kept visible. In male-POV setups this
 becomes a first-person spatial description and the external camera sentence is
 suppressed.
 `SxCP Caption Naturalizer` rewrites tag-like captions or labeled prompts into
 more natural language. Connect the prompt builder's `metadata_json` output to
 `source_text` for the cleanest result. You can also connect `caption` or
@@ -376,6 +376,7 @@ def _single_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -
    pose = _row_value(row, "pose", ("Pose",))
    expression = "" if _expression_disabled(row) else _row_value(row, "expression", ("Facial expression", "Facial expressions"))
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    prop = _row_value(row, "prop", ("Prop/detail",))
    style = _row_value(row, "style") if keep_style else ""
@@ -401,6 +402,8 @@ def _single_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -
        parts.append(f"{possessive_pronoun(subject)} expression is {expression}")
    if scene:
        parts.append(f"The setting is {scene}")
    if _detail_allows(detail_level) and camera_scene:
        parts.append(camera_scene)
    if _detail_allows(detail_level) and composition:
        parts.append(f"The composition is {composition}")
    if keep_style and style:
@@ -447,6 +450,7 @@ def _couple_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -
    if not _expression_disabled(row):
        expression = _row_value(row, "character_expression_text") or _row_value(row, "expression", ("Facial expressions", "Facial expression"))
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    style = _row_value(row, "style") if keep_style else ""
    parts = [f"{_cap_first(subject)} are adults"]
@@ -460,6 +464,8 @@ def _couple_from_row(row: dict[str, Any], detail_level: str, keep_style: bool) -
        parts.append(f"The pose is {pose}")
    if scene:
        parts.append(f"The setting is {scene}")
    if _detail_allows(detail_level) and camera_scene:
        parts.append(camera_scene)
    if expression:
        parts.append(f"Their expressions are {expression}")
    if _detail_allows(detail_level) and composition:
@@ -484,6 +490,7 @@ def _configured_cast_from_row(row: dict[str, Any], detail_level: str, keep_style
    if not _expression_disabled(row):
        expression = _row_value(row, "character_expression_text") or _row_value(row, "expression", ("Facial expressions", "Facial expression"))
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    cast_descriptor_text = _row_value(row, "cast_descriptor_text", ("Characters", "Cast descriptors"))
    scene_kind = _row_value(row, "scene_kind") or "explicit adult sex scene"
    style = _row_value(row, "style") if keep_style else ""
@@ -506,6 +513,8 @@ def _configured_cast_from_row(row: dict[str, Any], detail_level: str, keep_style
        scene_bits.append(f"framed as {composition}")
    if scene_bits and _detail_allows(detail_level):
        parts.append(", ".join(scene_bits))
    if _detail_allows(detail_level) and camera_scene:
        parts.append(camera_scene)
    if keep_style and style:
        parts.append(f"The visual style is {style}")
    return _join_sentences(parts), "metadata(configured_cast)"
@@ -524,6 +533,7 @@ def _group_or_layout_from_row(row: dict[str, Any], detail_level: str, keep_style
    if not _expression_disabled(row):
        expression = _row_value(row, "character_expression_text") or _row_value(row, "expression", ("Facial expressions", "Facial expression"))
    composition = _normalize_composition(_row_value(row, "composition", ("Composition",)))
    camera_scene = _clean_text(row.get("camera_scene_directive"))
    style = _row_value(row, "style") if keep_style else ""
    if primary == "layout scene":
@@ -540,6 +550,8 @@ def _group_or_layout_from_row(row: dict[str, Any], detail_level: str, keep_style
            parts.append(f"They show {expression}")
    if scene:
        parts.append(f"The setting is {scene}")
    if _detail_allows(detail_level) and camera_scene:
        parts.append(camera_scene)
    if _detail_allows(detail_level) and composition:
        parts.append(f"The composition is {composition}")
    if keep_style and style:
@@ -2140,6 +2140,10 @@ def _camera_phrase(row: dict[str, Any]) -> str:
    return ""
 def _camera_scene_phrase(row: dict[str, Any]) -> str:
    """Return the row's ``camera_scene_directive`` value passed through ``_clean``."""
    raw_directive = row.get("camera_scene_directive")
    return _clean(raw_directive)
 def _camera_phrase_from_config(config: Any) -> str:
    if not isinstance(config, dict):
        return ""
@@ -2219,6 +2223,7 @@ def _normal_row_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
        flags=re.IGNORECASE,
    )
    camera = _camera_phrase(row)
    camera_scene = _camera_scene_phrase(row)
    style = _style_phrase(row, style_mode)
    if subject_type == "configured_cast" or _clean(row.get("cast_summary")):
@@ -2264,6 +2269,7 @@ def _normal_row_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
            f"A consensual explicit adult scene with {subject}" if not action else "",
            f"The cast includes {cast}" if cast and not cast_prose and not (women_count == 1 and men_count == 1) else "",
            f"The setting is {scene}" if scene else "",
            camera_scene,
            _expression_phrase(expression),
            _composition_phrase(output_composition, action, "The image is framed as", detail_density),
            camera,
@@ -2281,6 +2287,7 @@ def _normal_row_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
            f"{pose}" if pose else "",
            f"with {expression}" if expression else "",
            f"in {scene}" if scene else "",
            camera_scene,
            f"framed as {composition}" if composition else "",
            camera,
            style if detail_level != "concise" else "",
@@ -2300,6 +2307,7 @@ def _normal_row_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
            _couple_clothing_phrase(item) if item else "",
            f"The pose is {pose}" if pose else "",
            f"The setting is {scene}" if scene else "",
            camera_scene,
            f"Facial expressions are {expression}" if expression else "",
            f"The image is framed as {composition}" if composition else "",
            camera,
@@ -2312,6 +2320,7 @@ def _normal_row_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
        f"{subject}",
        f"featuring {item}" if item else "",
        f"in {scene}" if scene else "",
        camera_scene,
        f"with {expression}" if expression else "",
        f"framed as {composition}" if composition else "",
        camera,
@@ -2332,6 +2341,8 @@ def _insta_pair_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
    hard = row.get("hardcore_row") if isinstance(row.get("hardcore_row"), dict) else {}
    soft_camera = _pair_camera_phrase(row.get("softcore_camera_directive"), row.get("softcore_camera_config"), soft)
    hard_camera = _pair_camera_phrase(row.get("hardcore_camera_directive"), row.get("hardcore_camera_config"), hard)
    soft_camera_scene = _camera_scene_phrase(soft) or _clean(row.get("softcore_camera_scene_directive"))
    hard_camera_scene = _camera_scene_phrase(hard) or _clean(row.get("hardcore_camera_scene_directive"))
    soft_style = _style_phrase(soft, style_mode)
    hard_style = _style_phrase(hard, style_mode)
    options = row.get("options") if isinstance(row.get("options"), dict) else {}
@@ -2452,6 +2463,7 @@ def _insta_pair_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
        f"{soft.get('pose')}" if soft.get("pose") else "",
        _expression_phrase(soft_expression),
        f"in {soft.get('scene_text')}" if soft.get("scene_text") else "",
        soft_camera_scene,
        f"framed as {soft_output_composition}" if soft_output_composition else "",
        soft_camera,
        soft_style if detail_level != "concise" else "",
@@ -2465,6 +2477,7 @@ def _insta_pair_to_krea(row: dict[str, Any], detail_level: str, style_mode: str)
        ),
        hard_cast_prose,
        f"set in {hard_scene}" if hard_scene else "",
        hard_camera_scene,
        _expression_phrase(hard_expression),
        _composition_phrase(hard_output_composition, hard_action, detail_density=hard_detail_density),
        hard_camera,
@@ -3440,15 +3440,223 @@ def _camera_caption_text(parsed: dict[str, Any]) -> str:
    return f"{camera_mode} camera framing"
 def _is_coworking_scene(scene_text: Any) -> bool:
    text = str(scene_text or "").lower()
    return any(
        term in text
        for term in (
            "coworking",
            "cowork",
            "office lounge",
            "business cafe",
            "work cafe",
            "shared office",
            "corporate office",
            "office after hours",
            "laptops",
            "warm desks",
            "repeating desks",
            "glass partitions",
            "copier alcove",
        )
    )
 def _camera_geometry_phrase(parsed: dict[str, Any]) -> str:
    direction = str(parsed.get("orbit_direction") or "").strip()
    elevation = str(parsed.get("orbit_elevation_label") or "").strip()
    distance = str(parsed.get("orbit_distance_label") or "").strip()
    custom = str(parsed.get("custom_camera_prompt") or "").strip()
    if not any((direction, elevation, distance)) and custom:
        return custom
    parts = [part for part in (direction, elevation, distance) if part and part != "auto"]
    if parts:
        return ", ".join(parts)
    compact_parts = [
        CAMERA_COMPACT_LABELS.get(str(parsed.get(key) or ""), str(parsed.get(key) or "").replace("_", " "))
        for key in ("shot_size", "angle", "distance")
    ]
    compact_parts = [part for part in compact_parts if part and part != "auto"]
    return ", ".join(compact_parts)
 def _camera_direction_from_text(text: Any) -> str:
    source = str(text or "").lower()
    for label in (
        "front-right quarter view",
        "right side view",
        "back-right quarter view",
        "back view",
        "back-left quarter view",
        "left side view",
        "front-left quarter view",
        "front view",
    ):
        if label in source:
            return label
    return ""
 def _camera_elevation_from_text(text: Any) -> str:
    source = str(text or "").lower()
    for label in ("low-angle shot", "eye-level shot", "elevated shot", "high-angle shot"):
        if label in source:
            return label
    return ""
 def _camera_distance_from_text(text: Any) -> str:
    source = str(text or "").lower()
    for label in ("wide shot", "full-body shot", "three-quarter body shot", "medium shot", "close-up", "extreme close-up"):
        if label in source:
            return label
    return ""
 def _coworking_location_profile(scene_text: Any) -> dict[str, str]:
    text = str(scene_text or "").lower()
    if "business cafe" in text or "work cafe" in text or "cafe" in text:
        return {
            "place": "business cafe coworking counter",
            "foreground": "counter edge, small plant, laptop corner, and polished phone-check surface",
            "midground": "bar stools, warm desk lamps, coffee counter, and laptop users' empty work spots",
            "background": "plants, mirror strip, menu wall, and repeated cafe work tables",
        }
    if "corporate office" in text or "office after hours" in text or "copier" in text:
        return {
            "place": "empty after-hours office",
            "foreground": "copier alcove edge, office chair backs, and the nearest desk corner",
            "midground": "repeating desks, glass partition seams, blinds, and muted monitor glow",
            "background": "rows of empty workstations, city-light windows, and quiet office depth",
        }
    return {
        "place": "coworking lounge",
        "foreground": "nearest desk edge, laptop corner, chair back, and polished tabletop line",
        "midground": "warm work desks, laptop tables, glass partition seams, and open walking aisle",
        "background": "tall windows, repeated desk rows, plants, and soft shared-office depth",
    }
 def _coworking_direction_detail(
    direction: str,
    profile: dict[str, str],
    pov_labels: list[str] | None = None,
 ) -> str:
    direction = str(direction or "").strip().lower()
    foreground = profile["foreground"]
    midground = profile["midground"]
    background = profile["background"]
    if pov_labels:
        if "right side" in direction:
            return f"the visible partner is in right-side profile across the lower foreground: {foreground}; behind them, {midground} runs horizontally toward {background}"
        if "left side" in direction:
            return f"the visible partner is in left-side profile across the lower foreground: {foreground}; behind them, {midground} runs horizontally toward {background}"
        if "back-right" in direction or "back-left" in direction:
            return f"the viewer sees the visible partner from a rear-quarter angle, turning back over one shoulder; {foreground} sits at the lower edge while {midground} leads into {background}"
        if direction == "back view":
            return f"the viewer looks past the visible partner's back toward {midground}, then into {background}, with foreground body cues low in frame"
        if "front-right" in direction or "front-left" in direction:
            return f"the visible partner is close in a front-quarter view over the lower foreground: {foreground}; {midground} recede diagonally into {background}"
        return f"the visible partner faces the viewer over the lower foreground: {foreground}; {midground} stacks clearly in front of {background}"
    if "right side" in direction or "left side" in direction:
        return f"the cast is held in clean side profile along the foreground anchor: {foreground}; {midground} creates horizontal perspective lines, with {background} still visible"
    if "back-right" in direction or "back-left" in direction:
        return f"the cast is viewed from a rear-quarter angle, partly turning back toward the camera; {foreground} stays low in frame while {midground} leads into {background}"
    if direction == "back view":
        return f"the cast is seen from behind with {foreground} at the camera side, facing into {midground} and {background}"
    if "front-right" in direction or "front-left" in direction:
        return f"the cast is placed beside the foreground anchor: {foreground}; {midground} recede diagonally into {background}"
    return f"the cast faces the camera beside the foreground anchor: {foreground}; {midground} is layered between the cast and {background}"
 def _coworking_distance_detail(distance: str, profile: dict[str, str]) -> str:
    distance = str(distance or "").strip().lower()
    if "wide" in distance or "full-body" in distance or "full body" in distance:
        return f"Keep full bodies plus floor aisle, table rows, and enough {profile['background']} to read the whole {profile['place']}."
    if "close" in distance:
        return f"Crop close, but keep one concrete location anchor visible: {profile['foreground']} or a slice of {profile['midground']}."
    return f"Use a medium crop: bodies stay dominant, but the foreground anchor ({profile['foreground']}) and one midground layer ({profile['midground']}) remain visible."
 def _coworking_elevation_detail(elevation: str, profile: dict[str, str]) -> str:
    elevation = str(elevation or "").strip().lower()
    if "low-angle" in elevation:
        return f"Low viewpoint: let {profile['foreground']} loom at the lower edge while windows and partitions rise behind the bodies."
    if "elevated" in elevation:
        return f"Elevated viewpoint: show tabletop surfaces, laptop rectangles, chair positions, and the walking aisle around the bodies."
    if "high-angle" in elevation:
        return f"High viewpoint: look down over the grid of desks, chairs, floor aisle, and body placement so the room layout is explicit."
    return f"Eye-level viewpoint: keep tabletop lines and glass seams straight enough to make the {profile['place']} believable."
 def _coworking_camera_scene_directive(
    scene_text: Any,
    parsed: dict[str, Any],
    pov_labels: list[str] | None = None,
 ) -> str:
    """Compose the location-aware camera sentence for a coworking-style scene.

    Returns ``""`` when the scene is not recognized as coworking-style or when
    the parsed camera config carries no direction, elevation, distance or
    custom prompt. Direction, elevation and distance come from the orbit
    labels when set and are otherwise looked up in the custom camera prompt.
    A non-empty ``pov_labels`` switches to the first-person wording; without
    it the sentence is prefixed with the external camera geometry phrase.
    """
    if not _is_coworking_scene(scene_text):
        return ""

    def _label(key: str) -> str:
        return str(parsed.get(key) or "").strip()

    custom_prompt = _label("custom_camera_prompt")
    # Explicit orbit labels win; the custom prompt is only a fallback source.
    direction = _label("orbit_direction") or _camera_direction_from_text(custom_prompt)
    elevation = _label("orbit_elevation_label") or _camera_elevation_from_text(custom_prompt)
    distance = _label("orbit_distance_label") or _camera_distance_from_text(custom_prompt)
    if not (direction or elevation or distance or custom_prompt):
        return ""

    profile = _coworking_location_profile(scene_text)
    place = profile["place"]
    direction_detail = _coworking_direction_detail(direction, profile, pov_labels)
    distance_detail = _coworking_distance_detail(distance, profile)
    elevation_detail = _coworking_elevation_detail(elevation, profile)

    if pov_labels:
        opening = f"From the POV participant's position inside the {place}, {direction_detail}. "
        closing = (
            f"{distance_detail} {elevation_detail} "
            "Use the multiangle camera only as spatial geometry for what the viewer can see."
        )
        return opening + closing

    geometry = _camera_geometry_phrase(parsed)
    geometry_clause = ""
    if geometry:
        geometry_clause = f" from a {geometry}"
    return f"In the {place}{geometry_clause}, {direction_detail}. {distance_detail} {elevation_detail}"
 def _camera_scene_directive_for_context(
    scene_text: Any,
    composition: Any,
    camera_config: str | dict[str, Any] | None,
    pov_labels: list[str] | None = None,
 ) -> tuple[str, dict[str, Any]]:
    """Parse the camera config and build the scene directive for it.

    Returns ``(directive, parsed_config)``. The directive is ``""`` when the
    parsed config has ``camera_detail == "off"`` or
    ``camera_mode == "disabled"``; otherwise it is whatever
    ``_coworking_camera_scene_directive`` produces. ``composition`` is
    accepted but not used by this function.
    """
    parsed = _parse_camera_config(camera_config)
    camera_off = parsed["camera_detail"] == "off" or parsed["camera_mode"] == "disabled"
    if camera_off:
        directive = ""
    else:
        directive = _coworking_camera_scene_directive(scene_text, parsed, pov_labels)
    return directive, parsed
 def _apply_camera_config(row: dict[str, Any], camera_config: str | dict[str, Any] | None) -> dict[str, Any]:
    directive, parsed = _camera_directive(camera_config)
    pov_labels = _pov_character_labels(
        _character_slot_label_map(_parse_character_cast(row.get("character_cast_slots"))),
        int(row.get("men_count") or 0) if str(row.get("men_count") or "").isdigit() else 0,
    )
    if not pov_labels:
        pov_labels = [str(label) for label in _list_from(row.get("pov_character_labels")) if str(label).strip()]
    scene_directive, parsed = _camera_scene_directive_for_context(
        row.get("scene_text") or row.get("source_scene_text") or row.get("scene"),
        row.get("composition") or row.get("source_composition"),
        parsed,
        pov_labels,
    )
    row["camera_config"] = parsed
-    row["camera_directive"] = directive
+    row["camera_scene_directive"] = scene_directive
-    if not directive:
+    row["camera_directive"] = "" if pov_labels else directive
    combined_directive = " ".join(part for part in (scene_directive, row["camera_directive"]) if part)
    if not combined_directive:
        return row
-    row["prompt"] = _insert_positive_directive(row["prompt"], directive)
+    row["prompt"] = _insert_positive_directive(row["prompt"], combined_directive)
    camera_caption = _camera_caption_text(parsed)
-    if camera_caption:
+    if camera_caption and not pov_labels:
        row["caption"] = f"{row.get('caption', '').rstrip()}, {camera_caption}"
    return row
@@ -7981,10 +8189,39 @@ def build_insta_of_pair(
    hard_camera_config = _insta_camera_config_with_detail(hard_camera_config, options["camera_detail"])
    soft_camera_directive, soft_camera_config = _camera_directive(soft_camera_config)
    hard_camera_directive, hard_camera_config = _camera_directive(hard_camera_config)
    soft_camera_sentence = f"Camera control: {soft_camera_directive} " if soft_camera_directive else ""
    hard_camera_sentence = f"Camera control: {hard_camera_directive} " if hard_camera_directive else ""
    hard_scene = soft_row["scene_text"] if options["continuity"] == "same_creator_same_room" else hard_row["scene_text"]
    hard_composition = hard_row["composition"]
    soft_pov_camera_labels = (
        pov_character_labels
        if options["softcore_cast"] == "same_as_hardcore"
        else []
    )
    soft_camera_scene_directive, soft_camera_config = _camera_scene_directive_for_context(
        soft_row.get("scene_text"),
        soft_row.get("composition"),
        soft_camera_config,
        soft_pov_camera_labels,
    )
    hard_camera_scene_directive, hard_camera_config = _camera_scene_directive_for_context(
        hard_scene,
        hard_composition,
        hard_camera_config,
        pov_character_labels,
    )
    if soft_pov_camera_labels:
        soft_camera_directive = ""
    if pov_character_labels:
        hard_camera_directive = ""
    soft_row["camera_config"] = soft_camera_config
    soft_row["camera_directive"] = soft_camera_directive
    soft_row["camera_scene_directive"] = soft_camera_scene_directive
    hard_row["camera_config"] = hard_camera_config
    hard_row["camera_directive"] = hard_camera_directive
    hard_row["camera_scene_directive"] = hard_camera_scene_directive
    soft_camera_scene_sentence = f"{soft_camera_scene_directive} " if soft_camera_scene_directive else ""
    hard_camera_scene_sentence = f"{hard_camera_scene_directive} " if hard_camera_scene_directive else ""
    soft_camera_sentence = f"Camera control: {soft_camera_directive} " if soft_camera_directive else ""
    hard_camera_sentence = f"Camera control: {hard_camera_directive} " if hard_camera_directive else ""
    soft_cast = (
        "solo creator setup with Woman A alone"
        if options["softcore_cast"] == "solo"
@@ -8065,6 +8302,7 @@ def build_insta_of_pair(
        f"{soft_cast_presence}"
        f"{soft_cast_styling_sentence}"
        f"{soft_row['softcore_item_prompt_label']}: {soft_row['item']}. Pose: {soft_row['pose']}. Setting: {soft_row['scene_text']}. "
        f"{soft_camera_scene_sentence}"
        f"{_labeled_expression_sentence('Facial expression', soft_row.get('expression'))}"
        f"Composition: {soft_row['composition']}. "
        f"{soft_camera_sentence}"
@@ -8080,6 +8318,7 @@ def build_insta_of_pair(
        f"{hard_clothing_sentence}"
        f"Role graph: {hard_row['role_graph']} Sexual scene: {hard_row['item']}. "
        f"Setting: {hard_scene}. "
        f"{hard_camera_scene_sentence}"
        f"{_labeled_expression_sentence('Facial expressions', hard_row.get('expression'))}"
        f"Composition: {hard_composition}. "
        f"{hard_detail_directive}"
@@ -8104,6 +8343,7 @@ def build_insta_of_pair(
        soft_partner_outfit_text,
        soft_partner_styling["pose"],
        soft_row["scene_text"],
        soft_camera_scene_directive,
        soft_row["composition"],
        _camera_caption_text(soft_camera_config) if soft_camera_directive else "",
    ]
@@ -8117,6 +8357,7 @@ def build_insta_of_pair(
        hard_row["role_graph"],
        hard_row["item"],
        hard_scene,
        hard_camera_scene_directive,
        hard_composition,
        _camera_caption_text(hard_camera_config) if hard_camera_directive else "",
    ]
@@ -8150,5 +8391,7 @@ def build_insta_of_pair(
        "hardcore_camera_config": hard_camera_config,
        "softcore_camera_directive": soft_camera_directive,
        "hardcore_camera_directive": hard_camera_directive,
        "softcore_camera_scene_directive": soft_camera_scene_directive,
        "hardcore_camera_scene_directive": hard_camera_scene_directive,
    }
    return metadata