import numpy as np
from PIL import Image


class Mamad8_QwenEditPlus_Standalone:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "clip": ("CLIP",),
                "image1": ("IMAGE",),
                "text": ("STRING", {"multiline": True, "default": "Describe the change..."}),
            },
            "optional": {
                "image2": ("IMAGE",),
                "image3": ("IMAGE",),
                "negative_prompt": ("STRING", {"multiline": True, "default": "low quality, blurry"}),
                "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
    RETURN_NAMES = ("conditioning", "negative_conditioning")
    FUNCTION = "encode"
    CATEGORY = "Qwen/Edit_Standalone"

    def common_preprocessing(self, image):
        # Convert a ComfyUI image tensor (BHWC, float in [0, 1]) into a PIL image for Qwen
        if len(image.shape) == 4:
            image = image[0]  # drop the batch dimension, keeping the first image
        img = 255.0 * image.cpu().numpy()
        img = Image.fromarray(np.clip(img, 0, 255).astype(np.uint8))
        return img

    def encode(self, clip, image1, text, image2=None, image3=None, negative_prompt="", strength=1.0):
        # 1. Prepare the reference images for visual conditioning
        images_input = [self.common_preprocessing(image1)]
        if image2 is not None:
            images_input.append(self.common_preprocessing(image2))
        if image3 is not None:
            images_input.append(self.common_preprocessing(image3))

        # 2. Encode the positive prompt with the injected images
        # Note: this relies on the Qwen2-VL-specific CLIP implementation
        tokens = clip.tokenize(text)
        cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
        # The 'images' entry is essential so the Qwen model knows what to edit
        conditioning = [[cond, {"pooled_output": pooled, "images": images_input, "strength": strength}]]

        # 3. Encode the negative prompt (no image conditioning needed)
        n_tokens = clip.tokenize(negative_prompt)
        n_cond, n_pooled = clip.encode_from_tokens(n_tokens, return_pooled=True)
        negative_conditioning = [[n_cond, {"pooled_output": n_pooled}]]

        return (conditioning, negative_conditioning)
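

# Registration boilerplate assumed for a standalone custom-node file: ComfyUI
# discovers nodes via these module-level mappings, so without them the class
# above never appears in the node menu. The display-name string is illustrative,
# not from the original source; choose whatever label fits your node pack.
NODE_CLASS_MAPPINGS = {
    "Mamad8_QwenEditPlus_Standalone": Mamad8_QwenEditPlus_Standalone,
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "Mamad8_QwenEditPlus_Standalone": "Qwen Edit Plus (Standalone)",
}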