"""Standalone Qwen Edit Plus conditioning node for ComfyUI."""
import torch
import torch.nn.functional as F

import numpy as np
from PIL import Image

class Mamad8_QwenEditPlus_Standalone:
|
|
@classmethod
|
|
def INPUT_TYPES(s):
|
|
return {
|
|
"required": {
|
|
"clip": ("CLIP",),
|
|
"image1": ("IMAGE",),
|
|
"text": ("STRING", {"multiline": True, "default": "Describe the change..."}),
|
|
},
|
|
"optional": {
|
|
"image2": ("IMAGE",),
|
|
"image3": ("IMAGE",),
|
|
"negative_prompt": ("STRING", {"multiline": True, "default": "low quality, blurry"}),
|
|
"strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
|
|
}
|
|
}
|
|
|
|
RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
|
|
RETURN_NAMES = ("conditioning", "negative_conditioning")
|
|
FUNCTION = "encode"
|
|
CATEGORY = "Qwen/Edit_Standalone"
|
|
|
|
def common_preprocessing(self, image):
|
|
# Conversion du tensor ComfyUI (BHWC) en format PIL pour Qwen
|
|
if len(image.shape) == 4:
|
|
image = image[0]
|
|
img = 255. * image.cpu().numpy()
|
|
img = Image.fromarray(np.clip(img, 0, 255).astype(np.uint8))
|
|
return img
|
|
|
|
def encode(self, clip, image1, text, image2=None, image3=None, negative_prompt="", strength=1.0):
|
|
# 1. Préparation des images pour le conditionnement visuel
|
|
images_input = [self.common_preprocessing(image1)]
|
|
|
|
if image2 is not None:
|
|
images_input.append(self.common_preprocessing(image2))
|
|
if image3 is not None:
|
|
images_input.append(self.common_preprocessing(image3))
|
|
|
|
# 2. Encodage du texte positif avec les images injectées
|
|
# Note: Cette méthode utilise l'implémentation spécifique de Qwen2-VL CLIP
|
|
tokens = clip.tokenize(text)
|
|
cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
|
|
|
|
# Le dictionnaire 'images' est essentiel pour que le modèle Qwen sache quoi modifier
|
|
conditioning = [[cond, {"pooled_output": pooled, "images": images_input, "strength": strength}]]
|
|
|
|
# 3. Encodage du texte négatif
|
|
n_tokens = clip.tokenize(negative_prompt)
|
|
n_cond, n_pooled = clip.encode_from_tokens(n_tokens, return_pooled=True)
|
|
negative_conditioning = [[n_cond, {"pooled_output": n_pooled}]]
|
|
|
|
return (conditioning, negative_conditioning) |