Files
ComfyUI/custom_nodes/x-flux-comfyui/clip.py
jaidaken f09734b0ee
Some checks failed
Python Linting / Run Ruff (push) Has been cancelled
Python Linting / Run Pylint (push) Has been cancelled
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Has been cancelled
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Has been cancelled
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Has been cancelled
Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Has been cancelled
Execution Tests / test (macos-latest) (push) Has been cancelled
Execution Tests / test (ubuntu-latest) (push) Has been cancelled
Execution Tests / test (windows-latest) (push) Has been cancelled
Test server launches without errors / test (push) Has been cancelled
Unit Tests / test (macos-latest) (push) Has been cancelled
Unit Tests / test (ubuntu-latest) (push) Has been cancelled
Unit Tests / test (windows-2022) (push) Has been cancelled
Add custom nodes, Civitai loras (LFS), and vast.ai setup script
Includes 30 custom nodes committed directly, 7 Civitai-exclusive
loras stored via Git LFS, and a setup script that installs all
dependencies and downloads HuggingFace-hosted models on vast.ai.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 00:56:42 +00:00

193 lines
5.4 KiB
Python

import json
import os
from transformers import (CLIPImageProcessor,
CLIPVisionModelWithProjection,
CLIPVisionConfig,
AutoConfig)
class FluxClipViT:
    """Wrap a CLIP vision model with projection and its image preprocessor.

    Calling an instance preprocesses the given image(s) and returns the
    model's ``image_embeds`` output.

    Attributes are plain instance attributes:
      * ``model`` - the loaded ``CLIPVisionModelWithProjection``.
      * ``image_processor`` - a default-constructed ``CLIPImageProcessor``.
      * ``load_device`` - device of the model's first parameter, captured
        once at construction time (it is not refreshed if the model is
        moved afterwards).
    """

    def __init__(self, path_model=None):
        """Load the vision model and build the image preprocessor.

        Args:
            path_model: Path to a local checkpoint, loaded with
                ``use_safetensors=True``. When ``None``, the
                ``openai/clip-vit-large-patch14`` weights are loaded via
                ``from_pretrained`` instead.
        """
        if path_model is None:
            self.model = CLIPVisionModelWithProjection.from_pretrained(
                "openai/clip-vit-large-patch14"
            )
        else:
            # A local checkpoint is loaded with an explicit config, so make
            # sure "flux_clip_config.json" exists next to it first.
            model_dir = os.path.dirname(path_model)
            write_config(model_dir)
            config = CLIPVisionConfig.from_pretrained(
                os.path.join(model_dir, "flux_clip_config.json")
            )
            self.model = CLIPVisionModelWithProjection.from_pretrained(
                path_model,
                config=config,
                use_safetensors=True,
            )
        self.image_processor = CLIPImageProcessor()
        self.load_device = next(self.model.parameters()).device

    def __call__(self, image):
        """Return the projected image embeddings for *image*.

        Args:
            image: Image input accepted by ``CLIPImageProcessor``.

        Returns:
            The ``image_embeds`` attribute of the model output.
        """
        inputs = self.image_processor(images=image, return_tensors="pt")
        # The preprocessor does not know where the model lives, so move the
        # pixel tensor to the model's *current* device; without this the
        # forward pass fails once the model has been moved off its initial
        # device. This is a no-op when both are already on the same device.
        model_device = next(self.model.parameters()).device
        pixel_values = inputs.pixel_values.to(model_device)
        return self.model(pixel_values).image_embeds
def write_config(path):
    """Create ``flux_clip_config.json`` inside *path* unless it already exists.

    The file receives the module-level ``json_config`` dictionary serialized
    as JSON with a four-space indent. An existing file is never modified.

    Args:
        path: Directory in which the config file is created.
    """
    config_path = os.path.join(path, "flux_clip_config.json")
    try:
        # Exclusive creation ("x") performs the existence check and the
        # open in one step, so a file that appears between "check" and
        # "write" can never be truncated.
        config_file = open(config_path, "x", encoding="utf-8")
    except FileExistsError:
        return
    with config_file:
        json.dump(json_config, config_file, indent=4)
# Configuration dictionary that write_config() serializes to
# "flux_clip_config.json". It carries a text and a vision sub-config plus
# the abbreviated "*_config_dict" variants. Key order is kept as-is because
# it determines the layout of the written file.
json_config = {
    "_name_or_path": "clip-vit-large-patch14/",
    "architectures": ["CLIPModel"],
    "initializer_factor": 1.0,
    "logit_scale_init_value": 2.6592,
    "model_type": "clip",
    "projection_dim": 768,
    # Text tower settings.
    "text_config": {
        "_name_or_path": "",
        "add_cross_attention": False,
        "architectures": None,
        "attention_dropout": 0.0,
        "bad_words_ids": None,
        "bos_token_id": 0,
        "chunk_size_feed_forward": 0,
        "cross_attention_hidden_size": None,
        "decoder_start_token_id": None,
        "diversity_penalty": 0.0,
        "do_sample": False,
        "dropout": 0.0,
        "early_stopping": False,
        "encoder_no_repeat_ngram_size": 0,
        "eos_token_id": 2,
        "finetuning_task": None,
        "forced_bos_token_id": None,
        "forced_eos_token_id": None,
        "hidden_act": "quick_gelu",
        "hidden_size": 768,
        "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
        "initializer_factor": 1.0,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "is_decoder": False,
        "is_encoder_decoder": False,
        "label2id": {"LABEL_0": 0, "LABEL_1": 1},
        "layer_norm_eps": 1e-05,
        "length_penalty": 1.0,
        "max_length": 20,
        "max_position_embeddings": 77,
        "min_length": 0,
        "model_type": "clip_text_model",
        "no_repeat_ngram_size": 0,
        "num_attention_heads": 12,
        "num_beam_groups": 1,
        "num_beams": 1,
        "num_hidden_layers": 12,
        "num_return_sequences": 1,
        "output_attentions": False,
        "output_hidden_states": False,
        "output_scores": False,
        "pad_token_id": 1,
        "prefix": None,
        "problem_type": None,
        "projection_dim": 768,
        "pruned_heads": {},
        "remove_invalid_values": False,
        "repetition_penalty": 1.0,
        "return_dict": True,
        "return_dict_in_generate": False,
        "sep_token_id": None,
        "task_specific_params": None,
        "temperature": 1.0,
        "tie_encoder_decoder": False,
        "tie_word_embeddings": True,
        "tokenizer_class": None,
        "top_k": 50,
        "top_p": 1.0,
        "torch_dtype": None,
        "torchscript": False,
        "transformers_version": "4.16.0.dev0",
        "use_bfloat16": False,
        "vocab_size": 49408,
    },
    "text_config_dict": {
        "hidden_size": 768,
        "intermediate_size": 3072,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "projection_dim": 768,
    },
    "torch_dtype": "float32",
    "transformers_version": None,
    # Vision tower settings.
    "vision_config": {
        "_name_or_path": "",
        "add_cross_attention": False,
        "architectures": None,
        "attention_dropout": 0.0,
        "bad_words_ids": None,
        "bos_token_id": None,
        "chunk_size_feed_forward": 0,
        "cross_attention_hidden_size": None,
        "decoder_start_token_id": None,
        "diversity_penalty": 0.0,
        "do_sample": False,
        "dropout": 0.0,
        "early_stopping": False,
        "encoder_no_repeat_ngram_size": 0,
        "eos_token_id": None,
        "finetuning_task": None,
        "forced_bos_token_id": None,
        "forced_eos_token_id": None,
        "hidden_act": "quick_gelu",
        "hidden_size": 1024,
        "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
        "image_size": 224,
        "initializer_factor": 1.0,
        "initializer_range": 0.02,
        "intermediate_size": 4096,
        "is_decoder": False,
        "is_encoder_decoder": False,
        "label2id": {"LABEL_0": 0, "LABEL_1": 1},
        "layer_norm_eps": 1e-05,
        "length_penalty": 1.0,
        "max_length": 20,
        "min_length": 0,
        "model_type": "clip_vision_model",
        "no_repeat_ngram_size": 0,
        "num_attention_heads": 16,
        "num_beam_groups": 1,
        "num_beams": 1,
        "num_hidden_layers": 24,
        "num_return_sequences": 1,
        "output_attentions": False,
        "output_hidden_states": False,
        "output_scores": False,
        "pad_token_id": None,
        "patch_size": 14,
        "prefix": None,
        "problem_type": None,
        "projection_dim": 768,
        "pruned_heads": {},
        "remove_invalid_values": False,
        "repetition_penalty": 1.0,
        "return_dict": True,
        "return_dict_in_generate": False,
        "sep_token_id": None,
        "task_specific_params": None,
        "temperature": 1.0,
        "tie_encoder_decoder": False,
        "tie_word_embeddings": True,
        "tokenizer_class": None,
        "top_k": 50,
        "top_p": 1.0,
        "torch_dtype": None,
        "torchscript": False,
        "transformers_version": "4.16.0.dev0",
        "use_bfloat16": False,
    },
    "vision_config_dict": {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "patch_size": 14,
        "projection_dim": 768,
    },
}