# ComfyUI/custom_nodes/x-flux-comfyui/clip.py

import json
import os
from transformers import (CLIPImageProcessor,
                          CLIPVisionModelWithProjection,
                          CLIPVisionConfig,
                          AutoConfig)


class FluxClipViT:
    """Wrapper around ``CLIPVisionModelWithProjection`` that returns image embeddings.

    Instance attributes set by ``__init__``:
      * ``model`` -- the loaded ``CLIPVisionModelWithProjection``.
      * ``image_processor`` -- a default-constructed ``CLIPImageProcessor``.
      * ``load_device`` -- device of the model's first parameter at load time.
    """

    def __init__(self, path_model=None):
        """Load the vision model and create the image processor.

        Args:
            path_model: Optional path to local weights. When ``None`` the
                ``"openai/clip-vit-large-patch14"`` checkpoint is loaded by
                name. Otherwise ``write_config`` makes sure a
                ``flux_clip_config.json`` exists in the directory containing
                ``path_model``; that file is read as a ``CLIPVisionConfig``
                and the weights are loaded with ``use_safetensors=True``.
        """
        if path_model is None:
            self.model = CLIPVisionModelWithProjection.from_pretrained(
                "openai/clip-vit-large-patch14"
            )
        else:
            _dir = os.path.dirname(path_model)
            # The config file lives next to the weights; it is created on
            # first use and reused afterwards.
            write_config(_dir)
            config = CLIPVisionConfig.from_pretrained(
                os.path.join(_dir, "flux_clip_config.json")
            )
            self.model = CLIPVisionModelWithProjection.from_pretrained(
                path_model,
                config=config,
                use_safetensors=True,
            )
        self.image_processor = CLIPImageProcessor()
        self.load_device = next(self.model.parameters()).device

    def __call__(self, image):
        """Preprocess ``image`` and return the model's ``image_embeds``.

        Args:
            image: Passed unchanged to the image processor as ``images=``
                (accepted formats are defined by ``CLIPImageProcessor``).

        Returns:
            The ``image_embeds`` attribute of the model output.
        """
        pixel_values = self.image_processor(
            images=image, return_tensors="pt"
        ).pixel_values
        # The processor output does not follow the model: if the model was
        # moved to another device or converted to another dtype after loading,
        # feeding the raw tensor raises a mismatch error. Align the input with
        # the model's current parameters (a no-op when they already match).
        reference = next(self.model.parameters())
        pixel_values = pixel_values.to(
            device=reference.device, dtype=reference.dtype
        )
        return self.model(pixel_values).image_embeds


def write_config(path):
    """Create ``flux_clip_config.json`` inside ``path`` if it is not there yet.

    The file receives the module-level ``json_config`` dict serialized as JSON
    with a 4-space indent. An already existing file is left untouched.

    Args:
        path: Directory in which the config file is looked up / created.
    """
    target = os.path.join(path, "flux_clip_config.json")
    if not os.path.exists(target):
        with open(target, "w") as handle:
            json.dump(json_config, handle, indent=4)

# CLIP configuration (labelled "clip-vit-large-patch14/") with both the text
# and the vision sections. write_config() dumps this dict verbatim to
# flux_clip_config.json, so the key order below is the order in that file.
json_config = {
    "_name_or_path": "clip-vit-large-patch14/",
    "architectures": ["CLIPModel"],
    "initializer_factor": 1.0,
    "logit_scale_init_value": 2.6592,
    "model_type": "clip",
    "projection_dim": 768,
    "text_config": {
        "_name_or_path": "",
        "add_cross_attention": False,
        "architectures": None,
        "attention_dropout": 0.0,
        "bad_words_ids": None,
        "bos_token_id": 0,
        "chunk_size_feed_forward": 0,
        "cross_attention_hidden_size": None,
        "decoder_start_token_id": None,
        "diversity_penalty": 0.0,
        "do_sample": False,
        "dropout": 0.0,
        "early_stopping": False,
        "encoder_no_repeat_ngram_size": 0,
        "eos_token_id": 2,
        "finetuning_task": None,
        "forced_bos_token_id": None,
        "forced_eos_token_id": None,
        "hidden_act": "quick_gelu",
        "hidden_size": 768,
        "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
        "initializer_factor": 1.0,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "is_decoder": False,
        "is_encoder_decoder": False,
        "label2id": {"LABEL_0": 0, "LABEL_1": 1},
        "layer_norm_eps": 1e-05,
        "length_penalty": 1.0,
        "max_length": 20,
        "max_position_embeddings": 77,
        "min_length": 0,
        "model_type": "clip_text_model",
        "no_repeat_ngram_size": 0,
        "num_attention_heads": 12,
        "num_beam_groups": 1,
        "num_beams": 1,
        "num_hidden_layers": 12,
        "num_return_sequences": 1,
        "output_attentions": False,
        "output_hidden_states": False,
        "output_scores": False,
        "pad_token_id": 1,
        "prefix": None,
        "problem_type": None,
        "projection_dim": 768,
        "pruned_heads": {},
        "remove_invalid_values": False,
        "repetition_penalty": 1.0,
        "return_dict": True,
        "return_dict_in_generate": False,
        "sep_token_id": None,
        "task_specific_params": None,
        "temperature": 1.0,
        "tie_encoder_decoder": False,
        "tie_word_embeddings": True,
        "tokenizer_class": None,
        "top_k": 50,
        "top_p": 1.0,
        "torch_dtype": None,
        "torchscript": False,
        "transformers_version": "4.16.0.dev0",
        "use_bfloat16": False,
        "vocab_size": 49408,
    },
    "text_config_dict": {
        "hidden_size": 768,
        "intermediate_size": 3072,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "projection_dim": 768,
    },
    "torch_dtype": "float32",
    "transformers_version": None,
    "vision_config": {
        "_name_or_path": "",
        "add_cross_attention": False,
        "architectures": None,
        "attention_dropout": 0.0,
        "bad_words_ids": None,
        "bos_token_id": None,
        "chunk_size_feed_forward": 0,
        "cross_attention_hidden_size": None,
        "decoder_start_token_id": None,
        "diversity_penalty": 0.0,
        "do_sample": False,
        "dropout": 0.0,
        "early_stopping": False,
        "encoder_no_repeat_ngram_size": 0,
        "eos_token_id": None,
        "finetuning_task": None,
        "forced_bos_token_id": None,
        "forced_eos_token_id": None,
        "hidden_act": "quick_gelu",
        "hidden_size": 1024,
        "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
        "image_size": 224,
        "initializer_factor": 1.0,
        "initializer_range": 0.02,
        "intermediate_size": 4096,
        "is_decoder": False,
        "is_encoder_decoder": False,
        "label2id": {"LABEL_0": 0, "LABEL_1": 1},
        "layer_norm_eps": 1e-05,
        "length_penalty": 1.0,
        "max_length": 20,
        "min_length": 0,
        "model_type": "clip_vision_model",
        "no_repeat_ngram_size": 0,
        "num_attention_heads": 16,
        "num_beam_groups": 1,
        "num_beams": 1,
        "num_hidden_layers": 24,
        "num_return_sequences": 1,
        "output_attentions": False,
        "output_hidden_states": False,
        "output_scores": False,
        "pad_token_id": None,
        "patch_size": 14,
        "prefix": None,
        "problem_type": None,
        "projection_dim": 768,
        "pruned_heads": {},
        "remove_invalid_values": False,
        "repetition_penalty": 1.0,
        "return_dict": True,
        "return_dict_in_generate": False,
        "sep_token_id": None,
        "task_specific_params": None,
        "temperature": 1.0,
        "tie_encoder_decoder": False,
        "tie_word_embeddings": True,
        "tokenizer_class": None,
        "top_k": 50,
        "top_p": 1.0,
        "torch_dtype": None,
        "torchscript": False,
        "transformers_version": "4.16.0.dev0",
        "use_bfloat16": False,
    },
    "vision_config_dict": {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "patch_size": 14,
        "projection_dim": 768,
    },
}