Merge pull request #103 from hodanov/feature/sdxl

Modify stable_diffusion_xl application.
2024-05-06 12:54:17 +09:00 · 2024-05-06 12:54:17 +09:00 · 608cf88991
commit 608cf88991
parent e4c2f622b2 487359dacf
4 changed files with 29 additions and 83 deletions
--- a/3
+++ b/3
@@ -29,7 +29,10 @@ img_by_sd15_img2img:
 img_by_sdxl_txt2img:
 	cd ./cmd && modal run sdxl_txt2img.py \
 	--prompt "A dog is running on the grass" \
+	--n-prompt "" \
 	--height 1024 \
 	--width 1024 \
 	--samples 1 \
+	--steps 30 \
+	--use-upscaler "True" \
 	--output-format "avif"
--- a/app/setup.py
+++ b/app/setup.py
@@ -86,13 +86,6 @@ def download_model_sdxl(name: str, model_url: str, token: str):
    )
    pipe.save_pretrained(cache_path, safe_serialization=True)

-    refiner_cache_path = cache_path + "-refiner"
-    refiner = diffusers.StableDiffusionXLImg2ImgPipeline.from_single_file(
-        "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors",
-        cache_dir=refiner_cache_path,
-    )
-    refiner.save_pretrained(refiner_cache_path, safe_serialization=True)
-

 def build_image():
    """
@@ -125,11 +118,7 @@ def build_image():
    loras = config.get("loras")
    if loras is not None:
        for lora in loras:
-            download_file(
-                url=lora["url"],
-                file_name=lora["name"],
-                file_path=BASE_CACHE_PATH_LORA,
-            )
+            download_file(url=lora["url"], file_name=lora["name"], file_path=BASE_CACHE_PATH_LORA)

    textual_inversions = config.get("textual_inversions")
    if textual_inversions is not None:
--- a/app/stable_diffusion_xl.py
+++ b/app/stable_diffusion_xl.py
@@ -32,36 +32,19 @@ class SDXLTxt2Img:
        else:
            print(f"The directory '{self.cache_path}' does not exist.")

-        self.pipe = diffusers.AutoPipelineForText2Image.from_pretrained(
+        self.pipe = diffusers.DiffusionPipeline.from_pretrained(
            self.cache_path,
            torch_dtype=torch.float16,
            use_safetensors=True,
-            variant="fp16",
        )

-        self.refiner_cache_path = self.cache_path + "-refiner"
-        self.refiner = diffusers.StableDiffusionXLImg2ImgPipeline.from_pretrained(
-            self.refiner_cache_path,
+        self.upscaler_cache_path = self.cache_path
+        self.upscaler = diffusers.StableDiffusionXLImg2ImgPipeline.from_pretrained(
+            self.upscaler_cache_path,
            torch_dtype=torch.float16,
            use_safetensors=True,
-            variant="fp16",
        )

-        # controlnets = config.get("controlnets")
-        # if controlnets is not None:
-        #     for controlnet in controlnets:
-        #         path = os.path.join(BASE_CACHE_PATH_CONTROLNET, controlnet["name"])
-        #         controlnet = diffusers.ControlNetModel.from_pretrained(path, torch_dtype=torch.float16)
-        #         self.controlnet_pipe = diffusers.StableDiffusionControlNetPipeline.from_pretrained(
-        #             self.cache_path,
-        #             controlnet=controlnet,
-        #             custom_pipeline="lpw_stable_diffusion",
-        #             scheduler=self.pipe.scheduler,
-        #             vae=self.pipe.vae,
-        #             torch_dtype=torch.float16,
-        #             use_safetensors=True,
-        #         )
-
    def _count_token(self, p: str, n: str) -> int:
        """
        Count the number of tokens in the prompt and negative prompt.
@@ -107,63 +90,35 @@ class SDXLTxt2Img:

        generator = torch.Generator("cuda").manual_seed(seed)
        self.pipe.to("cuda")
+        self.pipe.enable_vae_tiling()
+        self.pipe.enable_xformers_memory_efficient_attention()
        generated_images = self.pipe(
            prompt=prompt,
            negative_prompt=n_prompt,
+            guidance_scale=7,
            height=height,
            width=width,
            generator=generator,
+            num_inference_steps=steps,
        ).images
-        base_images = generated_images

+        if use_upscaler:
+            base_images = generated_images
            for image in base_images:
                image = self._resize_image(image=image, scale_factor=2)
-            self.refiner.to("cuda")
-            refined_images = self.refiner(
+                self.upscaler.to("cuda")
+                self.upscaler.enable_vae_tiling()
+                self.upscaler.enable_xformers_memory_efficient_attention()
+                upscaled_images = self.upscaler(
                    prompt=prompt,
                    negative_prompt=n_prompt,
                    num_inference_steps=steps,
-                strength=0.1,
-                # guidance_scale=7.5,
+                    strength=0.3,
+                    guidance_scale=7,
                    generator=generator,
                    image=image,
                ).images
-        generated_images.extend(refined_images)
-        base_images = refined_images
-
-        """
-        Fix the generated images by the control_v11f1e_sd15_tile when `fix_by_controlnet_tile` is `True`.
-        https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile
-        """
-        # if fix_by_controlnet_tile:
-        #     max_embeddings_multiples = self._count_token(p=prompt, n=n_prompt)
-        #     self.controlnet_pipe.to("cuda")
-        #     self.controlnet_pipe.enable_vae_tiling()
-        #     self.controlnet_pipe.enable_xformers_memory_efficient_attention()
-        #     for image in base_images:
-        #         image = self._resize_image(image=image, scale_factor=2)
-        #         with torch.autocast("cuda"):
-        #             fixed_by_controlnet = self.controlnet_pipe(
-        #                 prompt=prompt * batch_size,
-        #                 negative_prompt=n_prompt * batch_size,
-        #                 num_inference_steps=steps,
-        #                 strength=0.3,
-        #                 guidance_scale=7.5,
-        #                 max_embeddings_multiples=max_embeddings_multiples,
-        #                 generator=generator,
-        #                 image=image,
-        #             ).images
-        #     generated_images.extend(fixed_by_controlnet)
-        #     base_images = fixed_by_controlnet
-
-        # if use_upscaler:
-        #     upscaled = self._upscale(
-        #         base_images=base_images,
-        #         half_precision=False,
-        #         tile=700,
-        #         upscaler=upscaler,
-        #     )
-        #     generated_images.extend(upscaled)
+            generated_images.extend(upscaled_images)

        image_output = []
        for image in generated_images:
--- a/cmd/sd15_txt2img.py
+++ b/cmd/sd15_txt2img.py
@@ -1,5 +1,4 @@
 import time
-
 import modal
 import util