diff --git a/Makefile b/Makefile index e2e07fc..f7631e8 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,17 @@ img_by_sd15_txt2img: --fix-by-controlnet-tile "True" \ --output-format "avif" +img_by_sd15_img2img: + cd ./sdcli && modal run sd15_img2img.py \ + --prompt "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" \ + --n-prompt "" \ + --samples 1 \ + --steps 30 \ + --upscaler "RealESRGAN_x2plus" \ + --use-face-enhancer "False" \ + --fix-by-controlnet-tile "True" \ + --output-format "avif" \ + --base-image-url "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" img_by_sdxl_txt2img: cd ./sdcli && modal run sdxl_txt2img.py \ diff --git a/README.md b/README.md index 0ff0bf6..85e88be 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This is a Diffusers-based script for running Stable Diffusion on [Modal](https:/ ## Features -1. Image generation using txt2img +1. Image generation using txt2img or img2img. ![](assets/20230902_tile_imgs.png) 2. Upscaling @@ -53,20 +53,21 @@ Images are generated and output to the `outputs/` directory. ```txt . -├── .env # Secrets manager +├── .env # Secrets manager ├── Makefile ├── README.md -├── sdcli/ # A directory with scripts to run inference. -│   ├── outputs/ # Images are outputted this directory. -│   ├── sd15_txt2img.py # A script to run sd15_txt2img inference. -│   ├── sdxl_txt2img.py # A script to run sdxl_txt2img inference. +├── sdcli/ # A directory with scripts to run inference. +│   ├── outputs/ # Images are outputted this directory. +│   ├── sd15_img2img.py # A script to run sd15_img2img inference. +│   ├── sd15_txt2img.py # A script to run sd15_txt2img inference. +│   ├── sdxl_txt2img.py # A script to run sdxl_txt2img inference. │   └── util.py -└── setup_files/ # A directory with config files. - ├── __main__.py # A main script to run inference. - ├── Dockerfile # To build a base image. - ├── config.yml # To set a model, vae and some tools. +└── setup_files/ # A directory with config files. + ├── __main__.py # A main script to run inference. + ├── Dockerfile # To build a base image. + ├── config.yml # To set a model, vae and some tools. ├── requirements.txt - ├── setup.py # Build an application to deploy on Modal. + ├── setup.py # Build an application to deploy on Modal. ├── stable_diffusion_1_5.py # There is a class to run inference about sd15. └── stable_diffusion_xl.py # There is a class to run inference about sdxl. ``` diff --git a/README_ja.md b/README_ja.md index 358dc9e..992b4c7 100644 --- a/README_ja.md +++ b/README_ja.md @@ -4,7 +4,7 @@ ## このスクリプトでできること -1. txt2imgによる画像生成ができます。 +1. txt2imgまたはimt2imgによる画像生成ができます。 ![txt2imgでの生成画像例](assets/20230902_tile_imgs.png) @@ -53,20 +53,21 @@ modal token new ```txt . -├── .env # Secrets manager +├── .env # Secrets manager ├── Makefile ├── README.md -├── sdcli/ # A directory with scripts to run inference. -│   ├── outputs/ # Images are outputted this directory. -│   ├── sd15_txt2img.py # A script to run sd15_txt2img inference. -│   ├── sdxl_txt2img.py # A script to run sdxl_txt2img inference. +├── sdcli/ # A directory with scripts to run inference. +│   ├── outputs/ # Images are outputted this directory. +│   ├── sd15_img2img.py # A script to run sd15_img2img inference. +│   ├── sd15_txt2img.py # A script to run sd15_txt2img inference. +│   ├── sdxl_txt2img.py # A script to run sdxl_txt2img inference. │   └── util.py -└── setup_files/ # A directory with config files. - ├── __main__.py # A main script to run inference. - ├── Dockerfile # To build a base image. - ├── config.yml # To set a model, vae and some tools. +└── setup_files/ # A directory with config files. + ├── __main__.py # A main script to run inference. + ├── Dockerfile # To build a base image. + ├── config.yml # To set a model, vae and some tools. ├── requirements.txt - ├── setup.py # Build an application to deploy on Modal. + ├── setup.py # Build an application to deploy on Modal. ├── stable_diffusion_1_5.py # There is a class to run inference about sd15. └── stable_diffusion_xl.py # There is a class to run inference about sdxl. ``` diff --git a/sdcli/sd15_img2img.py b/sdcli/sd15_img2img.py new file mode 100644 index 0000000..34771fd --- /dev/null +++ b/sdcli/sd15_img2img.py @@ -0,0 +1,58 @@ +import time + +import modal +import util + +stub = modal.Stub("run-stable-diffusion-cli") +stub.run_inference = modal.Function.from_name("stable-diffusion-cli", "SD15.run_img2img_inference") + + +@stub.local_entrypoint() +def main( + prompt: str, + n_prompt: str, + samples: int = 5, + batch_size: int = 1, + steps: int = 20, + seed: int = -1, + upscaler: str = "", + use_face_enhancer: str = "False", + fix_by_controlnet_tile: str = "False", + output_format: str = "png", + base_image_url: str = "", +): + """ + This function is the entrypoint for the Runway CLI. + The function pass the given prompt to StableDiffusion on Modal, + gets back a list of images and outputs images to local. + """ + directory = util.make_directory() + seed_generated = seed + for i in range(samples): + if seed == -1: + seed_generated = util.generate_seed() + start_time = time.time() + images = stub.run_inference.remote( + prompt=prompt, + n_prompt=n_prompt, + batch_size=batch_size, + steps=steps, + seed=seed_generated, + upscaler=upscaler, + use_face_enhancer=use_face_enhancer == "True", + fix_by_controlnet_tile=fix_by_controlnet_tile == "True", + output_format=output_format, + base_image_url=base_image_url, + ) + util.save_images(directory, images, seed_generated, i, output_format) + total_time = time.time() - start_time + print(f"Sample {i} took {total_time:.3f}s ({(total_time)/len(images):.3f}s / image).") + + prompts: dict[str, int | str] = { + "prompt": prompt, + "n_prompt": n_prompt, + "samples": samples, + "batch_size": batch_size, + "steps": steps, + } + util.save_prompts(prompts) diff --git a/sdcli/sd15_txt2img.py b/sdcli/sd15_txt2img.py index 26d876f..64131f4 100644 --- a/sdcli/sd15_txt2img.py +++ b/sdcli/sd15_txt2img.py @@ -4,7 +4,7 @@ import modal import util stub = modal.Stub("run-stable-diffusion-cli") -stub.run_inference = modal.Function.from_name("stable-diffusion-cli", "SD15Txt2Img.run_inference") +stub.run_inference = modal.Function.from_name("stable-diffusion-cli", "SD15.run_txt2img_inference") @stub.local_entrypoint() diff --git a/setup_files/__main__.py b/setup_files/__main__.py index 2d88a7b..48130f2 100644 --- a/setup_files/__main__.py +++ b/setup_files/__main__.py @@ -7,7 +7,7 @@ from setup import stub @stub.function(gpu="A10G") def main(): - stable_diffusion_1_5.SD15Txt2Img + stable_diffusion_1_5.SD15 stable_diffusion_xl.SDXLTxt2Img diff --git a/setup_files/stable_diffusion_1_5.py b/setup_files/stable_diffusion_1_5.py index 8b55529..741046a 100644 --- a/setup_files/stable_diffusion_1_5.py +++ b/setup_files/stable_diffusion_1_5.py @@ -18,9 +18,9 @@ from setup import ( gpu="A10G", secrets=[Secret.from_dotenv(__file__)], ) -class SD15Txt2Img: +class SD15: """ - A class that wraps the Stable Diffusion pipeline and scheduler. + SD15 is a class that runs inference using Stable Diffusion 1.5. """ def __enter__(self): @@ -50,6 +50,7 @@ class SD15Txt2Img: self.cache_path, subfolder="scheduler", ) + # self.pipe.scheduler = diffusers.LCMScheduler.from_config(self.pipe.scheduler.config) vae = config.get("vae") if vae is not None: @@ -121,7 +122,7 @@ class SD15Txt2Img: return max_embeddings_multiples @method() - def run_inference( + def run_txt2img_inference( self, prompt: str, n_prompt: str, @@ -148,7 +149,7 @@ class SD15Txt2Img: self.pipe.enable_xformers_memory_efficient_attention() with torch.autocast("cuda"): generated_images = self.pipe( - prompt * batch_size, + prompt=prompt * batch_size, negative_prompt=n_prompt * batch_size, height=height, width=width, @@ -202,6 +203,87 @@ class SD15Txt2Img: return image_output + @method() + def run_img2img_inference( + self, + prompt: str, + n_prompt: str, + batch_size: int = 1, + steps: int = 30, + seed: int = 1, + upscaler: str = "", + use_face_enhancer: bool = False, + fix_by_controlnet_tile: bool = False, + output_format: str = "png", + base_image_url: str = "", + ) -> list[bytes]: + """ + Runs the Stable Diffusion pipeline on the given prompt and outputs images. + """ + import pillow_avif # noqa: F401 + import torch + from diffusers.utils import load_image + + max_embeddings_multiples = self._count_token(p=prompt, n=n_prompt) + generator = torch.Generator("cuda").manual_seed(seed) + self.pipe.to("cuda") + self.pipe.enable_vae_tiling() + self.pipe.enable_xformers_memory_efficient_attention() + with torch.autocast("cuda"): + generated_images = self.pipe( + prompt=prompt * batch_size, + negative_prompt=n_prompt * batch_size, + num_inference_steps=steps, + guidance_scale=7.5, + max_embeddings_multiples=max_embeddings_multiples, + generator=generator, + image=load_image(base_image_url), + ).images + + base_images = generated_images + + """ + Fix the generated images by the control_v11f1e_sd15_tile when `fix_by_controlnet_tile` is `True`. + https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile + """ + if fix_by_controlnet_tile: + self.controlnet_pipe.to("cuda") + self.controlnet_pipe.enable_vae_tiling() + self.controlnet_pipe.enable_xformers_memory_efficient_attention() + for image in base_images: + image = self._resize_image(image=image, scale_factor=2) + with torch.autocast("cuda"): + fixed_by_controlnet = self.controlnet_pipe( + prompt=prompt * batch_size, + negative_prompt=n_prompt * batch_size, + num_inference_steps=steps, + strength=0.3, + guidance_scale=7.5, + max_embeddings_multiples=max_embeddings_multiples, + generator=generator, + image=image, + ).images + generated_images.extend(fixed_by_controlnet) + base_images = fixed_by_controlnet + + if upscaler != "": + upscaled = self._upscale( + base_images=base_images, + half_precision=False, + tile=700, + upscaler=upscaler, + use_face_enhancer=use_face_enhancer, + ) + generated_images.extend(upscaled) + + image_output = [] + for image in generated_images: + with io.BytesIO() as buf: + image.save(buf, format=output_format) + image_output.append(buf.getvalue()) + + return image_output + def _resize_image(self, image: PIL.Image.Image, scale_factor: int) -> PIL.Image.Image: image = image.convert("RGB") width, height = image.size