From 77ebc71c69827b0fbddf6b811a353554e33600ce Mon Sep 17 00:00:00 2001
From: hodanov <1031hoda@gmail.com>
Date: Sat, 8 Jul 2023 20:33:05 +0900
Subject: [PATCH] Separate setup.py.

---
 Makefile             |   2 +-
 setup_files/main.py  | 289 ++++++++++++++++++++++++++++++++++++++++
 setup_files/setup.py | 304 ++-----------------------------------------
 3 files changed, 299 insertions(+), 296 deletions(-)
 create mode 100644 setup_files/main.py

diff --git a/Makefile b/Makefile
index 38750ff..b2db197 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 deploy:
-	modal deploy ./setup_files/setup.py
+	cd ./setup_files && modal deploy main.py
 
 # `--upscaler` is the name of the upscaler you want to use.
 # You can use the upscalers below:

diff --git a/setup_files/main.py b/setup_files/main.py
new file mode 100644
index 0000000..1374d0f
--- /dev/null
+++ b/setup_files/main.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+import io
+import os
+
+import diffusers
+import PIL.Image
+import torch
+from modal import Secret, method
+from modal.cls import ClsMixin
+
+from setup import (BASE_CACHE_PATH, BASE_CACHE_PATH_CONTROLNET,
+                   BASE_CACHE_PATH_LORA, BASE_CACHE_PATH_TEXTUAL_INVERSION,
+                   stub)
+
+
+@stub.cls(
+    gpu="A10G",
+    secrets=[Secret.from_dotenv(__file__)],
+)
+class StableDiffusion(ClsMixin):
+    """
+    A class that wraps the Stable Diffusion pipeline and scheduler.
+    """
+
+    def __enter__(self):
+        import yaml
+
+        config = {}
+        with open("/config.yml", "r") as file:
+            config = yaml.safe_load(file)
+        self.cache_path = os.path.join(BASE_CACHE_PATH, config["model"]["name"])
+        if os.path.exists(self.cache_path):
+            print(f"The directory '{self.cache_path}' exists.")
+        else:
+            print(f"The directory '{self.cache_path}' does not exist.")
+
+        torch.cuda.memory._set_allocator_settings("max_split_size_mb:256")
+
+        self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
+            self.cache_path,
+            custom_pipeline="lpw_stable_diffusion",
+            torch_dtype=torch.float16,
+        )
+
+        # TODO: Add support for other schedulers.
+        self.pipe.scheduler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(
+            # self.pipe.scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
+            self.cache_path,
+            subfolder="scheduler",
+        )
+
+        vae = config.get("vae")
+        if vae is not None:
+            self.pipe.vae = diffusers.AutoencoderKL.from_pretrained(
+                self.cache_path,
+                subfolder="vae",
+            )
+        self.pipe.to("cuda")
+
+        loras = config.get("loras")
+        if loras is not None:
+            for lora in loras:
+                path = os.path.join(BASE_CACHE_PATH_LORA, lora["name"])
+                if os.path.exists(path):
+                    print(f"The file '{path}' exists.")
+                else:
+                    print(f"The file '{path}' does not exist. Run 'modal deploy' first.")
+                self.pipe.load_lora_weights(".", weight_name=path)
+
+        textual_inversions = config.get("textual_inversions")
+        if textual_inversions is not None:
+            for textual_inversion in textual_inversions:
+                path = os.path.join(BASE_CACHE_PATH_TEXTUAL_INVERSION, textual_inversion["name"])
+                if os.path.exists(path):
+                    print(f"The file '{path}' exists.")
+                else:
+                    print(f"The file '{path}' does not exist. Run 'modal deploy' first.")
+                self.pipe.load_textual_inversion(path)
+
+        self.pipe.enable_xformers_memory_efficient_attention()
+
+        # TODO: Repair the controlnet loading.
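+        # Build a separate pipeline for each ControlNet model listed in
+        # config.yml, sharing the scheduler and VAE of the main pipeline.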
+        controlnets = config.get("controlnets")
+        if controlnets is not None:
+            for controlnet in controlnets:
+                path = os.path.join(BASE_CACHE_PATH_CONTROLNET, controlnet["name"])
+                controlnet_model = diffusers.ControlNetModel.from_pretrained(path, torch_dtype=torch.float16)
+                self.controlnet_pipe = diffusers.StableDiffusionControlNetPipeline.from_pretrained(
+                    self.cache_path,
+                    controlnet=controlnet_model,
+                    custom_pipeline="lpw_stable_diffusion",
+                    scheduler=self.pipe.scheduler,
+                    vae=self.pipe.vae,
+                    torch_dtype=torch.float16,
+                )
+                self.controlnet_pipe.to("cuda")
+                self.controlnet_pipe.enable_xformers_memory_efficient_attention()
+
+    @method()
+    def count_token(self, p: str, n: str) -> int:
+        """
+        Count the tokens in the prompt and negative prompt, and compute the
+        max_embeddings_multiples the pipeline needs to embed them.
+        """
+        from transformers import CLIPTokenizer
+
+        tokenizer = CLIPTokenizer.from_pretrained(
+            self.cache_path,
+            subfolder="tokenizer",
+        )
+        token_size_p = len(tokenizer.tokenize(p))
+        token_size_n = len(tokenizer.tokenize(n))
+        token_size = token_size_p
+        if token_size_p <= token_size_n:
+            token_size = token_size_n
+
+        max_embeddings_multiples = 1
+        max_length = tokenizer.model_max_length - 2
+        if token_size > max_length:
+            max_embeddings_multiples = token_size // max_length + 1
+
+        print(f"token_size: {token_size}, max_embeddings_multiples: {max_embeddings_multiples}")
+
+        return max_embeddings_multiples
+
+    @method()
+    def run_inference(
+        self,
+        prompt: str,
+        n_prompt: str,
+        height: int = 512,
+        width: int = 512,
+        samples: int = 1,
+        batch_size: int = 1,
+        steps: int = 30,
+        seed: int = 1,
+        upscaler: str = "",
+        use_face_enhancer: bool = False,
+        fix_by_controlnet_tile: bool = False,
+    ) -> list[bytes]:
+        """
+        Run the Stable Diffusion pipeline on the given prompt and return the images as PNG bytes.
+        """
+
+        max_embeddings_multiples = self.count_token(p=prompt, n=n_prompt)
+        generator = torch.Generator("cuda").manual_seed(seed)
+        with torch.inference_mode():
+            with torch.autocast("cuda"):
+                generated_images = self.pipe.text2img(
+                    [prompt] * batch_size,
+                    negative_prompt=[n_prompt] * batch_size,
+                    height=height,
+                    width=width,
+                    num_inference_steps=steps,
+                    guidance_scale=7.5,
+                    max_embeddings_multiples=max_embeddings_multiples,
+                    generator=generator,
+                ).images
+
+        base_images = generated_images
+
+        """
+        Fix the generated images with control_v11f1e_sd15_tile when `fix_by_controlnet_tile` is `True`.
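+        The tile ControlNet re-runs diffusion over the enlarged image so that
+        details are regenerated at the higher resolution.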
+        https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile
+        """
+        if fix_by_controlnet_tile:
+            for image in base_images:
+                image = self.resize_image(image=image, scale_factor=2)
+                with torch.inference_mode():
+                    with torch.autocast("cuda"):
+                        fixed_by_controlnet = self.controlnet_pipe(
+                            prompt=[prompt] * batch_size,
+                            negative_prompt=[n_prompt] * batch_size,
+                            num_inference_steps=steps,
+                            strength=0.3,
+                            guidance_scale=7.5,
+                            max_embeddings_multiples=max_embeddings_multiples,
+                            generator=generator,
+                            image=image,
+                        ).images
+                generated_images.extend(fixed_by_controlnet)
+                base_images = fixed_by_controlnet
+
+        if upscaler != "":
+            upscaled = self.upscale(
+                base_images=base_images,
+                half_precision=False,
+                tile=700,
+                upscaler=upscaler,
+                use_face_enhancer=use_face_enhancer,
+            )
+            generated_images.extend(upscaled)
+
+        image_output = []
+        for image in generated_images:
+            with io.BytesIO() as buf:
+                image.save(buf, format="PNG")
+                image_output.append(buf.getvalue())
+
+        return image_output
+
+    @method()
+    def resize_image(self, image: PIL.Image.Image, scale_factor: int) -> PIL.Image.Image:
+        """
+        Resize the image by the given scale factor using LANCZOS resampling.
+        """
+        image = image.convert("RGB")
+        width, height = image.size
+        img = image.resize((width * scale_factor, height * scale_factor), resample=PIL.Image.LANCZOS)
+        return img
+
+    @method()
+    def upscale(
+        self,
+        base_images: list[PIL.Image.Image],
+        half_precision: bool = False,
+        tile: int = 0,
+        tile_pad: int = 10,
+        pre_pad: int = 0,
+        upscaler: str = "",
+        use_face_enhancer: bool = False,
+    ) -> list[PIL.Image.Image]:
+        """
+        Upscale the generated images with Real-ESRGAN when `upscaler` is specified.
+        The upscaler can be selected from the following list:
+        - `RealESRGAN_x4plus`
+        - `RealESRNet_x4plus`
+        - `RealESRGAN_x4plus_anime_6B`
+        - `RealESRGAN_x2plus`
+        https://github.com/xinntao/Real-ESRGAN
+        """
+        import numpy
+        from basicsr.archs.rrdbnet_arch import RRDBNet
+        from gfpgan import GFPGANer
+        from realesrgan import RealESRGANer
+        from tqdm import tqdm
+
+        model_name = upscaler
+        if model_name == "RealESRGAN_x4plus":
+            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+            netscale = 4
+        elif model_name == "RealESRNet_x4plus":
+            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+            netscale = 4
+        elif model_name == "RealESRGAN_x4plus_anime_6B":
+            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
+            netscale = 4
+        elif model_name == "RealESRGAN_x2plus":
+            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
+            netscale = 2
+        else:
+            raise NotImplementedError("Model name not supported")
+
+        upsampler = RealESRGANer(
+            scale=netscale,
+            model_path=os.path.join(BASE_CACHE_PATH, "esrgan", f"{model_name}.pth"),
+            dni_weight=None,
+            model=upscale_model,
+            tile=tile,
+            tile_pad=tile_pad,
+            pre_pad=pre_pad,
+            half=half_precision,
+            gpu_id=None,
+        )
+
+        if use_face_enhancer:
+            face_enhancer = GFPGANer(
+                model_path=os.path.join(BASE_CACHE_PATH, "esrgan", "GFPGANv1.3.pth"),
+                upscale=netscale,
+                arch="clean",
+                channel_multiplier=2,
+                bg_upsampler=upsampler,
+            )
+
+        upscaled_imgs = []
+        with tqdm(total=len(base_images)) as progress_bar:
+            for img in base_images:
+                img = numpy.array(img)
+                if use_face_enhancer:
+                    _, _, enhance_result = face_enhancer.enhance(
+                        img,
+                        has_aligned=False,
+                        only_center_face=False,
+                        paste_back=True,
+                    )
+                else:
+                    enhance_result, _ = upsampler.enhance(img)
+
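+                # Convert the upscaled numpy array back to a PIL image.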
+                upscaled_imgs.append(PIL.Image.fromarray(enhance_result))
+                progress_bar.update(1)
+
+        return upscaled_imgs

diff --git a/setup_files/setup.py b/setup_files/setup.py
index 946604c..b7405c9 100644
--- a/setup_files/setup.py
+++ b/setup_files/setup.py
@@ -1,13 +1,9 @@
 from __future__ import annotations
 
-import io
 import os
-from urllib.request import Request, urlopen
 
 import diffusers
-import yaml
-from modal import Image, Mount, Secret, Stub, method
-from modal.cls import ClsMixin
+from modal import Image, Mount, Secret, Stub
 
 BASE_CACHE_PATH = "/vol/cache"
 BASE_CACHE_PATH_LORA = "/vol/cache/lora"
@@ -19,6 +15,8 @@ def download_file(url, file_name, file_path):
     """
     Download files.
     """
+    from urllib.request import Request, urlopen
+
     req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
     downloaded = urlopen(req).read()
     dir_names = os.path.join(file_path, file_name)
@@ -70,6 +68,8 @@ def build_image():
     """
     Build the Docker image.
     """
+    import yaml
+
     token = os.environ["HUGGING_FACE_TOKEN"]
     config = {}
     with open("/config.yml", "r") as file:
@@ -109,302 +109,16 @@ def build_image():
 
 stub = Stub("stable-diffusion-cli")
 base_stub = Image.from_dockerfile(
-    path="./setup_files/Dockerfile",
-    context_mount=Mount.from_local_file("./setup_files/requirements.txt"),
+    path="Dockerfile",
+    context_mount=Mount.from_local_file("requirements.txt"),
 )
 stub.image = base_stub.extend(
     dockerfile_commands=[
         "FROM base",
-        "COPY ./config.yml /",
+        "COPY config.yml /",
     ],
-    context_mount=Mount.from_local_file("./setup_files/config.yml"),
+    context_mount=Mount.from_local_file("config.yml"),
 ).run_function(
     build_image,
     secrets=[Secret.from_dotenv(__file__)],
 )
-
-
-@stub.cls(
-    gpu="A10G",
-    secrets=[Secret.from_dotenv(__file__)],
-)
-class StableDiffusion(ClsMixin):
-    """
-    A class that wraps the Stable Diffusion pipeline and scheduler.
-    """
-
-    def __enter__(self):
-        import torch
-
-        config = {}
-        with open("/config.yml", "r") as file:
-            config = yaml.safe_load(file)
-        self.cache_path = os.path.join(BASE_CACHE_PATH, config["model"]["name"])
-        if os.path.exists(self.cache_path):
-            print(f"The directory '{self.cache_path}' exists.")
-        else:
-            print(f"The directory '{self.cache_path}' does not exist.")
-
-        torch.cuda.memory._set_allocator_settings("max_split_size_mb:256")
-
-        self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
-            self.cache_path,
-            custom_pipeline="lpw_stable_diffusion",
-            torch_dtype=torch.float16,
-        )
-
-        # TODO: Add support for other schedulers.
-        self.pipe.scheduler = diffusers.EulerAncestralDiscreteScheduler.from_pretrained(
-            # self.pipe.scheduler = diffusers.DPMSolverMultistepScheduler.from_pretrained(
-            self.cache_path,
-            subfolder="scheduler",
-        )
-
-        vae = config.get("vae")
-        if vae is not None:
-            self.pipe.vae = diffusers.AutoencoderKL.from_pretrained(
-                self.cache_path,
-                subfolder="vae",
-            )
-        self.pipe.to("cuda")
-
-        loras = config.get("loras")
-        if loras is not None:
-            for lora in loras:
-                path = os.path.join(BASE_CACHE_PATH_LORA, lora["name"])
-                if os.path.exists(path):
-                    print(f"The directory '{path}' exists.")
-                else:
-                    print(f"The directory '{path}' does not exist. Download it...")
-                    download_file(lora["download_url"], lora["name"], BASE_CACHE_PATH_LORA)
-                self.pipe.load_lora_weights(".", weight_name=path)
-
-        textual_inversions = config.get("textual_inversions")
-        if textual_inversions is not None:
-            for textual_inversion in textual_inversions:
-                path = os.path.join(BASE_CACHE_PATH_TEXTUAL_INVERSION, textual_inversion["name"])
-                if os.path.exists(path):
-                    print(f"The directory '{path}' exists.")
-                else:
-                    print(f"The directory '{path}' does not exist. Download it...")
-                    download_file(
-                        textual_inversion["download_url"],
-                        textual_inversion["name"],
-                        BASE_CACHE_PATH_TEXTUAL_INVERSION,
-                    )
-                self.pipe.load_textual_inversion(path)
-
-        self.pipe.enable_xformers_memory_efficient_attention()
-
-        # TODO: Repair the controlnet loading.
-        controlnets = config.get("controlnets")
-        if controlnets is not None:
-            for controlnet in controlnets:
-                path = os.path.join(BASE_CACHE_PATH_CONTROLNET, controlnet["name"])
-                controlnet = diffusers.ControlNetModel.from_pretrained(path, torch_dtype=torch.float16)
-                self.controlnet_pipe = diffusers.StableDiffusionControlNetPipeline.from_pretrained(
-                    self.cache_path,
-                    controlnet=controlnet,
-                    custom_pipeline="lpw_stable_diffusion",
-                    scheduler=self.pipe.scheduler,
-                    vae=self.pipe.vae,
-                    torch_dtype=torch.float16,
-                )
-                self.controlnet_pipe.to("cuda")
-                self.controlnet_pipe.enable_xformers_memory_efficient_attention()
-
-    @method()
-    def count_token(self, p: str, n: str) -> int:
-        """
-        Count the number of tokens in the prompt and negative prompt.
-        """
-        from transformers import CLIPTokenizer
-
-        tokenizer = CLIPTokenizer.from_pretrained(
-            self.cache_path,
-            subfolder="tokenizer",
-        )
-        token_size_p = len(tokenizer.tokenize(p))
-        token_size_n = len(tokenizer.tokenize(n))
-        token_size = token_size_p
-        if token_size_p <= token_size_n:
-            token_size = token_size_n
-
-        max_embeddings_multiples = 1
-        max_length = tokenizer.model_max_length - 2
-        if token_size > max_length:
-            max_embeddings_multiples = token_size // max_length + 1
-
-        print(f"token_size: {token_size}, max_embeddings_multiples: {max_embeddings_multiples}")
-
-        return max_embeddings_multiples
-
-    @method()
-    def run_inference(
-        self,
-        prompt: str,
-        n_prompt: str,
-        height: int = 512,
-        width: int = 512,
-        samples: int = 1,
-        batch_size: int = 1,
-        steps: int = 30,
-        seed: int = 1,
-        upscaler: str = "",
-        use_face_enhancer: bool = False,
-        fix_by_controlnet_tile: bool = False,
-    ) -> list[bytes]:
-        """
-        Runs the Stable Diffusion pipeline on the given prompt and outputs images.
-        """
-        import torch
-
-        max_embeddings_multiples = self.count_token(p=prompt, n=n_prompt)
-        generator = torch.Generator("cuda").manual_seed(seed)
-        with torch.inference_mode():
-            with torch.autocast("cuda"):
-                generated_images = self.pipe.text2img(
-                    prompt * batch_size,
-                    negative_prompt=n_prompt * batch_size,
-                    height=height,
-                    width=width,
-                    num_inference_steps=steps,
-                    guidance_scale=7.5,
-                    max_embeddings_multiples=max_embeddings_multiples,
-                    generator=generator,
-                ).images
-
-        base_images = generated_images
-
-        """
-        Fix the generated images by the control_v11f1e_sd15_tile when `fix_by_controlnet_tile` is `True`.
-        https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile
-        """
-        if fix_by_controlnet_tile:
-            for image in base_images:
-                image = self.resize_image(image=image, scale_factor=2)
-                with torch.inference_mode():
-                    with torch.autocast("cuda"):
-                        fixed_by_controlnet = self.controlnet_pipe(
-                            prompt=prompt * batch_size,
-                            negative_prompt=n_prompt * batch_size,
-                            num_inference_steps=steps,
-                            strength=0.3,
-                            guidance_scale=7.5,
-                            max_embeddings_multiples=max_embeddings_multiples,
-                            generator=generator,
-                            image=image,
-                        ).images
-                generated_images.extend(fixed_by_controlnet)
-                base_images = fixed_by_controlnet
-
-        if upscaler != "":
-            upscaled = self.upscale(
-                base_images=base_images,
-                half_precision=False,
-                tile=700,
-                upscaler=upscaler,
-                use_face_enhancer=use_face_enhancer,
-            )
-            generated_images.extend(upscaled)
-
-        image_output = []
-        for image in generated_images:
-            with io.BytesIO() as buf:
-                image.save(buf, format="PNG")
-                image_output.append(buf.getvalue())
-
-        return image_output
-
-    @method()
-    def resize_image(self, image: Image.Image, scale_factor: int) -> Image.Image:
-        from PIL import Image
-
-        image = image.convert("RGB")
-        width, height = image.size
-        img = image.resize((width * scale_factor, height * scale_factor), resample=Image.LANCZOS)
-        return img
-
-    @method()
-    def upscale(
-        self,
-        base_images: list[Image.Image],
-        half_precision: bool = False,
-        tile: int = 0,
-        tile_pad: int = 10,
-        pre_pad: int = 0,
-        upscaler: str = "",
-        use_face_enhancer: bool = False,
-    ) -> list[Image.Image]:
-        """
-        Upscale the generated images by the upscaler when `upscaler` is selected.
-        The upscaler can be selected from the following list:
-        - `RealESRGAN_x4plus`
-        - `RealESRNet_x4plus`
-        - `RealESRGAN_x4plus_anime_6B`
-        - `RealESRGAN_x2plus`
-        https://github.com/xinntao/Real-ESRGAN
-        """
-        import numpy
-        from basicsr.archs.rrdbnet_arch import RRDBNet
-        from PIL import Image
-        from realesrgan import RealESRGANer
-        from tqdm import tqdm
-
-        model_name = upscaler
-        if model_name == "RealESRGAN_x4plus":
-            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-            netscale = 4
-        elif model_name == "RealESRNet_x4plus":
-            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-            netscale = 4
-        elif model_name == "RealESRGAN_x4plus_anime_6B":
-            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
-            netscale = 4
-        elif model_name == "RealESRGAN_x2plus":
-            upscale_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
-            netscale = 2
-        else:
-            raise NotImplementedError("Model name not supported")
-
-        upsampler = RealESRGANer(
-            scale=netscale,
-            model_path=os.path.join(BASE_CACHE_PATH, "esrgan", f"{model_name}.pth"),
-            dni_weight=None,
-            model=upscale_model,
-            tile=tile,
-            tile_pad=tile_pad,
-            pre_pad=pre_pad,
-            half=half_precision,
-            gpu_id=None,
-        )
-
-        from gfpgan import GFPGANer
-
-        if use_face_enhancer:
-            face_enhancer = GFPGANer(
-                model_path=os.path.join(BASE_CACHE_PATH, "esrgan", "GFPGANv1.3.pth"),
-                upscale=netscale,
-                arch="clean",
-                channel_multiplier=2,
-                bg_upsampler=upsampler,
-            )
-
-        upscaled_imgs = []
-        with tqdm(total=len(base_images)) as progress_bar:
-            for img in base_images:
-                img = numpy.array(img)
-                if use_face_enhancer:
-                    _, _, enhance_result = face_enhancer.enhance(
-                        img,
-                        has_aligned=False,
-                        only_center_face=False,
-                        paste_back=True,
-                    )
-                else:
-                    enhance_result, _ = upsampler.enhance(img)
-
-                upscaled_imgs.append(Image.fromarray(enhance_result))
-                progress_bar.update(1)
-
-        return upscaled_imgs