From 04d525591202864a133d6a62e04e77362dcd9c60 Mon Sep 17 00:00:00 2001
From: hodanov <1031hoda@gmail.com>
Date: Sun, 26 Nov 2023 12:03:13 +0900
Subject: [PATCH 1/2] Modify setup.py to use a safetensors file.

---
 setup_files/__main__.py       |   7 +--
 setup_files/config.sample.yml |  14 ++---
 setup_files/setup.py          |  20 +++---
 setup_files/txt2img.py        | 111 ++++++++++++----------------------
 4 files changed, 59 insertions(+), 93 deletions(-)

diff --git a/setup_files/__main__.py b/setup_files/__main__.py
index 833fecb..d1ff313 100644
--- a/setup_files/__main__.py
+++ b/setup_files/__main__.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 
 from setup import stub
-from txt2img import new_stable_diffusion
+from txt2img import StableDiffusion
 
 
 @stub.function(gpu="A10G")
 def main():
-    sd = new_stable_diffusion()
-    print(f"Deploy '{sd.__class__.__name__}'.")
+    StableDiffusion
 
 
 if __name__ == "__main__":
-    main()
+    main.local()
diff --git a/setup_files/config.sample.yml b/setup_files/config.sample.yml
index cb6119d..52ec0d8 100644
--- a/setup_files/config.sample.yml
+++ b/setup_files/config.sample.yml
@@ -7,28 +7,28 @@
 ##########
 # You can use a diffusers model and VAE on hugging face.
 model:
-  name: stable-diffusion-2-1
-  repo_id: stabilityai/stable-diffusion-2-1
+  name: stable-diffusion-1-5
+  url: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors
 vae:
   name: sd-vae-ft-mse
-  repo_id: stabilityai/sd-vae-ft-mse
+  url: https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors
 ##########
 # Add LoRA if you want to use one. You can use a download url such as the below.
 # ex)
 # loras:
 #   - name: hogehoge.safetensors
-#     download_url: https://hogehoge/xxxx
+#     url: https://hogehoge/xxxx
 #   - name: fugafuga.safetensors
-#     download_url: https://fugafuga/xxxx
+#     url: https://fugafuga/xxxx
 
 ##########
 # You can use Textual Inversion and ControlNet also. Usage is the same as `loras`.
 # ex)
 # textual_inversions:
 #   - name: hogehoge
-#     download_url: https://hogehoge/xxxx
+#     url: https://hogehoge/xxxx
 #   - name: fugafuga
-#     download_url: https://fugafuga/xxxx
+#     url: https://fugafuga/xxxx
 controlnets:
   - name: control_v11f1e_sd15_tile
     repo_id: lllyasviel/control_v11f1e_sd15_tile
diff --git a/setup_files/setup.py b/setup_files/setup.py
index b7405c9..726bf01 100644
--- a/setup_files/setup.py
+++ b/setup_files/setup.py
@@ -38,26 +38,26 @@ def download_controlnet(name: str, repo_id: str, token: str):
     controlnet.save_pretrained(cache_path, safe_serialization=True)
 
 
-def download_vae(name: str, repo_id: str, token: str):
+def download_vae(name: str, model_url: str, token: str):
     """
     Download a vae.
     """
     cache_path = os.path.join(BASE_CACHE_PATH, name)
-    vae = diffusers.AutoencoderKL.from_pretrained(
-        repo_id,
+    vae = diffusers.AutoencoderKL.from_single_file(
+        pretrained_model_link_or_path=model_url,
         use_auth_token=token,
         cache_dir=cache_path,
     )
     vae.save_pretrained(cache_path, safe_serialization=True)
 
 
-def download_model(name: str, repo_id: str, token: str):
+def download_model(name: str, model_url: str, token: str):
     """
     Download a model.
     """
     cache_path = os.path.join(BASE_CACHE_PATH, name)
-    pipe = diffusers.StableDiffusionPipeline.from_pretrained(
-        repo_id,
+    pipe = diffusers.StableDiffusionPipeline.from_single_file(
+        pretrained_model_link_or_path=model_url,
         use_auth_token=token,
         cache_dir=cache_path,
     )
@@ -77,11 +77,11 @@ def build_image():
 
     model = config.get("model")
     if model is not None:
-        download_model(name=model["name"], repo_id=model["repo_id"], token=token)
+        download_model(name=model["name"], model_url=model["url"], token=token)
 
     vae = config.get("vae")
     if vae is not None:
-        download_vae(name=model["name"], repo_id=vae["repo_id"], token=token)
+        download_vae(name=model["name"], model_url=vae["url"], token=token)
 
     controlnets = config.get("controlnets")
     if controlnets is not None:
@@ -92,7 +92,7 @@ def build_image():
     if loras is not None:
         for lora in loras:
             download_file(
-                url=lora["download_url"],
+                url=lora["url"],
                 file_name=lora["name"],
                 file_path=BASE_CACHE_PATH_LORA,
             )
@@ -101,7 +101,7 @@ def build_image():
     if textual_inversions is not None:
         for textual_inversion in textual_inversions:
             download_file(
-                url=textual_inversion["download_url"],
+                url=textual_inversion["url"],
                 file_name=textual_inversion["name"],
                 file_path=BASE_CACHE_PATH_TEXTUAL_INVERSION,
             )
diff --git a/setup_files/txt2img.py b/setup_files/txt2img.py
index eefecdf..4a0e110 100644
--- a/setup_files/txt2img.py
+++ b/setup_files/txt2img.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import abc
 import io
 import os
 
@@ -9,51 +8,20 @@ import PIL.Image
 import torch
 from modal import Secret, method
 
-from setup import (BASE_CACHE_PATH, BASE_CACHE_PATH_CONTROLNET,
-                   BASE_CACHE_PATH_LORA, BASE_CACHE_PATH_TEXTUAL_INVERSION,
-                   stub)
-
-
-def new_stable_diffusion() -> StableDiffusionInterface:
-    return StableDiffusion()
-
-
-class StableDiffusionInterface(metaclass=abc.ABCMeta):
-    """
-    A StableDiffusionInterface is an interface that will be used for StableDiffusion class creation.
-    """
-
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return hasattr(subclass, "run_inference") and callable(subclass.run_inference)
-
-    @abc.abstractmethod
-    @method()
-    def run_inference(
-        self,
-        prompt: str,
-        n_prompt: str,
-        height: int = 512,
-        width: int = 512,
-        samples: int = 1,
-        batch_size: int = 1,
-        steps: int = 30,
-        seed: int = 1,
-        upscaler: str = "",
-        use_face_enhancer: bool = False,
-        fix_by_controlnet_tile: bool = False,
-    ) -> list[bytes]:
-        """
-        Run inference.
-        """
-        raise NotImplementedError
+from setup import (
+    BASE_CACHE_PATH,
+    BASE_CACHE_PATH_CONTROLNET,
+    BASE_CACHE_PATH_LORA,
+    BASE_CACHE_PATH_TEXTUAL_INVERSION,
+    stub,
+)
 
 
 @stub.cls(
     gpu="A10G",
     secrets=[Secret.from_dotenv(__file__)],
 )
-class StableDiffusion(StableDiffusionInterface):
+class StableDiffusion:
     """
     A class that wraps the Stable Diffusion pipeline and scheduler.
     """
@@ -70,12 +38,11 @@ class StableDiffusion(StableDiffusionInterface):
         else:
             print(f"The directory '{self.cache_path}' does not exist.")
 
-        # torch.cuda.memory._set_allocator_settings("max_split_size_mb:256")
-
         self.pipe = diffusers.StableDiffusionPipeline.from_pretrained(
             self.cache_path,
             custom_pipeline="lpw_stable_diffusion",
             torch_dtype=torch.float16,
+            use_safetensors=True,
         )
 
         # TODO: Add support for other schedulers.
@@ -90,8 +57,8 @@ class StableDiffusion(StableDiffusionInterface):
             self.pipe.vae = diffusers.AutoencoderKL.from_pretrained(
                 self.cache_path,
                 subfolder="vae",
+                use_safetensors=True,
             )
-        self.pipe.to("cuda")
 
         loras = config.get("loras")
         if loras is not None:
@@ -113,7 +80,7 @@ class StableDiffusion(StableDiffusionInterface):
                     print(f"The directory '{path}' does not exist. Need to execute 'modal deploy' first.")
                 self.pipe.load_textual_inversion(path)
 
-        self.pipe.enable_xformers_memory_efficient_attention()
+        self.pipe = self.pipe.to("cuda")
 
         # TODO: Repair the controlnet loading.
         controlnets = config.get("controlnets")
@@ -128,9 +95,9 @@ class StableDiffusion(StableDiffusionInterface):
                     scheduler=self.pipe.scheduler,
                     vae=self.pipe.vae,
                     torch_dtype=torch.float16,
+                    use_safetensors=True,
                 )
-                self.controlnet_pipe.to("cuda")
-                self.controlnet_pipe.enable_xformers_memory_efficient_attention()
+            self.controlnet_pipe = self.controlnet_pipe.to("cuda")
 
     def _count_token(self, p: str, n: str) -> int:
         """
@@ -164,7 +131,6 @@ class StableDiffusion(StableDiffusionInterface):
         n_prompt: str,
         height: int = 512,
         width: int = 512,
-        samples: int = 1,
         batch_size: int = 1,
         steps: int = 30,
         seed: int = 1,
@@ -175,21 +141,21 @@ class StableDiffusion(StableDiffusionInterface):
         """
         Runs the Stable Diffusion pipeline on the given prompt and outputs images.
         """
-
         max_embeddings_multiples = self._count_token(p=prompt, n=n_prompt)
         generator = torch.Generator("cuda").manual_seed(seed)
-        with torch.inference_mode():
-            with torch.autocast("cuda"):
-                generated_images = self.pipe(
-                    prompt * batch_size,
-                    negative_prompt=n_prompt * batch_size,
-                    height=height,
-                    width=width,
-                    num_inference_steps=steps,
-                    guidance_scale=7.5,
-                    max_embeddings_multiples=max_embeddings_multiples,
-                    generator=generator,
-                ).images
+        self.pipe.enable_vae_tiling()
+        self.pipe.enable_xformers_memory_efficient_attention()
+        with torch.autocast("cuda"):
+            generated_images = self.pipe(
+                prompt * batch_size,
+                negative_prompt=n_prompt * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                guidance_scale=7.5,
+                max_embeddings_multiples=max_embeddings_multiples,
+                generator=generator,
+            ).images
 
         base_images = generated_images
 
@@ -198,20 +164,21 @@ class StableDiffusion(StableDiffusionInterface):
         https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile
         """
         if fix_by_controlnet_tile:
+            self.controlnet_pipe.enable_vae_tiling()
+            self.controlnet_pipe.enable_xformers_memory_efficient_attention()
             for image in base_images:
                 image = self._resize_image(image=image, scale_factor=2)
-                with torch.inference_mode():
-                    with torch.autocast("cuda"):
-                        fixed_by_controlnet = self.controlnet_pipe(
-                            prompt=prompt * batch_size,
-                            negative_prompt=n_prompt * batch_size,
-                            num_inference_steps=steps,
-                            strength=0.3,
-                            guidance_scale=7.5,
-                            max_embeddings_multiples=max_embeddings_multiples,
-                            generator=generator,
-                            image=image,
-                        ).images
+                with torch.autocast("cuda"):
+                    fixed_by_controlnet = self.controlnet_pipe(
+                        prompt=prompt * batch_size,
+                        negative_prompt=n_prompt * batch_size,
+                        num_inference_steps=steps,
+                        strength=0.3,
+                        guidance_scale=7.5,
+                        max_embeddings_multiples=max_embeddings_multiples,
+                        generator=generator,
+                        image=image,
+                    ).images
             generated_images.extend(fixed_by_controlnet)
             base_images = fixed_by_controlnet
 

From 1c182cbcef5e27936d2a37d4bc8f7eb61d9b33e0 Mon Sep 17 00:00:00 2001
From: hodanov <1031hoda@gmail.com>
Date: Sun, 26 Nov 2023 12:03:25 +0900
Subject: [PATCH 2/2] Update README.

---
 README.md    | 48 ++++++++++++++++++------------------------------
 README_ja.md | 52 +++++++++++++++++++++-------------------------------
 2 files changed, 39 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index c1bbe36..15044a1 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 [日本語版 README はこちら](README_ja.md)
 
-# Stable Diffusion Modal
+# Stable Diffusion CLI on Modal
 
-This is a Diffusers-based script for running Stable Diffusion on [Modal](https://modal.com/). It can perform txt2img inference and has the ability to increase resolution using ControlNet Tile and Upscaler.
+This is a Diffusers-based script for running Stable Diffusion on [Modal](https://modal.com/). This script has no WebUI and works only from the CLI. It can perform txt2img inference and has the ability to increase resolution using ControlNet Tile and Upscaler.
 
 ## Features
 
@@ -25,13 +25,13 @@ The app requires the following to run:
 
 The `modal-client` is the Python library. In order to install that:
 
-```
+```bash
 pip install modal-client
 ```
 
 And you need a modal token to use this script:
 
-```
+```bash
 modal token new
 ```
 
@@ -51,7 +51,7 @@ Images are generated and output to the `outputs/` directory.
 
 ## Directory structure
 
-```
+```txt
 .
 ├── .env                    # Secrets manager
 ├── Makefile
@@ -73,7 +73,7 @@ Images are generated and output to the `outputs/` directory.
 
 ### 1. `git clone` the repository
 
-```
+```bash
 git clone https://github.com/hodanov/stable-diffusion-modal.git
 cd stable-diffusion-modal
 ```
@@ -84,53 +84,41 @@ Hugging Add hugging_face_token to .env file.
 
 This script downloads and uses a model from HuggingFace, but if you want to use a model in a private repository, you will need to set this environment variable.
 
-```
+```txt
 HUGGING_FACE_TOKEN="Write your hugging face token here."
 ```
 
 ### 3. Add the model to ./setup_files/config.yml
 
-Add the model used for inference. VAE, LoRA, and Textual Inversion are also configurable.
+Add the model used for inference. Safetensors files can be used as is, with no conversion required. VAE, LoRA, and Textual Inversion are also configurable.
 
-```
+```yml
 # ex)
 model:
-  name: stable-diffusion-2-1
-  repo_id: stabilityai/stable-diffusion-2-1
+  name: stable-diffusion-1-5
+  url: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors # Specify the URL of the safetensors file.
 vae:
   name: sd-vae-ft-mse
-  repo_id: stabilityai/sd-vae-ft-mse
+  url: https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors
 controlnets:
   - name: control_v11f1e_sd15_tile
     repo_id: lllyasviel/control_v11f1e_sd15_tile
 ```
 
-Use a model configured for Diffusers, such as the one found in [this repository](https://huggingface.co/stabilityai/stable-diffusion-2-1). Files in safetensor format shared by Civitai etc. need to be converted (you can do so with a script in the diffusers official repository).
+To use LoRA or Textual Inversion, configure them as follows.
 
-[https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py)
-
-```
-# Example of using conversion script
-python ./diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py --from_safetensors \
---checkpoint_path="Write the filename of safetensor format here" \
---dump_path="Write the output path here" \
---device='cuda:0'
-```
-
-LoRA and Textual Inversion don't require any conversion and can directly use safetensors files. Add the download link to config.yml as below.
-
-```
+```yml
 # Example
 loras:
   - name: lora_name.safetensors # Specify the LoRA file name. Any name is fine, but the extension `.safetensors` is required.
-    download_url: download_link_here # Specify the download link for the safetensor file.
+    url: download_link_here # Specify the download link for the safetensor file.
 ```
 
 ### 4. Setting prompts
 
 Set the prompt to Makefile.
 
-```
+```makefile
 # ex)
 run:
  cd ./sdcli && modal run txt2img.py \
@@ -150,7 +138,7 @@ run:
 
 Execute the below command. An application will be deployed on Modal.
 
-```
+```bash
 make deploy
 ```
 
@@ -158,7 +146,7 @@ make deploy
 
 The txt2img inference is executed with the following command.
 
-```
+```bash
 make run
 ```
 
diff --git a/README_ja.md b/README_ja.md
index 60e491b..89fdb6d 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -1,12 +1,12 @@
-# Stable Diffusion Modal
+# Stable Diffusion CLI on Modal
 
-[Modal](https://modal.com/)上でStable Diffusionを動かすためのDiffusersベースのスクリプトです。txt2imgの推論を実行することができ、ControlNet TileとUpscalerを利用した高解像度化の機能を備えています。
+[Modal](https://modal.com/)上でStable Diffusionを動かすためのDiffusersベースのスクリプトです。WebUIは無く、CLIでのみ動作します。txt2imgの推論を実行することができ、ControlNet TileとUpscalerを利用した高解像度化の機能を備えています。
 
 ## このスクリプトでできること
 
 1. txt2imgによる画像生成ができます。
 
-![](assets/20230902_tile_imgs.png)
+![txt2imgでの生成画像例](assets/20230902_tile_imgs.png)
 
 2. アップスケーラーとControlNet Tileを利用した高解像度な画像を生成することができます。
 
@@ -27,13 +27,13 @@
 
 `modal-client`はModalをCLIから操作するためのPythonライブラリです。下記のようにインストールします:
 
-```
+```bash
 pip install modal-client
 ```
 
 And you need a modal token to use this script:
 
-```
+```bash
 modal token new
 ```
 
@@ -51,7 +51,7 @@ modal token new
 
 ## ディレクトリ構成
 
-```
+```txt
 .
 ├── .env                    # Secrets manager
 ├── Makefile
@@ -73,7 +73,7 @@ modal token new
 
 ### 1. リポジトリをgit cloneする
 
-```
+```bash
 git clone https://github.com/hodanov/stable-diffusion-modal.git
 cd stable-diffusion-modal
 ```
@@ -84,53 +84,43 @@ Hugging FaceのトークンをHUGGING_FACE_TOKENに記入します。
 
 このスクリプトはHuggingFaceからモデルをダウンロードして使用しますが、プライベートリポジトリにあるモデルを参照する場合、この環境変数の設定が必要です。
 
-```
+```txt
 HUGGING_FACE_TOKEN="ここにHuggingFaceのトークンを記載する"
 ```
 
 ### 3. ./setup_files/config.ymlを設定する
 
-推論に使うモデルを設定します。VAE、LoRA、Textual Inversionも設定可能です。
+推論に使うモデルを設定します。Safetensorsファイルをそのまま利用します。VAE、LoRA、Textual Inversionも設定可能です。
 
-```
+下記のように、nameにモデル名、urlにSafetensorsファイルがあるURLを指定します。
+
+```yml
 # 設定例
 model:
-  name: stable-diffusion-2-1 # モデル名を指定
-  repo_id: stabilityai/stable-diffusion-2-1 # リポジトリのID（「プロファイル名/モデル名」の形で指定）
+  name: stable-diffusion-1-5
+  url: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors # Specify URL for the safetensor file.
 vae:
   name: sd-vae-ft-mse
-  repo_id: stabilityai/sd-vae-ft-mse
+  url: https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors
 controlnets:
   - name: control_v11f1e_sd15_tile
     repo_id: lllyasviel/control_v11f1e_sd15_tile
 ```
 
-ModelとVAEは[こちらのリポジトリ](https://huggingface.co/stabilityai/stable-diffusion-2-1)にあるような、Diffusersのために構成されたモデルを利用します。Civitaiなどで共有されているsafetensors形式のファイルは変換が必要です（diffusersの公式リポジトリにあるスクリプトで変換できます）。
+LoRAは下記のように指定します。
 
-[変換スクリプト](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py)
-
-LoRAとTextual Inversionは変換不要で、safetensorsファイルをそのまま利用できます。
-
-```
+```yml
 # 設定例
 loras:
   - name: mecha.safetensors # ファイル名を指定。任意の名前で良いが、拡張子`.safetensors`は必須。
-    download_url: https://civitai.com/api/download/models/150907?type=Model&format=SafeTensor # ダウンロードリンクを指定
-```
-
-```
-# 変換スクリプトの使用例
-python ./diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py --from_safetensors \
---checkpoint_path="ここに変換したいsafetensors形式のファイルを指定" \
---dump_path="出力先を指定" \
---device='cuda:0'
+    url: https://civitai.com/api/download/models/150907?type=Model&format=SafeTensor # ダウンロードリンクを指定
 ```
 
 ### 4. Makefileの設定（プロンプトの設定）
 
 プロンプトをMakefileに設定します。
 
-```
+```makefile
 # 設定例
 run:
  cd ./sdcli && modal run txt2img.py \
@@ -160,7 +150,7 @@ run:
 
 下記のコマンドでModal上にアプリケーションが構築されます。
 
-```
+```bash
 make deploy
 ```
 
@@ -168,6 +158,6 @@ make deploy
 
 下記のコマンドでtxt2img推論が実行されます。
 
-```
+```bash
 make run
 ```