fix: log and async task issues with modal script

This commit is contained in:
bennykok 2024-02-01 13:42:01 +08:00
parent 38fea1e79f
commit d8951df35f

View File

@ -37,7 +37,9 @@ if not deploy_test:
# dockerfile_image = Image.from_dockerfile(f"{current_directory}/Dockerfile", context_mount=Mount.from_local_dir(f"{current_directory}/data", remote_path="/data")) # dockerfile_image = Image.from_dockerfile(f"{current_directory}/Dockerfile", context_mount=Mount.from_local_dir(f"{current_directory}/data", remote_path="/data"))
dockerfile_image = ( dockerfile_image = (
modal.Image.debian_slim() modal.Image.debian_slim(
python_version="3.11",
)
.apt_install("git", "wget") .apt_install("git", "wget")
.pip_install( .pip_install(
"git+https://github.com/modal-labs/asgiproxy.git", "httpx", "tqdm" "git+https://github.com/modal-labs/asgiproxy.git", "httpx", "tqdm"
@ -83,7 +85,7 @@ if not deploy_test:
# Time to wait between API check attempts in milliseconds # Time to wait between API check attempts in milliseconds
COMFY_API_AVAILABLE_INTERVAL_MS = 50 COMFY_API_AVAILABLE_INTERVAL_MS = 50
# Maximum number of API check attempts # Maximum number of API check attempts
COMFY_API_AVAILABLE_MAX_RETRIES = 500 COMFY_API_AVAILABLE_MAX_RETRIES = 1000
# Time to wait between poll attempts in milliseconds # Time to wait between poll attempts in milliseconds
COMFY_POLLING_INTERVAL_MS = 250 COMFY_POLLING_INTERVAL_MS = 250
# Maximum number of poll attempts # Maximum number of poll attempts
@ -94,7 +96,8 @@ COMFY_HOST = "127.0.0.1:8188"
async def check_server(url, retries=50, delay=500): async def check_server(url, retries=50, delay=500):
import aiohttp import aiohttp
for i in range(retries): # for i in range(retries):
while True:
try: try:
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.get(url) as response: async with session.get(url) as response:
@ -157,6 +160,7 @@ class ComfyDeployRunner:
async def read_stream(self, stream, isStderr): async def read_stream(self, stream, isStderr):
import time import time
while True: while True:
try:
line = await stream.readline() line = await stream.readline()
if line: if line:
l = line.decode('utf-8').strip() l = line.decode('utf-8').strip()
@ -181,6 +185,9 @@ class ComfyDeployRunner:
}) })
else: else:
break break
except asyncio.CancelledError:
# Handle the cancellation here if needed
break # Break out of the loop on cancellation
@enter() @enter()
async def setup(self): async def setup(self):
@ -197,24 +204,22 @@ class ComfyDeployRunner:
# env={**os.environ, "COLUMNS": "10000"} # env={**os.environ, "COLUMNS": "10000"}
) )
stdout_task = asyncio.create_task(
self.read_stream(self.server_process.stdout, False))
stderr_task = asyncio.create_task(
self.read_stream(self.server_process.stderr, True))
await check_server(
f"http://{COMFY_HOST}",
COMFY_API_AVAILABLE_MAX_RETRIES,
COMFY_API_AVAILABLE_INTERVAL_MS,
)
stdout_task.cancel()
stderr_task.cancel()
@exit() @exit()
async def cleanup(self, exc_type, exc_value, traceback): async def cleanup(self, exc_type, exc_value, traceback):
print(f"comfy-modal - cleanup", exc_type, exc_value, traceback) print(f"comfy-modal - cleanup", exc_type, exc_value, traceback)
# self.server_process.kill() # Get the current event loop
loop = asyncio.get_event_loop()
# Check if the event loop is closed
if loop.is_closed():
print("The event loop is closed.")
else:
try:
self.server_process.terminate()
await self.server_process.wait()
except Exception as e:
print("Issues when cleaning up", e)
print("The event loop is open.")
@method() @method()
async def run(self, input: Input): async def run(self, input: Input):
@ -228,6 +233,7 @@ class ComfyDeployRunner:
stderr_task = asyncio.create_task( stderr_task = asyncio.create_task(
self.read_stream(self.server_process.stderr, True)) self.read_stream(self.server_process.stderr, True))
try:
class TimeoutError(Exception): class TimeoutError(Exception):
pass pass
@ -244,9 +250,18 @@ class ComfyDeployRunner:
signal.signal(signal.SIGALRM, timeout_handler) signal.signal(signal.SIGALRM, timeout_handler)
try: try:
# Set an alarm for some seconds in the future
signal.alarm(run_timeout) # abort the run after run_timeout seconds signal.alarm(run_timeout) # abort the run after run_timeout seconds
ok = await check_server(
f"http://{COMFY_HOST}",
COMFY_API_AVAILABLE_MAX_RETRIES,
COMFY_API_AVAILABLE_INTERVAL_MS,
)
if not ok:
raise Exception("ComfyUI API is not available")
# Build the "started" status payload for this run
data = json.dumps({ data = json.dumps({
"run_id": input.prompt_id, "run_id": input.prompt_id,
"status": "started", "status": "started",
@ -295,6 +310,20 @@ class ComfyDeployRunner:
except TimeoutError: except TimeoutError:
print("Operation timed out") print("Operation timed out")
return {"status": "failed"} return {"status": "failed"}
except Exception as e:
print(f"Unexpected error occurred: {str(e)}")
data = json.dumps({
"run_id": input.prompt_id,
"status": "failed",
"time": datetime.now().isoformat()
}).encode('utf-8')
async with aiohttp.ClientSession() as session:
async with session.post(input.status_endpoint, data=data) as response:
print("response", response)
self.machine_logs.append({
"logs": str(e),
"timestamp": time.time()
})
finally: finally:
signal.alarm(0) signal.alarm(0)
@ -312,9 +341,11 @@ class ComfyDeployRunner:
print("uploaded log_data") print("uploaded log_data")
# print(data) # print(data)
self.machine_logs = [] self.machine_logs = []
finally:
stdout_task.cancel() stdout_task.cancel()
stderr_task.cancel() stderr_task.cancel()
await stdout_task
await stderr_task
return result return result