From 81daf3f67daf1c8edc7e8f85bcbddd8523fbd3b8 Mon Sep 17 00:00:00 2001 From: Prince Canuma Date: Thu, 15 Jan 2026 14:31:00 +0100 Subject: [PATCH] Add prompt enhancement feature to video generation - Introduced `enhance_prompt`, `max_tokens`, and `temperature` parameters in `generate_video` function for improved prompt handling. - Implemented prompt enhancement logic using the new `enhance_t2v` method in the text encoder. - Added command-line arguments for prompt enhancement options. - Created new system prompt files for T2V and I2V generation to guide the enhancement process. --- mlx_video/generate.py | 40 ++- .../ltx/prompts/gemma_i2v_system_prompt.txt | 30 +++ .../ltx/prompts/gemma_t2v_system_prompt.txt | 40 +++ mlx_video/models/ltx/text_encoder.py | 236 ++++++++++++++++-- 4 files changed, 320 insertions(+), 26 deletions(-) create mode 100644 mlx_video/models/ltx/prompts/gemma_i2v_system_prompt.txt create mode 100644 mlx_video/models/ltx/prompts/gemma_t2v_system_prompt.txt diff --git a/mlx_video/generate.py b/mlx_video/generate.py index 4bf889f..4c78bb3 100644 --- a/mlx_video/generate.py +++ b/mlx_video/generate.py @@ -160,6 +160,9 @@ def generate_video( output_path: str = "output.mp4", save_frames: bool = False, verbose: bool = True, + enhance_prompt: bool = False, + max_tokens: int = 512, + temperature: float = 0.7, ): """Generate video from text prompt. @@ -206,6 +209,12 @@ def generate_video( text_encoder.load(model_path=model_path, text_encoder_path=text_encoder_path) mx.eval(text_encoder.parameters()) + # Optionally enhance the prompt + if enhance_prompt: + print(f"{Colors.MAGENTA}✨ Enhancing prompt...{Colors.RESET}") + prompt = text_encoder.enhance_t2v(prompt, max_tokens=max_tokens, temperature=temperature, seed=seed, verbose=verbose) + print(f"{Colors.DIM}Enhanced: {prompt[:150]}{'...' if len(prompt) > 150 else ''}{Colors.RESET}") + text_embeddings, _ = text_encoder(prompt) mx.eval(text_embeddings) @@ -373,7 +382,7 @@ Examples: help="Frames per second for output video (default: 24)" ) parser.add_argument( - "--output", "-o", + "--output-path", type=str, default="output.mp4", help="Output video path (default: output.mp4)" @@ -400,20 +409,27 @@ Examples: action="store_true", help="Verbose output" ) + parser.add_argument( + "--enhance-prompt", + action="store_true", + help="Enhance the prompt using Gemma before generation" + ) + parser.add_argument( + "--max-tokens", + type=int, + default=512, + help="Maximum number of tokens to generate (default: 512)" + ) + parser.add_argument( + "--temperature", + type=float, + default=0.7, + help="Temperature for prompt enhancement (default: 0.7)" + ) args = parser.parse_args() generate_video( - model_repo=args.model_repo, - text_encoder_repo=args.text_encoder_repo, - prompt=args.prompt, - height=args.height, - width=args.width, - num_frames=args.num_frames, - seed=args.seed, - fps=args.fps, - output_path=args.output, - save_frames=args.save_frames, - verbose=args.verbose, + **vars(args) ) diff --git a/mlx_video/models/ltx/prompts/gemma_i2v_system_prompt.txt b/mlx_video/models/ltx/prompts/gemma_i2v_system_prompt.txt new file mode 100644 index 0000000..0d67724 --- /dev/null +++ b/mlx_video/models/ltx/prompts/gemma_i2v_system_prompt.txt @@ -0,0 +1,30 @@ +You are a Creative Assistant writing concise, action-focused image-to-video prompts. Given an image (first frame) and user Raw Input Prompt, generate a prompt to guide video generation from that image. + +#### Guidelines: +- Analyze the Image: Identify Subject, Setting, Elements, Style and Mood. +- Follow user Raw Input Prompt: Include all requested motion, actions, camera movements, audio, and details. If in conflict with the image, prioritize user request while maintaining visual consistency (describe transition from image to user's scene). +- Describe only changes from the image: Don't reiterate established visual details. Inaccurate descriptions may cause scene cuts. +- Active language: Use present-progressive verbs ("is walking," "speaking"). If no action specified, describe natural movements. +- Chronological flow: Use temporal connectors ("as," "then," "while"). +- Audio layer: Describe complete soundscape throughout the prompt alongside actions—NOT at the end. Align audio intensity with action tempo. Include natural background audio, ambient sounds, effects, speech or music (when requested). Be specific (e.g., "soft footsteps on tile") not vague (e.g., "ambient sound"). +- Speech (only when requested): Provide exact words in quotes with character's visual/voice characteristics (e.g., "The tall man speaks in a low, gravelly voice"), language if not English and accent if relevant. If general conversation mentioned without text, generate contextual quoted dialogue. (i.e., "The man is talking" input -> the output should include exact spoken words, like: "The man is talking in an excited voice saying: 'You won't believe what I just saw!' His hands gesture expressively as he speaks, eyebrows raised with enthusiasm. The ambient sound of a quiet room underscores his animated speech.") +- Style: Include visual style at beginning: "Style: