Refactor generate.py so that temporal coordinates and position grids are processed in bfloat16, consistent with PyTorch's precision behavior. Update the denoise_dev_av function to apply standard ratio rescaling to the audio and video guidance, improving numerical fidelity and model compatibility.
This commit is contained in:
@@ -147,6 +147,12 @@ class LTXModelConfig(BaseModelConfig):
         if self.audio_positional_embedding_max_pos is None:
             self.audio_positional_embedding_max_pos = [20]
 
+        # PyTorch LTX-2 configurator has a bug: it reads "frequencies_precision"
+        # instead of "rope_double_precision" from the config, so double_precision_rope
+        # is always False in PyTorch regardless of what the config file says. Since the
+        # model was trained with this behavior, we must match it.
+        self.double_precision_rope = False
+
         # Convert string enum values if loading from dict
         if isinstance(self.model_type, str):
             self.model_type = LTXModelType(self.model_type)
Reference in New Issue
Block a user