Spaces:
Runtime error
Runtime error
| # coding=utf-8 | |
| # judge voice-over | |
| from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer | |
| import logging | |
class Step02:
    """Run VideoLLaMA2 prompts used by the audio-generation pipeline.

    The instance loads the model once via ``model_init`` and exposes two
    queries: ``run_step0`` (step-by-step audio-generation prompt) and
    ``run_step2`` (voice-over yes/no judgement).
    """

    # Prompts sent to the model; the text is part of runtime behavior.
    STEP0_QUESTION = "Generate high-quality audio from video step-by-step."
    STEP2_QUESTION = (
        "Given a video and its corresponding audio, determine whether the "
        "audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
    )
    # Marker after which the final answer is read when step2_mode == "cot".
    CONCLUSION_TAG = "<CONCLUSION>"

    def __init__(self, model_path, step2_mode):
        """Load the model and keep the preprocessing callable for video input.

        Args:
            model_path: Passed straight to ``model_init``.
            step2_mode: Controls answer parsing in ``run_step2``; the value
                ``"cot"`` makes it read the answer after ``<CONCLUSION>``.
        """
        self.modal = "video"
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)
        self.model, self.processor, self.tokenizer = model_init(model_path)
        self.preprocess = self.processor[self.modal]
        self.step2_mode = step2_mode

    def _ask(self, media_path, question, va):
        """Preprocess ``media_path`` and return the model's answer to ``question``.

        ``va`` is forwarded to the preprocessing callable unchanged
        (presumably it toggles use of the audio track -- TODO confirm).
        """
        audio_video_tensor = self.preprocess(media_path, va=va)
        return mm_infer(
            audio_video_tensor,
            question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )

    @staticmethod
    def _parse_voice_over_answer(raw_output, step2_mode):
        """Return True when the model answer selects option ``A``.

        In ``"cot"`` mode the answer letter is read at offset 1 of the text
        following the last ``<CONCLUSION>`` tag (i.e. after one separator
        character -- NOTE(review): confirm against the model's output format).
        In any other mode the whole output must equal ``"A"``.
        """
        answer = raw_output
        if step2_mode == "cot":
            conclusion = raw_output.split(Step02.CONCLUSION_TAG)[-1]
            # Slice instead of index: a conclusion shorter than two characters
            # yields "" (treated as "not A") rather than raising IndexError.
            answer = conclusion[1:2]
        return answer == "A"

    def run_step0(self, video_path, modal_type='v'):
        """Ask the model the step-by-step audio-generation prompt for a video.

        Args:
            video_path: Path handed to the preprocessing callable.
            modal_type: Accepted for interface compatibility; currently unused.

        Returns:
            Whatever ``mm_infer`` returns for the prompt.
        """
        self.log.info("######################################################################################################")
        self.log.info("Generate high-quality audio from video step-by-step...")
        return self._ask(video_path, self.STEP0_QUESTION, va=False)

    def run_step2(self, video_audio_path, modal_type='av'):
        """Judge whether the audio of ``video_audio_path`` contains voice-over.

        Args:
            video_audio_path: Path handed to the preprocessing callable.
            modal_type: Accepted for interface compatibility; currently unused.

        Returns:
            bool: True when the parsed model answer is option ``A`` (yes).
        """
        raw_output = self._ask(video_audio_path, self.STEP2_QUESTION, va=True)
        # Debug-level trace replaces the stray stdout print of the parsed answer.
        self.log.debug("Step2 raw model output: %r", raw_output)
        has_voice_over = self._parse_voice_over_answer(raw_output, self.step2_mode)

        if has_voice_over:
            self.log.info("The video generated by Step1 (%s) contains voice-over.", video_audio_path)
        else:
            self.log.info("The video generated by Step1 (%s) does not contain voice-over.", video_audio_path)
        self.log.info("Finish Step2 successfully.\n")
        return has_voice_over