SAMI 자막 파일을 SRT 파일로 변환하는 Python source code (동영상 자막 변환, 비디오 자막 포맷 변환, 자막 파일 포맷 변경 방법)

"""
Convert SMI file to SRT file
"""
import argparse

import os
from bs4 import BeautifulSoup
import re
import chardet


def get_encoding(file_path):
    """Guess the text encoding of a file.

    The whole file is read as raw bytes and passed to ``chardet.detect``;
    the ``"encoding"`` entry of the detection result is returned.
    """
    with open(file_path, mode="rb") as raw_file:
        raw_bytes = raw_file.read()
        detection = chardet.detect(raw_bytes)
    return detection["encoding"]


def change_file_extension(srt_file_path, new_ext="srt"):
    """Return a path with its extension replaced by ``new_ext``.

    Args:
        srt_file_path: The path whose extension is replaced. Despite the
            parameter name, any path is accepted (the caller in this file
            passes the ``.smi`` input path).
        new_ext: The new extension, given with or without a leading dot
            (``"srt"`` and ``".srt"`` produce the same result).

    Returns:
        The path with everything after the last extension separator
        replaced by ``new_ext``. A path without an extension simply gets
        ``.new_ext`` appended.
    """
    root, _ = os.path.splitext(srt_file_path)
    # Strip leading dots so that ".srt" does not yield "name..srt".
    return f"{root}.{new_ext.lstrip('.')}"


# -----------------------------------------------------------------------------
def convert_time_format(ms):
    """Format a millisecond count as an SRT timestamp (``HH:MM:SS,mmm``).

    ``ms`` may be anything ``int()`` accepts, e.g. an int or a digit string.
    """
    total_ms = int(ms)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def html_to_plain_text(html):
    """Strip markup from an HTML fragment and return the remaining text.

    ``<br>`` tags become newlines and a newline is appended inside every
    ``<p>`` so the line structure survives tag removal. Blank lines are
    dropped. If the resulting text still begins with ``<`` an empty string
    is returned instead.
    """
    document = BeautifulSoup(html, "html.parser")

    # Preserve line breaks before the tags are discarded.
    for line_break in document.find_all("br"):
        line_break.replace_with("\n")
    for paragraph in document.find_all("p"):
        paragraph.append("\n")

    # Drop blank (whitespace-only) lines.
    pieces = document.get_text().strip().split("\n")
    plain = "\n".join(filter(str.strip, pieces))

    # Per the original author's note, text starting with "<" causes errors
    # in the SRT output, so such text is discarded entirely.
    return "" if plain.startswith("<") else plain


# -----------------------------------------------------------------------------
def convert_smi_to_srt(smi_file, srt_file):
    """Convert a SAMI (.smi) subtitle file into an SRT file.

    The input is scanned line by line. Everything before the first
    ``<SYNC Start=...>`` tag is skipped. Each SYNC tag closes the cue that is
    currently being collected (using the new tag's time as the end time) and
    opens the next one. Scanning stops at ``</BODY>``.

    Args:
        smi_file: Path of the SAMI file to read. Its encoding is detected
            with ``get_encoding``.
        srt_file: Path of the SRT file to write (always UTF-8).

    Notes:
        * A cue is written only when it has visible text after markup
          removal, so ``&nbsp;`` "clear" markers produce no empty cues.
        * A trailing cue that is never followed by another SYNC tag has no
          end time and is therefore not written.
    """
    encoding_str = get_encoding(smi_file)
    # CP949 is a superset of EUC-KR. Files detected as EUC-KR may contain
    # CP949-only characters, so decode them with the wider codec.
    if encoding_str and encoding_str.lower().replace("_", "-") == "euc-kr":
        encoding_str = "cp949"
    # errors="replace": a wrong encoding guess should not abort the whole
    # conversion; undecodable bytes become U+FFFD instead.
    with open(smi_file, "r", encoding=encoding_str, errors="replace") as file:
        smi = file.readlines()

    # Accepts `<SYNC Start=123>`, `<SYNC Start="123">` and optional spaces
    # around "=", in any letter case.
    sync_pattern = re.compile(r"""<SYNC\s+START\s*=\s*["']?\s*(\d+)""", re.IGNORECASE)

    cues = []
    in_body = False
    start_time = ""
    text = ""
    for raw_line in smi:
        line = raw_line.strip()
        sync_match = sync_pattern.match(line)

        # Skip the header: nothing counts until the first SYNC tag.
        if not in_body:
            if sync_match is None:
                continue
            in_body = True

        if sync_match is not None:
            sync_time = convert_time_format(sync_match.group(1))

            if text.strip():
                # Close the pending cue; this SYNC's time is its end time.
                cleaned = html_to_plain_text(text).strip()
                if cleaned:
                    cues.append(
                        f"{len(cues) + 1}\n{start_time} --> {sync_time}\n{cleaned}\n\n"
                    )

            # Always restart from this SYNC tag, keeping any caption text
            # that shares the line with it. The trailing newline keeps that
            # text separate from the lines that follow.
            start_time = sync_time
            text = html_to_plain_text(line) + "\n"

        elif line.upper().startswith("</BODY>"):
            break
        else:
            text += html_to_plain_text(line) + "\n"

    with open(srt_file, "w", encoding="utf-8") as srt:
        srt.write("".join(cues))

    print("srt_file", srt_file)


# -----------------------------------------------------------------------------
def main():
    """Command-line entry point: convert the given .smi file to .srt.

    Takes a single positional argument, the path of the SAMI file. The
    output is written next to it with the extension replaced by ``srt``.
    If the path does not have an ``.smi`` extension (compared
    case-insensitively) a message is printed and nothing is converted.
    """
    parser = argparse.ArgumentParser(
        description="Convert SMI file to SRT file.",
        usage="%(prog)s <file_path>",
    )
    parser.add_argument("file_path", metavar="FILE", type=str, help="The target file")
    args = parser.parse_args()

    target_file = args.file_path

    # Compare in lower case so that "MOVIE.SMI" is accepted as well.
    _, ext = os.path.splitext(target_file)
    if ext.lower() != ".smi":
        print("smi 파일이 아닙니다.")
        return

    srt_file_path = change_file_extension(target_file, "srt")
    convert_smi_to_srt(target_file, srt_file_path)


# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

//

 

 

반응형
Posted by codens