import re
import itertools
import html
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union

from pydantic import (
    AnyUrl,
    BaseModel,
    ConfigDict,
    Field,
    StringConstraints,
    computed_field,
    field_validator,
    model_validator,
)


class TableCell(BaseModel):
    """A single (possibly spanning) table cell.

    Offsets are half-open: the cell covers rows
    ``[start_row_offset_idx, end_row_offset_idx)`` and columns
    ``[start_col_offset_idx, end_col_offset_idx)``.
    """

    # Number of rows / columns the cell spans.
    row_span: int = 1
    col_span: int = 1
    # Half-open row range covered by the cell.
    start_row_offset_idx: int
    end_row_offset_idx: int
    # Half-open column range covered by the cell.
    start_col_offset_idx: int
    end_col_offset_idx: int
    # Textual content of the cell.
    text: str
    # Header flags; ``column_header`` selects <th> in ``export_to_html``.
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Normalise dict input so that it always carries a ``"text"`` key.

        Dicts that already contain ``"text"`` are returned untouched, as is
        any non-dict input. Otherwise the text is taken from
        ``data["bbox"]["token"]``; when that is empty, the ``"token"`` values
        of ``data["text_cell_bboxes"]`` (popped from the dict) are joined
        with single spaces and stripped.

        Raises:
            KeyError: if the dict has neither ``"text"`` nor ``"bbox"``.
        """
        if not isinstance(data, dict):
            return data

        if "text" in data:
            return data

        # Presumably the bbox-style payload produced by docling-ibm-models;
        # only the keys read below are relied upon.
        text = data["bbox"].get("token", "")
        if not text:
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)
            text = text.strip()
        data["text"] = text

        return data


class TableData(BaseModel):  # TBD
    """Table contents: a flat list of cells plus the table dimensions."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(self) -> List[List[TableCell]]:
        """Return a ``num_rows`` x ``num_cols`` matrix of cells.

        Every position starts out as an empty 1x1 cell. Each cell in
        ``table_cells`` is then written into every grid position it covers,
        so a spanning cell appears (as the same object) in several
        positions. Offsets beyond the table dimensions are clamped.
        """
        # Initialise empty table data grid (only empty cells).
        table_data = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=i,
                    end_row_offset_idx=i + 1,
                    start_col_offset_idx=j,
                    end_col_offset_idx=j + 1,
                )
                for j in range(self.num_cols)
            ]
            for i in range(self.num_rows)
        ]

        # Overwrite cells in table data for which there is actual cell content.
        for cell in self.table_cells:
            for i in range(
                min(cell.start_row_offset_idx, self.num_rows),
                min(cell.end_row_offset_idx, self.num_rows),
            ):
                for j in range(
                    min(cell.start_col_offset_idx, self.num_cols),
                    min(cell.end_col_offset_idx, self.num_cols),
                ):
                    table_data[i][j] = cell

        return table_data


# OTSL
# NOTE(review): the token values had been stripped to "" in this file; they
# are restored here to the standard OTSL tags - confirm against the producer.
OTSL_NL = "<nl>"  # end of a table row
OTSL_FCEL = "<fcel>"  # cell with content
OTSL_ECEL = "<ecel>"  # empty cell
OTSL_LCEL = "<lcel>"  # continuation of the cell to the left (column span)
OTSL_UCEL = "<ucel>"  # continuation of the cell above (row span)
OTSL_XCEL = "<xcel>"  # continuation in both directions (2D span)

_OTSL_TOKENS = (OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)

# One capturing group, so that ``re.split`` keeps the tokens in its output.
_OTSL_TOKEN_RE = re.compile(r"(" + r"|".join(_OTSL_TOKENS) + r")")


def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]:
    """Split an OTSL string into its structural tokens and its mixed stream.

    Args:
        s: Raw OTSL markup.

    Returns:
        ``(tokens, text_parts)`` where ``tokens`` holds only the OTSL tags in
        order of appearance and ``text_parts`` holds tags *and* the text
        between them, with empty / whitespace-only fragments removed.
    """
    # NOTE(review): the tail of this function was lost in the damaged source
    # and is reconstructed from how ``convert_otsl_to_html`` and
    # ``otsl_parse_texts`` consume the two return values.
    tokens = _OTSL_TOKEN_RE.findall(s)
    text_parts = [part for part in _OTSL_TOKEN_RE.split(s) if part.strip()]
    return tokens, text_parts


def otsl_parse_texts(
    texts: List[str], tokens: List[str]
) -> Tuple[List[TableCell], List[List[str]]]:
    """Build table cells from an OTSL token stream.

    Args:
        texts: Mixed sequence of OTSL tags and cell texts, as returned by
            ``otsl_extract_tokens_and_text``.
        tokens: The OTSL tags only.

    Returns:
        ``(table_cells, split_row_tokens)``: the parsed cells, and the tags
        grouped per row with every row padded to the widest row using
        ``OTSL_ECEL``.
    """
    # NOTE(review): the head of this function (row grouping and padding) was
    # lost in the damaged source and is reconstructed from the surviving code.
    split_row_tokens = [
        list(group)
        for is_row_end, group in itertools.groupby(
            tokens, lambda tok: tok == OTSL_NL
        )
        if not is_row_end
    ]
    table_cells: List[TableCell] = []
    r_idx = 0
    c_idx = 0

    if split_row_tokens:
        # Pad short rows with empty cells so the token matrix is rectangular.
        max_cols = max(len(row) for row in split_row_tokens)
        for row in split_row_tokens:
            row.extend([OTSL_ECEL] * (max_cols - len(row)))

        # Rebuild texts so that it also contains the padded <ecel> tokens.
        new_texts: List[str] = []
        text_idx = 0
        for row in split_row_tokens:
            for token in row:
                new_texts.append(token)
                # If this token is present in the original texts, consume it.
                if text_idx < len(texts) and texts[text_idx] == token:
                    text_idx += 1
                    # Carry over the cell text that follows it, if any.
                    if (
                        text_idx < len(texts)
                        and texts[text_idx] not in _OTSL_TOKENS
                    ):
                        new_texts.append(texts[text_idx])
                        text_idx += 1
            new_texts.append(OTSL_NL)
            # Skip the row terminator(s). Consecutive <nl> (blank rows) are
            # dropped by the grouping above, so all of them must be skipped
            # here to keep the text cursor aligned with the next row.
            while text_idx < len(texts) and texts[text_idx] == OTSL_NL:
                text_idx += 1
        texts = new_texts

    def count_right(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive ``which_tokens`` in row ``r_idx`` from ``c_idx``."""
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive ``which_tokens`` in column ``c_idx`` from ``r_idx``."""
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            if r_idx_iter >= len(tokens):
                return span
        return span

    for i, text in enumerate(texts):
        cell_text = ""
        if text in (OTSL_FCEL, OTSL_ECEL):
            row_span = 1
            col_span = 1
            right_offset = 1
            # A content cell may be followed by its text; guard the lookahead
            # so a trailing token cannot index past the end of ``texts``.
            if (
                text != OTSL_ECEL
                and i + 1 < len(texts)
                and texts[i + 1] not in _OTSL_TOKENS
            ):
                cell_text = texts[i + 1]
                right_offset = 2

            # Check next element(s) for lcel / ucel / xcel,
            # set properly row_span, col_span
            next_right_cell = ""
            if i + right_offset < len(texts):
                next_right_cell = texts[i + right_offset]

            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

            if next_right_cell in (OTSL_LCEL, OTSL_XCEL):
                # we have a horizontal spanning cell or 2d spanning cell
                col_span += count_right(
                    split_row_tokens,
                    c_idx + 1,
                    r_idx,
                    [OTSL_LCEL, OTSL_XCEL],
                )
            if next_bottom_cell in (OTSL_UCEL, OTSL_XCEL):
                # we have a vertical spanning cell or 2d spanning cell
                row_span += count_down(
                    split_row_tokens,
                    c_idx,
                    r_idx + 1,
                    [OTSL_UCEL, OTSL_XCEL],
                )

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        # Every cell-like token advances the column; <nl> starts a new row.
        if text in (OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL):
            c_idx += 1
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0

    return table_cells, split_row_tokens


def export_to_html(table_data: TableData) -> str:
    """Render ``table_data`` as an HTML ``<table>`` string.

    Each cell is emitted once, at the grid position where it starts;
    positions covered by a span are skipped. ``rowspan`` / ``colspan``
    attributes are written only when greater than 1, and cells flagged
    ``column_header`` use ``<th>`` instead of ``<td>``.

    Returns:
        The HTML markup, or ``""`` when the table has no cells.
    """
    nrows = table_data.num_rows
    ncols = table_data.num_cols

    if not table_data.table_cells:
        return ""

    current_grid = table_data.grid

    html_str_list = []
    for i in range(nrows):
        html_str_list.append("<tr>")
        for j in range(ncols):
            cell: TableCell = current_grid[i][j]

            # Only emit a cell at its top-left anchor position.
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue

            # The text is emitted as-is; escaping via html.escape is
            # currently disabled.
            content = cell.text.strip()
            cell_tag_name = "th" if cell.column_header else "td"

            opening_tag_parts = [f"<{cell_tag_name}"]
            if cell.row_span > 1:
                opening_tag_parts.append(f' rowspan="{cell.row_span}"')
            if cell.col_span > 1:
                opening_tag_parts.append(f' colspan="{cell.col_span}"')
            opening_tag_parts.append(">")
            opening_tag = "".join(opening_tag_parts)

            html_str_list.append(f"{opening_tag}{content}</{cell_tag_name}>")
        html_str_list.append("</tr>")

    body_content = "".join(html_str_list)
    return f"<table>{body_content}</table>"


def convert_otsl_to_html(otsl_content: str) -> str:
    """Convert OTSL table markup into an HTML ``<table>`` string.

    Args:
        otsl_content: Raw OTSL markup.

    Returns:
        The HTML table, or ``""`` when no cells could be parsed.
    """
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

    table_data = TableData(
        num_rows=len(split_row_tokens),
        num_cols=(
            max(len(row) for row in split_row_tokens) if split_row_tokens else 0
        ),
        table_cells=table_cells,
    )

    return export_to_html(table_data)


if __name__ == "__main__":
    import time

    # Minimal sample: a header cell spanning two columns over two body cells.
    sample = "<fcel>Reviewer<lcel><nl><fcel>Name<fcel>Role<nl>"
    print(convert_otsl_to_html(sample))

    # test
    # NOTE(review): the OTSL tags inside the two fixtures below were lost
    # together with the rest of the angle-bracket markup in this file; only
    # the cell text survives. Restore them from the original data before
    # relying on this output.
    a = """ \n \n"""
    b = """ReviewerRepresentationConsultantSpeaker's BureauOwnership/ Partnership/ PrincipalPersonal ResearchInstitutional, Organizational, or Other Financial BenefitExpert Witness John E. BrushOfficial Reviewer–ACCF Board of Trustees● United HealthcareNoneNoneNone● PROMETHEUS Payment (Board member)None David P. FaxonOfficial Reviewer–AHA● Johnson & JohnsonNone● CULPRIT Trial (PI)*None● Circulation: Cardiovascular Interventions—Editor*None ● RIVA Medical Robert A. HarringtonOfficial Reviewer–AHA● AstraZeneca*NoneNone● AstraZenecaNoneNone ● Baxter● Baxter ● CSL Behring● Bristol-Myers Squibb* ● Eli Lilly● GlaxoSmithKline ● Luiypold● The Medicines Company ● Merck● Merck* ● Novartis● Portola* ● Otsuka Maryland Research Institute● Schering-Plough* ● Regado ● Sanofi-aventis ● Schering-Plough* ● WebMD* Judith S. HochmanOfficial Reviewer–ACCF/AHA Task Force on Practice Guidelines● BMS/SanofiNoneNone● Johnson & Johnson/Bayer Healthcare AG (DSMB)NoneNone ● Eli Lilly ● GlaxoSmithKline ● Millennium Pharmaceuticals/ Schering-Plough● Schering-Plough (TIMI 50) (DSMB) Rodney H. ZimmermannOfficial Reviewer–ACCF Board of Governors● AstraZeneca● AstraZenecaNone● AstraZenecaNoneNone ● Boehringer Ingelheim● Merck-Frost● Sanofi-aventis● Sanofi-aventis ● Bristol-Myers Squibb● Servier ● Medtronic ● Sanofi-aventis ● Schering-Plough Steven BrownOrganizational Reviewer–AAFPNoneNoneNoneNoneNoneNone Joseph C. 
ClevelandOrganizational Reviewer–STS● Baxter BiosurgeryNoneNoneNone● HeartwareNone ● Essential Pharmaceuticals● Thoratec Wyatt DeckerOrganizational Reviewer–ACEPNoneNoneNoneNoneNoneNone Joseph A. de GregorioOrganizational Reviewer–SCAINoneNoneNoneNoneNoneNone Deborah B. DiercksOrganizational Reviewer–ACEP● AstraZenecaNoneNoneNone● Society of Chest Pain Centers and ProvidersNone ● Sanofi-aventis ● Schering-Plough Benjamin HattenOrganizational Reviewer–ACEPNoneNoneNoneNoneNoneNone Loren F. HiratzkaOrganizational Reviewer–STSNoneNoneNoneNone● Cardiac, Vascular, and Thoracic Surgeons*None ● TriHealth (Bethesda North and Good Samaritan Hospitals)* Jason H. RogersOrganizational Reviewer–SCAI● Ample MedicalNoneNoneNoneNoneNone Vincenza T. ShowOrganizational Reviewer–ACPNoneNoneNone● Boehringer Ingelheim*● ACP*None ● Bristol-Myers Squibb* """

    print(convert_otsl_to_html(b))