import re
import itertools
import html
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
from pydantic import (
AnyUrl,
BaseModel,
ConfigDict,
Field,
StringConstraints,
computed_field,
field_validator,
model_validator,
)
class TableCell(BaseModel):
"""TableCell."""
row_span: int = 1
col_span: int = 1
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False
@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
"""from_dict_format."""
if isinstance(data, Dict):
# Check if this is a native BoundingBox or a bbox from docling-ibm-models
if (
# "bbox" not in data
# or data["bbox"] is None
# or isinstance(data["bbox"], BoundingBox)
"text"
in data
):
return data
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "
text = text.strip()
data["text"] = text
return data
class TableData(BaseModel): # TBD
"""BaseTableData."""
table_cells: List[TableCell] = []
num_rows: int = 0
num_cols: int = 0
@computed_field # type: ignore
@property
def grid(
self,
) -> List[List[TableCell]]:
"""grid."""
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
start_row_offset_idx=i,
end_row_offset_idx=i + 1,
start_col_offset_idx=j,
end_col_offset_idx=j + 1,
)
for j in range(self.num_cols)
]
for i in range(self.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in self.table_cells:
for i in range(
min(cell.start_row_offset_idx, self.num_rows),
min(cell.end_row_offset_idx, self.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, self.num_cols),
min(cell.end_col_offset_idx, self.num_cols),
):
table_data[i][j] = cell
return table_data
"""
OTSL
"""
OTSL_NL = ""
OTSL_FCEL = ""
OTSL_ECEL = ""
OTSL_LCEL = ""
OTSL_UCEL = ""
OTSL_XCEL = ""
def otsl_extract_tokens_and_text(s: str):
# Pattern to match anything enclosed by < >
# (including the angle brackets themselves)
# pattern = r"(<[^>]+>)"
pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")"
# Find all tokens (e.g. "", "", etc.)
tokens = re.findall(pattern, s)
# Remove any tokens that start with "
# 重新构建texts以包含补充的
new_texts = []
text_idx = 0
for row_idx, row in enumerate(split_row_tokens):
for col_idx, token in enumerate(row):
new_texts.append(token)
# 如果这个token在原始texts中有对应的文本内容,添加它
if text_idx < len(texts) and texts[text_idx] == token:
text_idx += 1
# 检查下一个是否是文本内容(不是token)
if (text_idx < len(texts) and
texts[text_idx] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]):
new_texts.append(texts[text_idx])
text_idx += 1
new_texts.append(OTSL_NL)
if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
text_idx += 1
texts = new_texts
def count_right(tokens, c_idx, r_idx, which_tokens):
span = 0
c_idx_iter = c_idx
while tokens[r_idx][c_idx_iter] in which_tokens:
c_idx_iter += 1
span += 1
if c_idx_iter >= len(tokens[r_idx]):
return span
return span
def count_down(tokens, c_idx, r_idx, which_tokens):
span = 0
r_idx_iter = r_idx
while tokens[r_idx_iter][c_idx] in which_tokens:
r_idx_iter += 1
span += 1
if r_idx_iter >= len(tokens):
return span
return span
for i, text in enumerate(texts):
cell_text = ""
if text in [
OTSL_FCEL,
OTSL_ECEL,
]:
row_span = 1
col_span = 1
right_offset = 1
if text != OTSL_ECEL and (texts[i + 1] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]):
cell_text = texts[i + 1]
right_offset = 2
# Check next element(s) for lcel / ucel / xcel,
# set properly row_span, col_span
next_right_cell = ""
if i + right_offset < len(texts):
next_right_cell = texts[i + right_offset]
next_bottom_cell = ""
if r_idx + 1 < len(split_row_tokens):
if c_idx < len(split_row_tokens[r_idx + 1]):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
if next_right_cell in [
OTSL_LCEL,
OTSL_XCEL,
]:
# we have horisontal spanning cell or 2d spanning cell
col_span += count_right(
split_row_tokens,
c_idx + 1,
r_idx,
[OTSL_LCEL, OTSL_XCEL],
)
if next_bottom_cell in [
OTSL_UCEL,
OTSL_XCEL,
]:
# we have a vertical spanning cell or 2d spanning cell
row_span += count_down(
split_row_tokens,
c_idx,
r_idx + 1,
[OTSL_UCEL, OTSL_XCEL],
)
table_cells.append(
TableCell(
text=cell_text.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=r_idx,
end_row_offset_idx=r_idx + row_span,
start_col_offset_idx=c_idx,
end_col_offset_idx=c_idx + col_span,
)
)
if text in [
OTSL_FCEL,
OTSL_ECEL,
OTSL_LCEL,
OTSL_UCEL,
OTSL_XCEL,
]:
c_idx += 1
if text == OTSL_NL:
r_idx += 1
c_idx = 0
return table_cells, split_row_tokens
# def export_to_html(table_data: TableData):
# nrows = table_data.num_rows
# ncols = table_data.num_cols
# if len(table_data.table_cells) == 0:
# return ""
# body = ""
# grid = table_data.grid
# for i in range(nrows):
# body += ""
# for j in range(ncols):
# cell: TableCell = grid[i][j]
# rowspan, rowstart = (
# cell.row_span,
# cell.start_row_offset_idx,
# )
# colspan, colstart = (
# cell.col_span,
# cell.start_col_offset_idx,
# )
# if rowstart != i:
# continue
# if colstart != j:
# continue
# content = html.escape(cell.text.strip())
# celltag = "td"
# if cell.column_header:
# celltag = "th"
# opening_tag = f"{celltag}"
# if rowspan > 1:
# opening_tag += f' rowspan="{rowspan}"'
# if colspan > 1:
# opening_tag += f' colspan="{colspan}"'
# body += f"<{opening_tag}>{content}{celltag}>"
# body += "
"
# # dir = get_text_direction(text)
# body = f""
# return body
def export_to_html(table_data: TableData) -> str:
nrows = table_data.num_rows
ncols = table_data.num_cols
# print(nrows, ncols)
if not table_data.table_cells:
return ""
current_grid = table_data.grid
html_str_list = []
for i in range(nrows):
html_str_list.append("")
for j in range(ncols):
cell: TableCell = current_grid[i][j]
if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
continue
# content = html.escape(cell.text.strip())
content = cell.text.strip()
cell_tag_name = "th" if cell.column_header else "td"
opening_tag_parts = [f"<{cell_tag_name}"]
if cell.row_span > 1:
opening_tag_parts.append(f' rowspan="{cell.row_span}"')
if cell.col_span > 1:
opening_tag_parts.append(f' colspan="{cell.col_span}"')
opening_tag_parts.append(">")
opening_tag = "".join(opening_tag_parts)
html_str_list.append(f"{opening_tag}{content}{cell_tag_name}>")
html_str_list.append("
")
body_content = "".join(html_str_list)
return f""
def convert_otsl_to_html(otsl_content: str) -> str:
# if not otsl_content.endswith("\n"):
# return ""
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
table_data = TableData(
num_rows=len(split_row_tokens),
num_cols=(
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
),
table_cells=table_cells,
)
result = export_to_html(table_data)
return result
if __name__ == "__main__":
import time
# test
a = """
\n
\n"""
b = """ReviewerRepresentationConsultantSpeaker's BureauOwnership/ Partnership/ PrincipalPersonal ResearchInstitutional, Organizational, or Other Financial BenefitExpert Witness
John E. BrushOfficial Reviewer–ACCF Board of Trustees● United HealthcareNoneNoneNone● PROMETHEUS Payment (Board member)None
David P. FaxonOfficial Reviewer–AHA● Johnson & JohnsonNone● CULPRIT Trial (PI)*None● Circulation: Cardiovascular Interventions—Editor*None
● RIVA Medical
Robert A. HarringtonOfficial Reviewer–AHA● AstraZeneca*NoneNone● AstraZenecaNoneNone
● Baxter● Baxter
● CSL Behring● Bristol-Myers Squibb*
● Eli Lilly● GlaxoSmithKline
● Luiypold● The Medicines Company
● Merck● Merck*
● Novartis● Portola*
● Otsuka Maryland Research Institute● Schering-Plough*
● Regado
● Sanofi-aventis
● Schering-Plough*
● WebMD*
Judith S. HochmanOfficial Reviewer–ACCF/AHA Task Force on Practice Guidelines● BMS/SanofiNoneNone● Johnson & Johnson/Bayer Healthcare AG (DSMB)NoneNone
● Eli Lilly
● GlaxoSmithKline
● Millennium Pharmaceuticals/ Schering-Plough● Schering-Plough (TIMI 50) (DSMB)
Rodney H. ZimmermannOfficial Reviewer–ACCF Board of Governors● AstraZeneca● AstraZenecaNone● AstraZenecaNoneNone
● Boehringer Ingelheim● Merck-Frost● Sanofi-aventis● Sanofi-aventis
● Bristol-Myers Squibb● Servier
● Medtronic
● Sanofi-aventis
● Schering-Plough
Steven BrownOrganizational Reviewer–AAFPNoneNoneNone