GetDocParsingResult.Content structure
{
"doc_info": { # File basic information
"languages": [ # Language
"zh",
"en"
],
"doc_type": "pdf", # File type, including 'pdf', 'csv', 'xlsx', 'doc', 'docx', and 'txt'.
"pdf_toc": [{ # File outline (table of contents): title, level, and page number of each entry
"title": "xxx", #Title of the entry, for example, the cover
"level": 0, #Level
"page": 0 #Page number
}],
"pages": 366, # Number of pages
"page_list": [{ # Per-page information, such as page width and height
"imageWidth": 596.0, #Page width
"imageHeight": 842.0, #Page height
"pageIdAllDocs": 0, #Page ID across all files
"fileIndex": 0, #File index
"pageIdCurDoc": 0, #Page ID of the current file
"angle": 0 #Page angle
}]
},
"doc_data": #File content
[{ "uniqueId":"about_us_para", #Content block ID
"page_num":"01", #Page number
"index": "xxx", #index
"name": "xxx", #Content block name
"type": "xxxx", # Content block type, including ['Title', 'Text', 'Caption', 'Section-header', 'Footnote', 'Page-header', 'Formula','Page-footer', 'Table', 'Figure', and 'Picture']
"subType":"xxx", # Content block subtype
"text": "xxx", # Text of content block
"before_text": "xxx", # Text before content block
"after_text": "xxx", # Text after content block
"extInfo":[ # Coordinates and other information of content block
{"uniqueId": "b0x1x0", # ID of content subblock
"pos": [{"x": 229.0, "y": 208.0}, {"x": 421.0, "y": 208.0}, {"x": 421.0, "y": 242.0}, {"x": 229.0, "y": 242.0}], # Coordinates of content subblock: four corner points, where x and y are the coordinates on the x-axis and y-axis
"text": "Kurt Götze", #Text of content subblock
"type": "Text", # Type of content subblock
"subType": "Text", # Subtype of content subblock
"pageNum": [0], # Page number of content subblock
"index": 0 #index
}]
}]
}