Unverified Commit bcbbee8c authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2622 from myhloli/dev

Dev
parents 3cc3f754 ced5a7b4
"use client";
import ErrorBoundary from "@/components/error-boundary";
import styles from "./home.module.scss";
import { SlotID, Path } from "@/constant/route";
import {
BrowserRouter,
Routes,
Route,
Outlet,
Navigate,
useLocation,
HashRouter,
} from "react-router-dom";
import { ExtractorSide } from "./extract-side";
import { LanguageProvider } from "@/context/language-provider";
import PDFUpload from "@/pages/extract/components/pdf-upload";
import PDFExtractionJob from "@/pages/extract/components/pdf-extraction";
/**
 * Layout shell rendered by the root route: the extractor sidebar next to the
 * currently matched child route (rendered through `<Outlet />`).
 *
 * The sidebar only receives the "sidebar-show" CSS-module class while the
 * current location is `Path.Home`.
 */
export function WindowContent() {
  const location = useLocation();
  const isHome = location.pathname === Path.Home;
  // CSS-module keys depend on the bundler's `localsConvention`. This project's
  // Vite config uses "camelCaseOnly", which exposes `.sidebar-show` as
  // `sidebarShow` and drops the kebab-case key, so look up the camelCase name
  // first and fall back to the kebab-case one for other conventions.
  const sidebarShowClass = styles.sidebarShow ?? styles["sidebar-show"];
  return (
    <>
      <ExtractorSide className={isHome ? sidebarShowClass : ""} />
      <div className="flex-1">
        <Outlet />
      </div>
    </>
  );
}
/**
 * Declares the application's route table inside the app-body slot.
 *
 * Every route is nested under `WindowContent`; the index route and any
 * unknown path redirect (with `replace`) to the PDF extractor page.
 */
function Screen() {
  const pdfExtractorPath = "/OpenSourceTools/Extractor/PDF";
  const redirectToPdfExtractor = <Navigate to={pdfExtractorPath} replace />;

  return (
    <div className="w-full h-full flex" id={SlotID.AppBody}>
      <Routes>
        <Route path="/" element={<WindowContent />}>
          <Route index element={redirectToPdfExtractor} />
          <Route path={pdfExtractorPath} element={<PDFUpload />} />
          <Route
            path={`${pdfExtractorPath}/:jobID`}
            element={<PDFExtractionJob />}
          />
          <Route path="*" element={redirectToPdfExtractor} />
        </Route>
      </Routes>
    </div>
  );
}
/**
 * Application root: mounts the route table inside a hash-based router and
 * wraps it with the language provider and the top-level error boundary.
 */
export function Home() {
  const routedScreen = (
    <HashRouter>
      <Screen />
    </HashRouter>
  );

  return (
    <ErrorBoundary>
      <LanguageProvider>{routedScreen}</LanguageProvider>
    </ErrorBoundary>
  );
}
import { Routes, Route } from "react-router-dom";
import PDFUpload from "@/pages/extract/components/pdf-upload";
import PDFExtractionJob from "@/pages/extract/components/pdf-extraction";
/**
 * Route table for the PDF extractor pages (upload page and per-job page).
 *
 * The routes are wrapped in `<Routes>`: React Router v6 only accepts
 * `<Route>` elements as children of `<Routes>` and throws when a `<Route>` is
 * rendered directly, which is what the previous bare fragment did whenever
 * this component was mounted.
 */
function AppRoutes() {
  return (
    <Routes>
      <Route path="/OpenSourceTools/Extractor/PDF" element={<PDFUpload />} />
      <Route
        path="/OpenSourceTools/Extractor/PDF/:jobID"
        element={<PDFExtractionJob />}
      />
    </Routes>
  );
}
export default AppRoutes;
import {
getExtractTaskIdProgress,
getPdfExtractQueue,
TaskIdResItem,
} from "@/api/extract";
import { create } from "zustand";
import { useCallback, useEffect, useRef, useState } from "react";
import { useParams } from "react-router-dom";
import { UPDATE_TASK_LIST } from "@/constant/event";
import { useQuery } from "@tanstack/react-query";
/** State and setters of the module-level extractor store used by `useJobExtraction`. */
interface ExtractorState {
/** Latest known task record; the store starts with `defaultTaskInfo`. */
taskInfo: TaskIdResItem;
/** Loading flag for the queue/progress requests; the store starts with `null`. */
queueLoading: boolean | null;
/** Error flag; the store starts with `false`. */
interfaceError: boolean;
/** Replaces `taskInfo`. */
setTaskInfo: (taskInfo: TaskIdResItem) => void;
/** Replaces `queueLoading`. */
setQueueLoading: (loading: boolean | null) => void;
/** Replaces `interfaceError`. */
setInterfaceError: (error: boolean) => void;
}
/**
 * Placeholder task record used as the store's initial `taskInfo` and written
 * back whenever the hook starts tracking a (new) job.
 */
const defaultTaskInfo: TaskIdResItem = {
id: 0,
rank: 0,
state: "pending",
url: "",
type: "unknown",
// NOTE(review): -1 looks like a "queue length not known yet" sentinel — confirm against the API.
queues: -1,
};
/**
 * Module-level zustand store backing `useJobExtraction`.
 *
 * Holds the current task record plus the loading/error flags; each setter
 * simply replaces the corresponding field.
 */
const useExtractorStore = create<ExtractorState>((set) => ({
  // Initial values.
  taskInfo: defaultTaskInfo,
  queueLoading: null,
  interfaceError: false,

  // Setters (parameter types come from `ExtractorState`).
  setTaskInfo: (taskInfo) => set({ taskInfo }),
  setQueueLoading: (queueLoading) => set({ queueLoading }),
  setInterfaceError: (interfaceError) => set({ interfaceError }),
}));
/**
 * Tracks the extraction job whose id is the `jobID` route parameter.
 *
 * Two react-query queries cooperate:
 *  - `taskProgressQuery` fetches the job's progress once per manual `refetch()`
 *    (it is created with `enabled: false`).
 *  - `queueStatusQuery` polls the extraction queue every 2 seconds while
 *    polling is on and the last progress result is "running" or "pending".
 *
 * Results are written into the shared extractor store. When the job reaches
 * "done" or "failed", a task-list update event is dispatched on `document`.
 *
 * @returns `taskInfo` (latest task record), `isLoading` (the store's queue
 *   loading flag), `isError` (store error flag or either query's error state)
 *   and `refreshQueue` (resets the task record and re-fetches the progress).
 */
export const useJobExtraction = () => {
const { jobID } = useParams<{ jobID: string }>();
const {
setTaskInfo,
setQueueLoading,
queueLoading,
taskInfo,
interfaceError,
setInterfaceError,
} = useExtractorStore();
// Handle of the delayed event dispatch scheduled by the done/failed effect below.
const timeoutRef = useRef<NodeJS.Timeout | null>(null);
// Gates `queueStatusQuery`; switched off once the job has left the queue or finished.
const [isPolling, setIsPolling] = useState(true);
const stopTaskLoading = () => {
setQueueLoading(false);
};
// Query for task progress. Disabled by default, so it only runs through
// `taskProgressQuery.refetch()`.
const taskProgressQuery = useQuery({
queryKey: ["taskProgress", jobID],
queryFn: () => {
setQueueLoading(true);
setIsPolling(true);
return getExtractTaskIdProgress(jobID!)
.then((res) => {
if (res?.state === "done" || res?.state === "failed") {
stopTaskLoading();
// NOTE(review): this dispatch uses the string literal "UPDATE_TASK_LIST" and the
// detail key `id`, while the effect below uses the imported UPDATE_TASK_LIST
// constant and the detail key `jobID` — confirm which shape listeners expect.
document.dispatchEvent(
new CustomEvent("UPDATE_TASK_LIST", {
detail: { state: res.state, id: jobID },
})
);
}
if (res) {
setTaskInfo(res);
}
return res;
})
.catch(() => {
// The failure is recorded in the store only; nothing is returned or re-thrown here.
stopTaskLoading();
setTaskInfo({ state: "failed" });
});
},
enabled: false,
});
// Query for queue status
const queueStatusQuery = useQuery({
queryKey: ["queueStatus", jobID],
queryFn: async () => {
setQueueLoading(true);
const response = await getPdfExtractQueue(jobID).then((res) => {
// setTaskInfo({ rand: "failed" });
if (res) {
// Look for this job among the returned queue entries.
const targetPendingRunningJob = res?.filter(
(i) => String(i.id) === jobID
)?.[0];
if (targetPendingRunningJob) {
setTaskInfo(targetPendingRunningJob);
} else {
// The job is not in the queue response: stop polling and fetch its progress
// record once to refresh `taskInfo`.
// NOTE(review): this promise is not awaited and has no rejection handler.
setIsPolling(false);
setQueueLoading(false);
getExtractTaskIdProgress(jobID!).then((res) => {
setTaskInfo(res as any);
});
}
}
return res;
});
return response;
},
enabled:
isPolling &&
(taskProgressQuery?.data?.state === "running" ||
taskProgressQuery?.data?.state === "pending"),
refetchInterval: 2000, // Poll every 2 seconds
});
// Reacts to the progress query reaching a terminal state: updates the
// loading/error flags, stops polling and schedules the task-list update event.
useEffect(() => {
if (taskProgressQuery.data?.state === "done") {
stopTaskLoading();
setInterfaceError(false);
setIsPolling(false);
// NOTE(review): `timeoutRef.current` is never reset to null, so once a timer has
// been scheduled every later run takes the clearTimeout branch and schedules no
// new dispatch — confirm this is intended.
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
} else {
timeoutRef.current = setTimeout(() => {
document.dispatchEvent(
new CustomEvent(UPDATE_TASK_LIST, {
detail: { state: "done", jobID },
})
);
}, 10);
}
} else if (taskProgressQuery.data?.state === "failed") {
stopTaskLoading();
setInterfaceError(true);
setIsPolling(false);
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
} else {
timeoutRef.current = setTimeout(() => {
document.dispatchEvent(
new CustomEvent(UPDATE_TASK_LIST, {
detail: { state: "failed", jobID },
})
);
}, 10);
}
}
// TIP: taskInfo should be used here
}, [taskProgressQuery.data]);
// Manual refresh: turn polling off, reset the task record to its placeholder
// and fetch the progress again (the progress query turns polling back on).
const refreshQueue = () => {
// stop polling for the previous job ID
setIsPolling(false);
setTaskInfo(defaultTaskInfo);
taskProgressQuery.refetch();
};
// Whenever the route's job ID changes, start tracking the new job.
useEffect(() => {
if (jobID) {
// drop the task record left over from the previous job ID
setTaskInfo(defaultTaskInfo);
taskProgressQuery.refetch();
}
}, [jobID]);
return {
taskInfo: taskInfo,
isLoading: queueLoading,
isError:
interfaceError || taskProgressQuery.isError || queueStatusQuery.isError,
refreshQueue,
};
};
import { create } from "zustand";
import { Language } from "@/constant";
import { LOCALE_STORAGE_KEY } from "@/constant/storage";
/** Union of the values of the `Language` constant. */
type LanguageType = (typeof Language)[keyof typeof Language];
/** State and actions of the UI-language store. */
type LanguageStore = {
/** Currently selected language. */
language: LanguageType;
/** Selects `language` explicitly. */
setLanguage: (language: LanguageType) => void;
/** Switches between the two supported languages. */
toggleLanguage: () => void;
};
/**
 * Picks the language the store starts with.
 *
 * Order of precedence:
 *  1. a valid language previously saved in localStorage;
 *  2. English when the browser language starts with "en";
 *  3. Chinese otherwise (Chinese browser locales and every other locale).
 */
const getInitialLanguage = (): LanguageType => {
  const stored = localStorage.getItem(LOCALE_STORAGE_KEY) as LanguageType;
  const isKnownLanguage =
    Boolean(stored) && Object.values(Language).includes(stored);
  if (isKnownLanguage) {
    return stored;
  }

  // No usable saved value: derive the language from the browser setting.
  const browserLocale = navigator.language.toLowerCase();
  return browserLocale.startsWith("en") ? Language.EN_US : Language.ZH_CN;
};
/**
 * Zustand store for the UI language.
 *
 * Both actions write the chosen language to localStorage (under
 * `LOCALE_STORAGE_KEY`) before updating the store.
 */
export const useLanguageStore = create<LanguageStore>((set) => {
  // Saves `language` to localStorage and returns the matching state patch.
  const persisted = (language: LanguageType) => {
    localStorage.setItem(LOCALE_STORAGE_KEY, language);
    return { language };
  };

  return {
    language: getInitialLanguage(),
    setLanguage: (language) => {
      set(persisted(language));
    },
    toggleLanguage: () =>
      set(({ language: current }) =>
        persisted(current === Language.ZH_CN ? Language.EN_US : Language.ZH_CN)
      ),
  };
});
// mdStore.ts
import { create } from "zustand";
import axios from "axios";
import { updateMarkdownContent, UpdateMarkdownRequest } from "@/api/extract"; // 确保路径正确
/** Markdown text stored for one entry of the store's `mdContents` map. */
interface MdContent {
content: string;
isLoading: boolean;
}
/** Kinds of anchor markup that `getContentWithAnchors` can generate. */
type AnchorType =
| "span"
| "div"
| "comment"
| "data-attribute"
| "hr"
| "mark"
| "p";
/** Options controlling the anchor markup inserted before each content chunk. */
interface AnchorOptions {
/** Which markup to generate. */
type: AnchorType;
/** Prepended to the chunk index to form the anchor id. */
prefix?: string;
/** Value of the generated `style` attribute. */
style?: string;
/** Value of the generated `class` attribute. */
className?: string;
/** Extra attributes rendered as `key="value"` pairs. */
customAttributes?: Record<string, string>;
}
/** Defaults: an invisible `<span>` whose id is `md-anchor-<index>`. */
const defaultAnchorOptions: AnchorOptions = {
type: "span",
prefix: "md-anchor-",
style: "display:none;",
className: "",
customAttributes: {},
};
/** State and actions of the Markdown content store. */
interface MdState {
/** Fetched Markdown, keyed by the URL it was loaded from. */
mdContents: Record<string, MdContent>;
/** All chunks joined with blank lines. */
allMdContent: string;
/** Same as `allMdContent`, with an anchor tag in front of every chunk. */
allMdContentWithAnchor: string;
/** Last error recorded by a fetch or an update. */
error: Error | null;
/** Id of the most recent `setMdUrlArr` call; used to discard stale results. */
currentRequestId: number;
/** Fetches every URL and rebuilds the stored contents from the results. */
setMdUrlArr: (urls: string[]) => Promise<void>;
/** Joins the given chunks with blank lines. */
getAllMdContent: (data: string[]) => string;
setAllMdContent: (val?: string) => void;
setAllMdContentWithAnchor: (val?: string) => void;
/** Joins the given chunks, inserting an anchor tag before each one. */
getContentWithAnchors: (
data: string[],
options?: Partial<AnchorOptions>
) => string;
/** Returns the character offset of the chunk for `anchorId`, or -1 if unknown. */
jumpToAnchor: (anchorId: string) => number;
/** Restores the initial state. */
reset: () => void;
/** Sends edited Markdown to the backend and refreshes the derived fields. */
updateMdContent: (
fileKey: string,
pageNumber: string | number,
newContent: string
) => Promise<void>;
}
/** Upper bound on Markdown downloads that `setMdUrlArr` keeps in flight at once. */
const MAX_CONCURRENT_REQUESTS = 2;
/** Data fields the store starts with (and that `reset` restores). */
const initialState = {
mdContents: {},
allMdContent: "",
allMdContentWithAnchor: "",
error: null,
currentRequestId: 0,
};
/**
 * Zustand store holding the Markdown chunks fetched for an extraction result,
 * the joined document (with and without anchors) and helpers to edit it.
 */
const useMdStore = create<MdState>((set, get) => ({
  ...initialState,

  /**
   * Restores the initial data fields.
   *
   * The request counter keeps increasing instead of going back to 0, so a
   * fetch started before the reset can never share its id with a fetch
   * started after it (which would let the stale result overwrite the state).
   */
  reset: () => {
    set((state) => ({
      ...initialState,
      currentRequestId: state.currentRequestId + 1,
    }));
  },

  /** Replaces `allMdContent`. */
  setAllMdContent: (value?: string) => {
    set(() => ({
      allMdContent: value,
    }));
  },

  /** Replaces `allMdContentWithAnchor`. */
  setAllMdContentWithAnchor: (value?: string) => {
    set(() => ({
      allMdContentWithAnchor: value,
    }));
  },

  /**
   * Downloads every URL (at most `MAX_CONCURRENT_REQUESTS` at a time) and
   * rebuilds `mdContents`, `allMdContent` and `allMdContentWithAnchor`.
   *
   * Results are kept in the order of `urls`, independent of which download
   * finishes first, so chunk `i` (and anchor `i`) always belongs to `urls[i]`.
   * A failed download contributes an empty chunk and records the error.
   * Results of a call that has been superseded by a newer call are dropped.
   */
  setMdUrlArr: async (urls: string[]) => {
    const requestId = get().currentRequestId + 1;
    set({ currentRequestId: requestId, error: null });

    const isCurrentRequest = () => get().currentRequestId === requestId;

    // Never rejects: a failure is stored in `error` and yields "" as content.
    const fetchContent = async (url: string): Promise<[string, string]> => {
      try {
        const response = await axios.get<string>(url);
        return [url, response.data];
      } catch (error) {
        if (isCurrentRequest()) {
          set({ error: error as Error });
        }
        return [url, ""];
      }
    };

    // Small worker pool: each worker repeatedly claims the next unclaimed
    // index and writes its result into that same slot, preserving URL order.
    const fetchWithConcurrency = async (
      pending: string[]
    ): Promise<[string, string][]> => {
      const ordered = new Array<[string, string]>(pending.length);
      let nextIndex = 0;
      const worker = async (): Promise<void> => {
        while (nextIndex < pending.length) {
          const index = nextIndex++;
          ordered[index] = await fetchContent(pending[index]);
        }
      };
      const workerCount = Math.min(MAX_CONCURRENT_REQUESTS, pending.length);
      await Promise.all(Array.from({ length: workerCount }, () => worker()));
      return ordered;
    };

    const results = await fetchWithConcurrency(urls);
    if (!isCurrentRequest()) {
      return;
    }

    const newMdContents: Record<string, MdContent> = {};
    results.forEach(([url, content]) => {
      newMdContents[url] = { content, isLoading: false };
    });
    const contents = results.map(([, content]) => content);
    set((state) => ({
      mdContents: newMdContents,
      allMdContent: state.getAllMdContent(contents),
      allMdContentWithAnchor: state.getContentWithAnchors(contents),
    }));
  },

  /** Joins the chunks with a blank line between them. */
  getAllMdContent: (data) => {
    return data?.join("\n\n");
  },

  /**
   * Joins the chunks with a blank line between them and puts an anchor tag
   * (built from `defaultAnchorOptions` overridden by `options`) in front of
   * each chunk. The anchor id is `<prefix><chunk index>`.
   */
  getContentWithAnchors: (data: string[], options?: Partial<AnchorOptions>) => {
    const opts = { ...defaultAnchorOptions, ...options };

    const generateAnchorTag = (index: number) => {
      const id = `${opts.prefix}${index}`;
      const attributes = Object.entries(opts.customAttributes || {})
        .map(([key, value]) => `${key}="${value}"`)
        .join(" ");

      switch (opts.type) {
        case "span":
        case "div":
        case "mark":
        case "p":
          return `<${opts.type} id="${id}" style="${opts.style}" class="${opts.className}" ${attributes}></${opts.type}>`;
        case "comment":
          return `<!-- anchor: ${id} -->`;
        case "data-attribute":
          return `<span data-anchor="${id}" style="${opts.style}" class="${opts.className}" ${attributes}></span>`;
        case "hr":
          return `<hr id="${id}" style="${opts.style}" class="${opts.className}" ${attributes}>`;
        default:
          // Unknown type coming from untyped callers: fall back to a span.
          return `<span id="${id}" style="${opts.style}" class="${opts.className}" ${attributes}></span>`;
      }
    };

    return data
      ?.map((content, index) => {
        const anchorTag = generateAnchorTag(index);
        return `${anchorTag}\n\n${content}`;
      })
      .join("\n\n");
  },

  /**
   * Returns the character offset at which the chunk for `anchorId` starts in
   * the joined (anchor-free) document, or -1 when the id matches no chunk.
   * Only ids built with the default prefix are recognised.
   */
  jumpToAnchor: (anchorId: string) => {
    const { mdContents } = get();
    const contentArray = Object.values(mdContents).map(
      (content) => content.content
    );
    let totalLength = 0;
    for (let i = 0; i < contentArray.length; i++) {
      if (anchorId === `${defaultAnchorOptions.prefix}${i}`) {
        return totalLength;
      }
      totalLength += contentArray[i].length + 2; // +2 for "\n\n"
    }
    return -1; // Anchor not found
  },

  /**
   * Sends the edited Markdown of one page to the backend and, on success,
   * refreshes the local state and the derived joined documents.
   *
   * @param fileKey    Identifier sent as `file_key`.
   * @param pageNumber Key under which `newContent` is sent.
   * @param newContent Replacement Markdown.
   * @throws Re-throws any request error, and throws when the backend does not
   *   report success; the error is also stored in `error`.
   */
  updateMdContent: async (
    fileKey: string,
    pageNumber: string | number,
    newContent: string
  ) => {
    try {
      const params: UpdateMarkdownRequest = {
        file_key: fileKey,
        data: {
          [pageNumber]: newContent,
        },
      };

      const result = await updateMarkdownContent(params);

      if (result && result.success) {
        // Update the local state.
        set((state) => {
          const updatedMdContents = { ...state.mdContents };
          // NOTE(review): `setMdUrlArr` keys `mdContents` by URL, so this only
          // matches when `fileKey` equals one of those URLs — confirm the
          // intended key.
          if (updatedMdContents[fileKey]) {
            updatedMdContents[fileKey] = {
              ...updatedMdContents[fileKey],
              content: newContent,
            };
          }

          // Recompute allMdContent and allMdContentWithAnchor.
          const contentArray = Object.values(updatedMdContents).map(
            (content) => content.content
          );
          return {
            mdContents: updatedMdContents,
            allMdContent: state.getAllMdContent(contentArray),
            allMdContentWithAnchor: state.getContentWithAnchors(contentArray),
          };
        });
      } else {
        throw new Error("Failed to update Markdown content");
      }
    } catch (error) {
      set({ error: error as Error });
      throw error;
    }
  },
}));
export default useMdStore;
$page-min-witch: 1260px;
\ No newline at end of file
/** Identifiers of the extraction task kinds. */
export type ExtractTaskType =
| "pdf"
| "formula-detect"
| "formula-extract"
| "table-recogn";
/** String constants for the extractor categories. */
export const EXTRACTOR_TYPE_LIST = {
table: "table",
formula: "formula",
pdf: "PDF",
};
/** Formula task modes. */
export enum FORMULA_TYPE {
extract = "extract",
detect = "detect",
}
/** Display modes of the Markdown viewer. */
export enum MD_PREVIEW_TYPE {
preview = "preview",
code = "code",
}
/**
 * Extracts the `filename=` value from a Content-Disposition header.
 * Returns undefined when the header is absent or has no usable file name.
 */
function fileNameFromContentDisposition(
  header: string | null
): string | undefined {
  if (!header) {
    return undefined;
  }
  // Stop at ";" so parameters following the file name are not included.
  const match = /filename=([^;]*)/.exec(header);
  const name = match?.[1]?.replace(/['"]/g, "").trim();
  return name || undefined;
}

/**
 * Fetches `url` and saves the response body through a temporary, hidden
 * `<a download>` element.
 *
 * The file name is, in order of preference: the `filename` argument, the
 * `filename=` value of the response's Content-Disposition header, the last
 * path segment of `url`, and finally "download". A Content-Disposition header
 * without a file name (for example a bare "attachment") no longer aborts the
 * download.
 *
 * Best effort: any failure is logged with `console.error` and not re-thrown.
 * The temporary link and the blob URL are released even when a step fails.
 *
 * @param url      Address of the file to download.
 * @param filename Optional file name overriding the detected one.
 */
export async function downloadFileUseAScript(
  url: string,
  filename?: string
): Promise<void> {
  try {
    // Request the file.
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    // Expose the body as a blob URL the link can point to.
    const blob = await response.blob();
    const blobUrl = window.URL.createObjectURL(blob);
    try {
      // Hidden <a> element used to trigger the browser's download.
      const link = document.createElement("a");
      link.style.display = "none";
      link.href = blobUrl;
      link.download =
        filename ||
        fileNameFromContentDisposition(
          response.headers.get("Content-Disposition")
        ) ||
        url.split("/").pop() ||
        "download";

      // The link has to be in the document while it is clicked.
      document.body.appendChild(link);
      try {
        link.click();
      } finally {
        document.body.removeChild(link);
      }
    } finally {
      window.URL.revokeObjectURL(blobUrl);
    }
  } catch (error) {
    console.error("Download failed:", error);
  }
}
/** Supported UI locales; each member's value equals its name. */
export enum ELocal {
  "zh-CN" = "zh-CN",
  "en-US" = "en-US",
}

/** Short label shown for each locale. */
export const locale: Record<string, string> = {
  [ELocal["zh-CN"]]: "中文",
  [ELocal["en-US"]]: "En",
};

/** Name of the record field that holds the display name for each locale. */
export const localeName: Record<string, string> = {
  [ELocal["zh-CN"]]: "nameZh",
  [ELocal["en-US"]]: "name",
};

/**
 * Returns the locale saved in localStorage under "umi_locale", or "zh-CN"
 * when nothing (or an empty string) is saved.
 */
export const getLocale = () => {
  const storedLocale = localStorage.getItem("umi_locale");
  return storedLocale || ELocal["zh-CN"];
};
/**
 * Opens `url` by clicking a temporary `<a rel="noreferrer">` element.
 *
 * The anchor is attached to the document, activated and removed again. When
 * the element has no `click` method a synthetic click event is dispatched
 * instead, and `window.open` is the last resort if that throws.
 *
 * @param url  Address to open.
 * @param type Link target; defaults to "_blank".
 */
export const windowOpen = (
  url: string,
  type?: "_blank" | "_parent" | "_self" | "_top"
) => {
  const target = type || "_blank";

  const anchor = document.createElement("a");
  anchor.setAttribute("href", url);
  anchor.setAttribute("target", target);
  anchor.rel = "noreferrer";
  document.body.appendChild(anchor);

  if (!anchor.click) {
    try {
      const clickEvent = new Event("click", {
        bubbles: false,
        cancelable: true,
      });
      anchor.dispatchEvent(clickEvent);
    } catch {
      window.open(url, target);
    }
  } else {
    anchor.click();
  }

  document.body.removeChild(anchor);
};
/// <reference types="vite/client" />
module.exports = {
content: [
"./src/**/*.{js,jsx,ts,tsx}",
],
plugins: [
function ({ addUtilities }) {
const newUtilities = {
'.scrollbar-thin': {
scrollbarWidth: '2px',
// scrollbarColor: 'rgba(13, 83, 222, 1)',
'&::-webkit-scrollbar': {
width: '6px',
height: '6px'
},
'&::-webkit-scrollbar-track': {
backgroundColor: 'transparent'
},
'&::-webkit-scrollbar-thumb': {
// backgroundColor: 'rgba(13, 83, 222, 0.01)',
borderRadius: '20px',
border: '3px solid transparent'
},
'&:hover::-webkit-scrollbar-thumb': {
width: '6px',
border: '3px solid rgb(229 231 235)',
backgroundColor: 'rgb(229 231 235)'
}
}
// 你可以添加更多自定义的滚动条样式
};
addUtilities(newUtilities, ['responsive', 'hover']);
},
],
// ...other configurations
}
\ No newline at end of file
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["src"]
}
{
"files": [],
"compilerOptions": {
// ... other options ...
"types": ["node"]
},
"references": [
{ "path": "./tsconfig.app.json" },
{ "path": "./tsconfig.node.json" }
]
}
{
"compilerOptions": {
"target": "ES2022",
"lib": ["ES2023"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true
},
"include": ["vite.config.ts"]
}
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import path from "path";
// https://vitejs.dev/config/
// Vite configuration for the web front end.
export default defineConfig({
plugins: [react()],
server: {
// Dev server: requests whose path starts with /api are forwarded to `target`.
proxy: {
"/api": {
target: "http://localhost:5559",
changeOrigin: true,
},
},
},
css: {
modules: {
// Class names from CSS modules are exposed under camelCase keys
// (e.g. `.sidebar-show` -> `styles.sidebarShow`).
localsConvention: "camelCaseOnly", // transfer kebab-case to camelCase
scopeBehaviour: "local",
generateScopedName: "[name]__[local]___[hash:base64:5]",
},
},
publicDir: "public",
resolve: {
// Lets source files import from "@/..." instead of relative paths into ./src.
alias: {
"@": path.resolve(__dirname, "./src"),
},
},
});
# MinerU Local web_demo
## Feature Overview
<p align="center">
<img src="images/web_demo_1.png" width="600px" style="vertical-align:middle;">
</p>
- Supports uploading PDFs and calling MinerU for processing
- Supports online editing of the Markdown results parsed by MinerU
- Supports viewing of historical tasks
## Installation and Deployment
0. MinerU Installation and Deployment
```
# The service depends on MinerU; please make sure MinerU is installed first
```
1. Package the front-end interface
```bash
# First, navigate to the front-end directory
cd projects/web
# Modify the configuration
# Change the IP in the target field of the file vite.config.ts to your own computer's IP
# Build the front-end project
npm install -g yarn
yarn install
yarn build
```
2. Install service dependencies
```bash
# First, navigate to the back-end directory
cd projects/web_demo
# Install dependencies
pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
```
3. Start the service
```bash
# Navigate to the program directory
cd projects/web_demo/web_demo
# Start the service
python3 app.py  # or: python app.py
# Access the interface by visiting the started address in the browser
```
P.S. API documentation:
```
https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
```
# MinerU本地web_demo
## 功能简介
<p align="center">
<img src="images/web_demo_1.png" width="600px" style="vertical-align:middle;">
</p>
- 支持上传pdf,并调用MinerU进行处理
- 支持对MinerU解析的Markdown结果进行在线修改
- 支持查看历史任务
## 安装部署
0. MinerU安装部署
```
# 服务依赖MinerU,请先确保MinerU已安装
```
1. 打包前端界面
```bash
# 先进入前端目录
cd projects/web
# 修改配置
# 将文件vite.config.ts中的target中的IP更改为自己电脑IP
# 打包前端项目
npm install -g yarn
yarn install
yarn build
```
2. 安装服务依赖
```bash
# 先进入后端目录
cd projects/web_demo
# 安装依赖
pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
```
3. 启动服务
```bash
# 进入程序目录
cd projects/web_demo/web_demo
# 启动服务
python3 app.py  # 或者: python app.py
# 在浏览器访问启动的地址即可访问界面
```
ps:接口文档
```
https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment