123 lines
3.1 KiB
TypeScript
123 lines
3.1 KiB
TypeScript
/**
|
|
* URL Fetch tool - fetch and extract content from web pages
|
|
*/
|
|
|
|
import { Tool } from 'ollama';
|
|
import { ToolHandler, ToolResult } from './types';
|
|
|
|
export const urlFetchTool: Tool = {
|
|
type: 'function',
|
|
function: {
|
|
name: 'fetch_url',
|
|
description:
|
|
'Fetch content from a URL and extract the main text. Useful for reading articles, documentation, or any web page content.',
|
|
parameters: {
|
|
type: 'object',
|
|
properties: {
|
|
url: {
|
|
type: 'string',
|
|
description: 'The URL to fetch content from',
|
|
},
|
|
max_length: {
|
|
type: 'number',
|
|
description:
|
|
'Maximum number of characters to return (default: 5000). Longer content will be truncated.',
|
|
},
|
|
},
|
|
required: ['url'],
|
|
},
|
|
},
|
|
};
|
|
|
|
// Simple HTML to text conversion
|
|
function htmlToText(html: string): string {
|
|
return (
|
|
html
|
|
// Remove scripts and styles
|
|
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
|
|
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
|
|
// Remove HTML comments
|
|
.replace(/<!--[\s\S]*?-->/g, '')
|
|
// Replace common block elements with newlines
|
|
.replace(/<\/(p|div|h[1-6]|li|tr|br)[^>]*>/gi, '\n')
|
|
.replace(/<(br|hr)[^>]*\/?>/gi, '\n')
|
|
// Remove remaining HTML tags
|
|
.replace(/<[^>]+>/g, ' ')
|
|
// Decode common HTML entities
|
|
.replace(/ /g, ' ')
|
|
.replace(/&/g, '&')
|
|
.replace(/</g, '<')
|
|
.replace(/>/g, '>')
|
|
.replace(/"/g, '"')
|
|
.replace(/'/g, "'")
|
|
// Clean up whitespace
|
|
.replace(/[ \t]+/g, ' ')
|
|
.replace(/\n\s*\n/g, '\n\n')
|
|
.trim()
|
|
);
|
|
}
|
|
|
|
export const urlFetchHandler: ToolHandler = async (args): Promise<ToolResult> => {
|
|
const url = args.url as string;
|
|
const maxLength = (args.max_length as number) || 5000;
|
|
|
|
if (!url) {
|
|
return {
|
|
success: false,
|
|
error: 'No URL provided',
|
|
};
|
|
}
|
|
|
|
// Validate URL
|
|
try {
|
|
new URL(url);
|
|
} catch {
|
|
return {
|
|
success: false,
|
|
error: 'Invalid URL format',
|
|
};
|
|
}
|
|
|
|
try {
|
|
const response = await fetch(url, {
|
|
headers: {
|
|
'User-Agent': 'Mozilla/5.0 (compatible; ChatGPZ/1.0; +https://github.com/your-repo)',
|
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
},
|
|
signal: AbortSignal.timeout(10000), // 10 second timeout
|
|
});
|
|
|
|
if (!response.ok) {
|
|
return {
|
|
success: false,
|
|
error: `HTTP ${response.status}: ${response.statusText}`,
|
|
};
|
|
}
|
|
|
|
const contentType = response.headers.get('content-type') || '';
|
|
const html = await response.text();
|
|
|
|
let text: string;
|
|
if (contentType.includes('text/plain')) {
|
|
text = html;
|
|
} else {
|
|
text = htmlToText(html);
|
|
}
|
|
|
|
// Truncate if needed
|
|
if (text.length > maxLength) {
|
|
text = `${text.substring(0, maxLength)}\n\n[Content truncated...]`;
|
|
}
|
|
|
|
return {
|
|
success: true,
|
|
result: `Content from ${url}:\n\n${text}`,
|
|
};
|
|
} catch (error) {
|
|
return {
|
|
success: false,
|
|
error: error instanceof Error ? error.message : 'Failed to fetch URL',
|
|
};
|
|
}
|
|
};
|