Learn how to extract and process web content using DumplingAI
const axios = require('axios');

/**
 * Scrape a single page through the DumplingAI scrape endpoint.
 *
 * Sends a POST request asking for cleaned Markdown output, logs the returned
 * title plus a preview of at most 500 characters of the content, and
 * resolves with the response body.
 *
 * Failures are logged with console.error and are NOT rethrown: in that case
 * the returned promise resolves to `undefined`, so callers must check the
 * result before using it.
 *
 * The bearer token is taken from the DUMPLINGAI_API_KEY environment variable
 * when it is set; otherwise the literal placeholder 'YOUR_API_KEY' is sent.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<object|undefined>} The response body, or `undefined`
 *   when the request failed.
 */
async function scrapePage(url) {
  // Keep real credentials out of source control; the placeholder remains the
  // fallback so the snippet behaves as before when the variable is unset.
  const apiKey = process.env.DUMPLINGAI_API_KEY || 'YOUR_API_KEY';

  try {
    const response = await axios.post(
      'https://app.dumplingai.com/api/v1/scrape',
      { url, format: 'markdown', cleaned: true },
      {
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${apiKey}`,
        },
      },
    );

    const data = response.data;

    // Build the preview defensively: a successful response without a string
    // `content` must not be reported as a request failure just because the
    // logging code could not slice it.
    const title = data ? data.title : undefined;
    const content = data && typeof data.content === 'string' ? data.content : '';
    // Only add the ellipsis when something was actually cut off.
    const preview = content.length > 500 ? `${content.substring(0, 500)}...` : content;

    console.log('Page Title:', title);
    console.log('Content:', preview);

    return data;
  } catch (error) {
    // error.response is set when the server answered with an error status;
    // otherwise fall back to the generic message (network failure, etc.).
    console.error('Error:', error.response ? error.response.data : error.message);
    return undefined;
  }
}

scrapePage('https://example.com');
// Same as before, but with format set to 'html'
{ url: 'https://example.com', format: 'html', cleaned: true }
// Request-body variant: Markdown output, cleaned, with renderJs enabled.
// NOTE(review): renderJs presumably asks the service to execute the page's
// JavaScript before extracting content — confirm against the API docs.
{ url: 'https://example.com', format: 'markdown', cleaned: true, renderJs: true }
const cheerio = require('cheerio');

/**
 * Collect the hyperlinks found on a scraped page.
 *
 * Scrapes `url` with scrapePage and, when the result reports
 * `format === 'html'`, parses its `content` with cheerio and returns every
 * `<a>` element that has a non-empty `href`. Any other format yields an
 * empty array; Markdown content would need a Markdown parser.
 *
 * NOTE(review): scrapePage in this file always requests `format: 'markdown'`
 * and takes no format argument, so whether the HTML branch is ever reached
 * depends on what the API puts in the response — confirm.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<Array<{href: string, text: string}>>} Links in document
 *   order; empty when scraping failed or the content is not HTML.
 */
async function extractLinks(url) {
  const scrapedData = await scrapePage(url);

  // scrapePage logs failures and resolves to undefined instead of throwing,
  // so guard before touching any property of the result.
  if (!scrapedData || scrapedData.format !== 'html') {
    return [];
  }

  const $ = cheerio.load(scrapedData.content);
  const links = [];

  $('a').each((_index, element) => {
    const anchor = $(element);
    const href = anchor.attr('href');
    // Skip anchors without a usable href (missing or empty attribute).
    if (href) {
      links.push({ href, text: anchor.text().trim() });
    }
  });

  return links;
}

extractLinks('https://example.com')
  .then((links) => {
    console.log(`Found ${links.length} links:`);
    links.slice(0, 10).forEach((link) => {
      console.log(`- ${link.text}: ${link.href}`);
    });
  })
  .catch((error) => {
    // Never leave the demo promise chain without a rejection handler.
    console.error('Error:', error.message);
  });
/**
 * Scrape a page and add its content to a DumplingAI knowledge base.
 *
 * Scrapes `url` with scrapePage, then POSTs the content together with
 * metadata (source URL, page title, ISO-8601 scrape timestamp) to the
 * knowledge-base "add" endpoint and resolves with that response body.
 *
 * The bearer token is taken from the DUMPLINGAI_API_KEY environment variable
 * when it is set; otherwise the literal placeholder 'YOUR_API_KEY' is sent.
 *
 * @param {string} url - Address of the page to scrape and store.
 * @param {string} knowledgeBaseId - Identifier of the target knowledge base.
 * @returns {Promise<object>} Body of the knowledge-base API response.
 * @throws {Error} When scraping produced no data. Rejections from the
 *   axios.post call are not caught here and propagate to the caller.
 */
async function scrapeAndStoreInKnowledgeBase(url, knowledgeBaseId) {
  const scrapedData = await scrapePage(url);

  // scrapePage logs failures and resolves to undefined; fail with a clear
  // message here instead of a TypeError on `scrapedData.content`, and before
  // any request is sent to the knowledge base.
  if (!scrapedData) {
    throw new Error(`Scraping ${url} returned no data; nothing was added to the knowledge base`);
  }

  // Keep real credentials out of source control; the placeholder remains the
  // fallback so the snippet behaves as before when the variable is unset.
  const apiKey = process.env.DUMPLINGAI_API_KEY || 'YOUR_API_KEY';

  const response = await axios.post(
    'https://app.dumplingai.com/api/v1/knowledge-bases/add',
    {
      knowledgeBaseId,
      content: scrapedData.content,
      metadata: {
        source: url,
        title: scrapedData.title,
        scrapedAt: new Date().toISOString(),
      },
    },
    {
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${apiKey}`,
      },
    },
  );

  return response.data;
}