Building a Content Extraction Workflow

This example demonstrates how to build a complete content extraction workflow using Dumpling AI. We’ll show how to:

  1. Scrape web content
  2. Extract the most important information
  3. Store the information in a knowledge base
  4. Create a search interface for the extracted content

Setup

First, let’s set up our environment and API key:

const axios = require('axios');

// Configuration
const API_KEY = 'YOUR_API_KEY';
const BASE_URL = 'https://app.dumplingai.com/api/v1';
const KNOWLEDGE_BASE_ID = 'YOUR_KNOWLEDGE_BASE_ID'; // Create this in the dashboard

/**
 * POSTs `data` to the given Dumpling AI endpoint and returns the parsed
 * response body. On failure the error is logged (preferring the API's
 * response body when one exists) and rethrown for the caller to handle.
 *
 * @param {string} endpoint - Path relative to BASE_URL, e.g. 'scrape'.
 * @param {Object} data - JSON-serializable request payload.
 * @returns {Promise<Object>} The response body from the API.
 */
async function callDumplingAPI(endpoint, data) {
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${API_KEY}`
    }
  };

  try {
    const { data: body } = await axios.post(`${BASE_URL}/${endpoint}`, data, requestConfig);
    return body;
  } catch (error) {
    const details = error.response ? error.response.data : error.message;
    console.error(`Error calling ${endpoint}:`, details);
    throw error;
  }
}

Step 1: Scrape Web Content

First, let’s scrape content from a website:

/**
 * Scrapes a single URL through the Dumpling AI `scrape` endpoint, asking
 * for cleaned markdown with JavaScript rendering enabled.
 *
 * @param {string} url - The page to scrape.
 * @returns {Promise<Object>} The API result (includes `title` and `content`).
 */
async function scrapeWebsite(url) {
  console.log(`Scraping website: ${url}`);

  const scrapeRequest = {
    url,
    format: 'markdown',
    cleaned: true,
    renderJs: true
  };
  const result = await callDumplingAPI('scrape', scrapeRequest);

  console.log(`Successfully scraped: ${result.title}`);
  return result;
}

// Usage
// NOTE(review): top-level `await` is only valid in an ES module; this file
// uses CommonJS `require`, so wrap calls like this in an async function
// (or run as an .mjs module) — confirm the intended module format.
const url = 'https://example.com/article';
const scrapedContent = await scrapeWebsite(url);

Step 2: Extract Key Information

Next, let’s use an AI agent to extract the most important information from the content:

/**
 * Uses a Dumpling AI agent to pull structured key information
 * (title, summary, key points, entities, categories) out of raw content.
 *
 * @param {string} content - Scraped page content (markdown or plain text).
 * @param {string} title - Page title, used only for logging.
 * @param {string} [agentId='YOUR_AGENT_ID'] - Dumpling AI agent to use.
 *   Parameterized so callers can switch agents; the default preserves the
 *   original placeholder, so existing call sites are unaffected.
 * @returns {Promise<Object>} The agent's parsed JSON response.
 */
async function extractKeyInformation(content, title, agentId = 'YOUR_AGENT_ID') {
  console.log(`Extracting key information from: ${title}`);
  
  const prompt = `
    Extract the most important information from the following content. 
    Format your response as JSON with the following structure:
    {
      "title": "The main title",
      "summary": "A concise summary (max 150 words)",
      "keyPoints": ["point 1", "point 2", "point 3"],
      "entities": [{"name": "Entity name", "type": "person/organization/location/etc"}],
      "categories": ["category1", "category2"]
    }
    
    Content:
    ${content}
  `;
  
  const result = await callDumplingAPI('agents/generate-completion', {
    messages: [
      {
        role: 'user',
        content: prompt
      }
    ],
    agentId,
    parseJson: true
  });
  
  // Guard against a malformed agent response: the original logged
  // `result.parsedJson.keyPoints.length` directly, which throws if the
  // agent did not return the expected structure.
  const keyPointCount = result.parsedJson?.keyPoints?.length ?? 0;
  console.log(`Successfully extracted information with ${keyPointCount} key points`);
  return result.parsedJson;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const extractedInfo = await extractKeyInformation(scrapedContent.content, scrapedContent.title);

Step 3: Store in Knowledge Base

Now, let’s store the extracted information in a knowledge base:

/**
 * Stores extracted information in the configured knowledge base, formatted
 * as a small markdown document with source metadata attached.
 *
 * @param {Object} extractedInfo - Output of extractKeyInformation
 *   (title, summary, keyPoints, entities, categories).
 * @param {string} sourceUrl - Original URL the content came from.
 * @returns {Promise<Object>} The API result (includes the new item's `id`).
 */
async function storeInKnowledgeBase(extractedInfo, sourceUrl) {
  console.log(`Storing information in knowledge base...`);
  
  // Build the markdown without leading indentation. The previous indented
  // template literal prefixed every line with four-plus spaces, which
  // markdown treats as a code block — so the `#`/`##` headings never
  // rendered as headings.
  const formattedContent = [
    `# ${extractedInfo.title}`,
    '',
    '## Summary',
    extractedInfo.summary,
    '',
    '## Key Points',
    extractedInfo.keyPoints.map(point => `- ${point}`).join('\n'),
    '',
    '## Categories',
    extractedInfo.categories.join(', ')
  ].join('\n');
  
  const result = await callDumplingAPI('knowledge-bases/add', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    content: formattedContent,
    metadata: {
      title: extractedInfo.title,
      source: sourceUrl,
      extractedAt: new Date().toISOString(),
      categories: extractedInfo.categories,
      entities: extractedInfo.entities
    }
  });
  
  console.log(`Successfully stored in knowledge base with ID: ${result.id}`);
  return result;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const storedResult = await storeInKnowledgeBase(extractedInfo, url);

Step 4: Create a Search Interface

Finally, let’s create a simple function to search our knowledge base:

/**
 * Searches the configured knowledge base and prints a short human-readable
 * summary of each match (score, metadata, 200-char content preview).
 *
 * @param {string} query - Free-text search query.
 * @returns {Promise<Array>} The raw result items from the API.
 */
async function searchKnowledgeBase(query) {
  console.log(`Searching knowledge base for: ${query}`);
  
  const response = await callDumplingAPI('knowledge-bases/search', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    query,
    maxResults: 5
  });
  
  const matches = response.results;
  console.log(`Found ${matches.length} results for query: ${query}`);
  
  // Format and display results
  for (const [index, item] of matches.entries()) {
    const extractedAt = new Date(item.metadata.extractedAt).toLocaleString();
    console.log(`\nResult ${index + 1} (Score: ${item.score.toFixed(2)}):`);
    console.log(`Title: ${item.metadata.title}`);
    console.log(`Source: ${item.metadata.source}`);
    console.log(`Extracted: ${extractedAt}`);
    console.log(`Categories: ${item.metadata.categories.join(', ')}`);
    console.log(`\nContent Preview: ${item.content.substring(0, 200)}...`);
  }
  
  return matches;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const searchResults = await searchKnowledgeBase("analytics dashboard features");

Putting It All Together

Now, let’s put everything together into a complete workflow:

/**
 * Runs the full scrape → extract → store pipeline for each URL, one at a
 * time, and records a per-URL outcome so one failure never aborts the batch.
 *
 * @param {Array<string>} urls - URLs to process.
 * @returns {Promise<Array<Object>>} One entry per URL:
 *   { url, title, success, knowledgeBaseItemId } on success,
 *   { url, success: false, error } on failure.
 */
async function contentExtractionWorkflow(urls) {
  const outcomes = [];
  
  for (const pageUrl of urls) {
    try {
      console.log(`\n==== Processing URL: ${pageUrl} ====\n`);
      
      // Step 1: Scrape website
      const page = await scrapeWebsite(pageUrl);
      
      // Step 2: Extract key information
      const info = await extractKeyInformation(page.content, page.title);
      
      // Step 3: Store in knowledge base
      const stored = await storeInKnowledgeBase(info, pageUrl);
      
      outcomes.push({
        url: pageUrl,
        title: info.title,
        success: true,
        knowledgeBaseItemId: stored.id
      });
    } catch (error) {
      console.error(`Error processing URL ${pageUrl}:`, error);
      outcomes.push({
        url: pageUrl,
        success: false,
        error: error.message
      });
    }
  }
  
  const successCount = outcomes.filter(entry => entry.success).length;
  console.log(`\n==== Processing Complete ====\n`);
  console.log(`Successfully processed ${successCount} out of ${urls.length} URLs`);
  
  return outcomes;
}

// Example usage (top-level `await` requires an ES module context)
const urlsToProcess = [
  'https://example.com/article1',
  'https://example.com/article2',
  'https://example.com/blog/post1'
];

const workflowResults = await contentExtractionWorkflow(urlsToProcess);

// Search the processed content. Renamed from `searchResults`: the earlier
// search example already declares `const searchResults`, so reusing that
// name is a SyntaxError if these snippets run in one script.
const trendResults = await searchKnowledgeBase("machine learning trends");

Web Interface Example

Here’s a simple Express.js backend with a search endpoint:

const express = require('express');
const cors = require('cors');

const app = express();
const port = 3000;

// Allow cross-origin requests (the static frontend runs on another origin)
// and parse JSON request bodies.
app.use(cors());
app.use(express.json());

/**
 * POST /api/search — searches the knowledge base for `req.body.query`.
 * Responds 400 when the query is missing and 500 on any search failure.
 */
app.post('/api/search', async (req, res) => {
  try {
    const query = req.body.query;
    
    if (!query) {
      return res.status(400).json({ error: 'Query is required' });
    }
    
    const results = await searchKnowledgeBase(query);
    
    // Trim each result down to what the frontend renders.
    const formatted = results.map(item => ({
      title: item.metadata.title,
      source: item.metadata.source,
      extractedAt: item.metadata.extractedAt,
      categories: item.metadata.categories,
      preview: item.content.substring(0, 200) + '...',
      score: item.score
    }));
    
    res.json({ query, results: formatted });
  } catch (error) {
    console.error('Search error:', error);
    res.status(500).json({ error: 'Search failed', details: error.message });
  }
});

app.listen(port, () => {
  console.log(`Server running on port ${port}`);
});

And a simple HTML frontend:

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Knowledge Base Search</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      max-width: 800px;
      margin: 0 auto;
      padding: 20px;
    }
    .search-box {
      display: flex;
      margin-bottom: 20px;
    }
    .search-box input {
      flex-grow: 1;
      padding: 10px;
      font-size: 16px;
      border: 1px solid #ddd;
      border-radius: 4px 0 0 4px;
    }
    .search-box button {
      padding: 10px 20px;
      background: #000;
      color: white;
      border: none;
      border-radius: 0 4px 4px 0;
      cursor: pointer;
    }
    .result {
      margin-bottom: 20px;
      padding: 15px;
      border: 1px solid #ddd;
      border-radius: 4px;
    }
    .result-title {
      margin-top: 0;
      font-size: 18px;
    }
    .result-meta {
      color: #666;
      font-size: 14px;
      margin-bottom: 10px;
    }
    .result-preview {
      color: #333;
      line-height: 1.5;
    }
    .categories {
      display: flex;
      flex-wrap: wrap;
      gap: 5px;
    }
    .category {
      background: #f0f0f0;
      padding: 3px 8px;
      border-radius: 12px;
      font-size: 12px;
    }
  </style>
</head>
<body>
  <h1>Knowledge Base Search</h1>
  
  <div class="search-box">
    <input type="text" id="search-input" placeholder="Search the knowledge base...">
    <button id="search-button">Search</button>
  </div>
  
  <div id="results-container"></div>
  
  <script>
    document.getElementById('search-button').addEventListener('click', performSearch);
    document.getElementById('search-input').addEventListener('keypress', function(e) {
      if (e.key === 'Enter') {
        performSearch();
      }
    });
    
    // Escape untrusted text before it is interpolated into innerHTML.
    // Titles, sources, categories and previews come from scraped web pages,
    // so rendering them unescaped would allow stored XSS.
    function escapeHtml(value) {
      return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    }
    
    // Sends the query to the backend and renders the results.
    async function performSearch() {
      const query = document.getElementById('search-input').value.trim();
      
      if (!query) return;
      
      const resultsContainer = document.getElementById('results-container');
      resultsContainer.innerHTML = '<p>Searching...</p>';
      
      try {
        const response = await fetch('http://localhost:3000/api/search', {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({ query })
        });
        
        // Surface server-side errors instead of crashing on a missing
        // `results` field when the backend returns { error: ... }.
        if (!response.ok) {
          const errorBody = await response.json().catch(() => ({}));
          throw new Error(errorBody.error || `Request failed with status ${response.status}`);
        }
        
        const data = await response.json();
        
        if (data.results.length === 0) {
          resultsContainer.innerHTML = '<p>No results found.</p>';
          return;
        }
        
        resultsContainer.innerHTML = '';
        
        data.results.forEach(result => {
          const resultElement = document.createElement('div');
          resultElement.className = 'result';
          
          const date = new Date(result.extractedAt).toLocaleDateString();
          
          resultElement.innerHTML = `
            <h3 class="result-title">${escapeHtml(result.title)}</h3>
            <div class="result-meta">
              <div>Source: <a href="${escapeHtml(result.source)}" target="_blank" rel="noopener noreferrer">${escapeHtml(result.source)}</a></div>
              <div>Extracted: ${date}</div>
              <div class="categories">
                ${result.categories.map(cat => `<span class="category">${escapeHtml(cat)}</span>`).join('')}
              </div>
            </div>
            <p class="result-preview">${escapeHtml(result.preview)}</p>
          `;
          
          resultsContainer.appendChild(resultElement);
        });
      } catch (error) {
        resultsContainer.innerHTML = `<p>Error: ${escapeHtml(error.message)}</p>`;
      }
    }
  </script>
</body>
</html>

Conclusion

This example demonstrates a complete content extraction workflow using Dumpling AI:

  1. Scraping web content with the scrape endpoint
  2. Extracting key information with AI agents
  3. Storing and organizing the information in a knowledge base
  4. Creating a search interface to make the information easily accessible

You can extend this workflow by:

  • Adding scheduled scraping of websites for regular updates
  • Implementing content categorization and tagging
  • Adding user authentication for personalized search experiences
  • Creating visualizations of the extracted data

For more examples and ideas, check out our other guides and tutorials.