After trying Hugging Face and spaCy, I have been messing about with straight Python.
Hugging Face and spaCy went pretty well, but they are too complicated (for the average person), IMO, due to possible dependency hell.
So I went with straight Python as it is easier for anyone to set up.
The code is slow and still being worked on but works.
http://bezazz.com/Archive.zip
EDIT: You would need to pip install:
aiohttp
flask-caching
requests
beautifulsoup4
markupsafe
I recommend a venv (see the Python docs: "venv — Creation of virtual environments").
uvicorn chatGPT:app --host 0.0.0.0 --port 5000
EDIT2: Here is where I stopped with spaCy. The archive linked above is Python only.
import requests
import spacy
from flask import Flask, render_template, request
from urllib.parse import quote_plus
# Load the spaCy "en_core_web_sm" pipeline once at import time so that every
# request reuses the same loaded model (used by rank_result_by_spacy).
nlp = spacy.load("en_core_web_sm")
# JSON search endpoint of the local YaCy instance (queried by query_yacy).
yacy_url = "http://localhost:8090/yacysearch.json"
# Flask application object; the route below is registered on it.
app = Flask(__name__)
def rank_result_by_spacy(text):
    """Score *text* by the number of named entities spaCy finds in it.

    Only entities labelled PERSON, ORG or GPE are counted; this is an
    example scoring scheme, not a tuned relevance measure.

    Args:
        text: The text to analyse. ``None`` or an empty string scores 0.

    Returns:
        The number of matching entities found, as an ``int``.
    """
    if not text:
        return 0
    # spaCy raises ValueError for inputs longer than ``nlp.max_length``.
    # Score the leading part of an oversized page instead of failing.
    doc = nlp(text[:nlp.max_length])
    wanted_labels = {"PERSON", "ORG", "GPE"}
    return sum(1 for ent in doc.ents if ent.label_ in wanted_labels)
def query_yacy(query):
    """Query YaCy and return the individual search result items.

    Args:
        query: The search term entered by the user.

    Returns:
        A list of result dicts (the caller reads their "title" and "link"
        keys). An empty list is returned when the request fails, the
        response is not valid JSON, or no items are present.
    """
    params = {
        "query": query,          # The search term entered by the user
        "maximumRecords": 5,     # Get top 5 results
        "contentdom": "text",    # Search only for text-based content
    }
    try:
        # Without a timeout an unresponsive YaCy would block this worker forever.
        response = requests.get(yacy_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Error querying YaCy: {e}")
        return []

    # YaCy wraps the hits: {"channels": [{..., "items": [hit, ...]}]}.
    # Returning the channels themselves would rank the channel metadata
    # instead of the actual search results.
    channels = data.get("channels", []) if isinstance(data, dict) else []
    items = []
    for channel in channels:
        if isinstance(channel, dict):
            items.extend(channel.get("items") or [])
    return items
def get_cached_page(url):
    """Fetch the cached copy of *url* from the local YaCy instance.

    Args:
        url: The address of the page whose cached copy is wanted.

    Returns:
        The response body as text, or ``None`` when *url* is empty or the
        request fails.
    """
    if not url:
        # quote_plus(None) would raise TypeError; there is nothing to fetch.
        return None
    cached_url = f"http://localhost:8090/ViewCachedPage?url={quote_plus(url)}"
    try:
        # Without a timeout an unresponsive YaCy would block this worker forever.
        response = requests.get(cached_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error retrieving cached page for URL {url}: {e}")
        return None
    return response.text
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the search page and, on POST, the spaCy-ranked results.

    On a POST with a non-blank "query" form field, the query is sent to
    YaCy, each result's cached page is scored with spaCy, and the results
    are sorted by descending score.

    Returns:
        The rendered "index.html" template, given ``results`` as a list of
        ``(score, title, url)`` tuples (empty on GET or for a blank query).
    """
    results_with_scores = []
    if request.method == "POST":
        # Treat a whitespace-only query like an empty one.
        query = (request.form.get("query") or "").strip()
        if query:
            for result in query_yacy(query):
                page_title = result.get("title")
                page_url = result.get("link")
                # A result without a link cannot be fetched; skip the lookup
                # rather than let URL quoting fail on None.
                cached_content = get_cached_page(page_url) if page_url else None
                # No cached content means nothing to analyse: score 0.
                score = rank_result_by_spacy(cached_content) if cached_content else 0
                results_with_scores.append((score, page_title, page_url))
            # Highest spaCy score first.
            results_with_scores.sort(reverse=True, key=lambda x: x[0])
    return render_template("index.html", results=results_with_scores)
if __name__ == '__main__':
    import os

    # The Werkzeug debugger allows arbitrary code execution, so it must not
    # be on by default while listening on all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 during local development.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=5000, debug=debug)
I just asked ChatGPT to generate an index.html, as the original is long gone.
<!DOCTYPE html>
<!--
  Search page for the Flask app: renders the search form and, when the view
  supplies a non-empty `results` list of (score, title, url) tuples, the
  ranked results.
-->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>YaCy Search</title>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/4.5.2/css/bootstrap.min.css">
    <style>
        body {
            padding-top: 50px;
        }
        .result {
            border-bottom: 1px solid #ddd;
            padding-bottom: 15px;
            margin-bottom: 15px;
        }
        .score {
            font-size: 1.2em;
            font-weight: bold;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1 class="text-center">YaCy Search with SpaCy Ranking</h1>
        <!-- Search Form -->
        <form method="POST" class="form-inline justify-content-center">
            <input type="text" name="query" class="form-control" placeholder="Search..." aria-label="Search query" required>
            <button type="submit" class="btn btn-primary ml-2">Search</button>
        </form>
        {% if results %}
        <div class="mt-4">
            <h3>Search Results</h3>
            <div class="list-group">
                {% for score, title, url in results %}
                <div class="result">
                    {# rel="noopener noreferrer" stops the opened page from reaching back via window.opener #}
                    <h4><a href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ title }}</a></h4>
                    <p class="score">Score: {{ score }}</p>
                    <p><a href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ url }}</a></p>
                </div>
                {% endfor %}
            </div>
        </div>
        {% elif request.method == "POST" %}
        {# Only report "no results" after a search was submitted, not on the first page load. #}
        <div class="mt-4">
            <p>No results found. Please try another search.</p>
        </div>
        {% endif %}
    </div>
    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
    <!-- Bootstrap 4 depends on Popper v1; Popper v2 (@popperjs/core) is not compatible with it. -->
    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.1/dist/umd/popper.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/4.5.2/js/bootstrap.min.js"></script>
</body>
</html>