noellarkin Posted January 4, 2023 Posted January 4, 2023 Readability is a Python Library that emulates the "Reading Mode" used by Browsers, ie it takes an input URL, and returns the simplified HTML. It removes headers, footers and scripts. I made a simple server out of it, which takes CLI arguments for server IP and server Port to start the server. Default IP and port are 127.0.0.1:8900 Example requests that can be made: http://127.0.0.1:8900?url=https://google.com&output_type=TITLE http://127.0.0.1:8900?url=https://google.com&output_type=SHORT_TITLE http://127.0.0.1:8900?url=https://google.com&output_type=CONTENT http://127.0.0.1:8900?url=https://google.com&output_type=SUMMARY http://127.0.0.1:8900/health (to check if the server is running) expandcollapse popupimport http.server import requests import re import logging import sys from readability import Document # Set up logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") class RequestHandler(http.server.BaseHTTPRequestHandler): def do_GET(self): # Log the request logging.info(f"Received request: {self.path}") # Regular expression to match URLs URL_REGEX = re.compile(r"^https?://.+$") # Allowed output types ALLOWED_OUTPUT_TYPES = ["TITLE", "SHORT_TITLE", "CONTENT", "SUMMARY"] if self.path == "/health": # This is a health check request, return a 200 status code self.send_response(200) self.send_header("Content-type", "text/plain") self.send_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0") self.end_headers() self.wfile.write(b"OK") else: # Parse the query string to get the URL and output type query_string = self.path[2:] query_params = query_string.split("&") url = query_params[0].split("=")[1] output_type = query_params[1].split("=")[1] # Validate the input if not URL_REGEX.match(url): # URL is invalid self.send_response(400) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(b"Invalid URL") elif output_type not in ALLOWED_OUTPUT_TYPES: # Output type is invalid self.send_response(400) self.send_header("Content-type", "text/plain") self.send_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0") self.end_headers() self.wfile.write(b"Invalid output type") else: # Input is valid, proceed with processing the request try: doc = Document(requests.get(url).content) output = { "TITLE": doc.title(), "SHORT_TITLE": doc.short_title(), "CONTENT": doc.content(), "SUMMARY": doc.summary() }[output_type] # Send the response self.send_response(200) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(output.encode()) except Exception as e: # Log the error logging.error(f"Error: {e}") # Return an error message to the client self.send_response(500) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(b"An error occurred while processing the request") # Get the server IP and port from the command line arguments server_ip = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1" server_port = int(sys.argv[2]) if len(sys.argv) > 2 else 8900 # Create the server and run it indefinitely server_address = (server_ip, server_port) httpd = http.server.HTTPServer(server_address, RequestHandler) # Log an info message when the server starts logging.info("Server started") httpd.serve_forever() Note: make sure you have the readability library https://github.com/buriy/python-readability before using this pip install readability-lxml
noellarkin Posted January 4, 2023 Author Posted January 4, 2023 Example use cases: Wordpress Blog Posthttps://lmilosis.wordpress.com/2020/01/26/19/ http://127.0.0.1:8900/?url=https://lmilosis.wordpress.com/2020/01/26/19/&output_type=SUMMARY News Articlehttps://us.cnn.com/2023/01/04/weather/severe-storm-tornado-threat-south-wednesday/index.html http://127.0.0.1:8900/?url=https://us.cnn.com/2023/01/04/weather/severe-storm-tornado-threat-south-wednesday/index.html&output_type=SUMMARY It doesn't do too welll with JS heavy sites. You may want to edit the script to take HTML source as input instead if you're using another tool to scrape the HTML.
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now