LinkFinder performs a broad crawl looking for webpages with comment sections. When it finds one, it marks the page as a match and uploads it to the database. These pages can then be sorted through later and used for SEO. I used Python with Scrapy for most of the project, PHP with HTML & CSS for the link viewer, and a MySQL database for storage.
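For reference, here is a minimal sketch of the matches table everything below writes to. The table and column names (url, trust-flow, citation-flow, language) come from the queries used elsewhere in the project; the column types and the id key are my assumptions, not something the project documents.

import mysql.connector

# Hypothetical one-time setup script. Column names match the project's queries;
# the types and the auto-increment id are assumptions.
mydb = mysql.connector.connect(host="localhost", user="root", password="", database="links")
cursor = mydb.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `matches` (
        `id` INT AUTO_INCREMENT PRIMARY KEY,
        `url` VARCHAR(500) NOT NULL,
        `trust-flow` INT DEFAULT NULL,
        `citation-flow` INT DEFAULT NULL,
        `language` VARCHAR(10) DEFAULT NULL
    )
""")
mydb.commit()
cursor.close()
mydb.close()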
The spider starts by crawling my seed sites in start_urls and extracting any links. For each page, we check whether it has a comment section by searching for elements with an id of 'comment' or 'comments' or a class of 'comments', since these are strong indicators that a comment section is present. If it's a match, we yield that URL into the pipeline, then follow every link on the page so the crawl keeps expanding.
import scrapy


class LinkSpider(scrapy.Spider):
    name = "links"
    start_urls = [
        "https://www.bobvila.com/",
        "https://www.uglyducklinghouse.com/",
        "https://allurausa.com/blog/33-home-renovation-blogs",
        "https://www.bhg.com/home-improvement/",
        "https://www.windowreplacementcost.org/",
        "https://lp4.networx.com/",
        "https://www.bathandshower.org/",
        "https://www.thespruce.com/",
        "https://www.mrbuilditandco.com/",
        "https://interior.feedspot.com/home_improvement_blogs/",
        "https://www.familyhandyman.com/",
        "https://www.littlehouseonthecorner.com/",
        "https://www.budgetdumpster.com/blog/best-home-improvement-blogs",
        "https://www.hgtv.com/design/remodel/topics/home-improvement",
        "https://www.bathandshower.org/",
        "https://www.localcarpetquotes.com/",
        "https://carpet-cleaning.homeblue.com/",
        "https://www.stanleysteemer.com/services/carpet-cleaning",
        "https://www.stanleysteemer.com/locations/TN/Nashville/996",
        "https://www.homeadvisor.com/c.Carpet-Upholstery-Cleaning.Nashville.TN.-12011.html",
        "https://evercleantn.com/",
        "https://www.zerorez.com/nashville",
        "https://www.procareofnashville.com/",
        "https://www.chemdryofnashville.com/",
        "https://www.blazerbrosrugcleanerstn.com/",
        "https://www.localcarpetquotes.com/",
        "https://appliancemastery.com/what-is-a-dryer-vent-cleaning/",
        "https://ductdoctor.com/7-benefits-of-cleaning-dryer-vents/",
        "https://www.marthastewart.com/how-to-clean-dryer-vent-8649137",
        "https://www.consumerreports.org/appliances/clothes-dryers/how-to-clean-your-dryer-vent-a5519869329/",
        "https://allstarpros.com/",
        "https://request.angi.com/",
        "https://www.dryerventwizard.com/dryer-vent-cleaning-benefits",
        "https://www.westhoustondryerventcleaning.com/the-benefits-of-a-clean-dryer-vent/",
        "https://www.ductgurus.com/benefits-of-dryer-vent-cleaning/",
        "https://www.popularmechanics.com/home/a40119985/dryer-maintenance-tips/",
        "https://allstarpros.com/",
        "https://localhvacquotes.com/d/VentCleaning",
        "https://www.epa.gov/indoor-air-quality-iaq/should-you-have-air-ducts-your-home-cleaned",
        "https://www.bobvila.com/articles/cleaning-air-ducts/",
        "https://www.epa.gov/indoor-air-quality-iaq/should-i-have-air-ducts-my-home-cleaned",
        "https://www.bobvila.com/articles/reasons-to-skip-cleaning-air-ducts/",
        "https://todayshomeowner.com/hvac/guides/what-you-need-to-know-about-cleaning-air-ducts/",
        "https://williamson.mcneece.hvac-dealer.com/",
    ]

    def parse(self, response):
        # A page counts as a match if it contains an element commonly used
        # to wrap a comment section.
        if (
            response.css("#comment").get() is not None
            or response.css("#comments").get() is not None
            or response.css(".comments").get() is not None
        ):
            yield {
                "match": response.url,
            }
        # Follow every link on the page to keep the broad crawl going.
        links = response.css('a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse)
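For context on running it: assuming the standard project layout that scrapy startproject BacklinkFinder generates (which the module paths in settings suggest), the crawl is started from the project root with scrapy crawl links, where links is the spider's name attribute.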
My pipeline module is where I upload any matches the spider finds to the database. The from_crawler classmethod grabs the DB info from settings and populates the class attributes. That way I don't have to hard-code the DB info here and instead have everything neatly organized and easy to find in settings. In process_item, each match yielded by the spider has its URL inserted into the database using a parameterized query.
from itemadapter import ItemAdapter
import mysql.connector


class BacklinkfinderPipeline:
    def __init__(self, db_host, db_user, db_password, db_name):
        self.db_host = db_host
        self.db_user = db_user
        self.db_password = db_password
        self.db_name = db_name

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection details out of settings.py so nothing is hard-coded here.
        return cls(
            db_host=crawler.settings.get("DB_HOST"),
            db_user=crawler.settings.get("DB_USER"),
            db_password=crawler.settings.get("DB_PASSWORD"),
            db_name=crawler.settings.get("DB_NAME"),
        )

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get("match") is not None:
            mydb = mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name,
            )
            cursor = mydb.cursor()
            values = [adapter["match"]]
            cursor.execute("INSERT INTO `matches` (url) VALUES (%s)", values)
            mydb.commit()
            cursor.close()
            mydb.close()
        return item
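Opening a fresh connection for every match works, but Scrapy pipelines also expose open_spider and close_spider hooks, so a variant could hold one connection for the whole crawl. This is only a sketch of that alternative, not what the project actually ships:

import mysql.connector
from itemadapter import ItemAdapter


class SingleConnectionPipeline:
    """Sketch: same behaviour as BacklinkfinderPipeline, but one shared connection."""

    def open_spider(self, spider):
        # Called once when the crawl starts.
        self.mydb = mysql.connector.connect(
            host=spider.settings.get("DB_HOST"),
            user=spider.settings.get("DB_USER"),
            password=spider.settings.get("DB_PASSWORD"),
            database=spider.settings.get("DB_NAME"),
        )
        self.cursor = self.mydb.cursor()

    def close_spider(self, spider):
        # Called once when the crawl finishes.
        self.cursor.close()
        self.mydb.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get("match") is not None:
            self.cursor.execute("INSERT INTO `matches` (url) VALUES (%s)", [adapter["match"]])
            self.mydb.commit()
        return item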
Most of my settings are just configuring Scrapy for broad crawling. I set ROBOTSTXT_OBEY to True to be respectful of servers that don't want to be crawled, and set DOWNLOAD_DELAY to 3 so no single domain gets overloaded with requests all at once. I disabled cookies, disabled retries, reduced the download timeout, and disabled redirects to speed up crawl times, and I increased the concurrent request limit and reactor threadpool size so the crawler can work across more domains at once.
BOT_NAME = "BacklinkFinder" SPIDER_MODULES = ["BacklinkFinder.spiders"] NEWSPIDER_MODULE = "BacklinkFinder.spiders" #DB Connection Settings DB_HOST = "localhost" DB_USER = "root" DB_PASSWORD = "" DB_NAME = "links" ##This Priority Queue Works Better Than ScrapyPriorityQueue When Doing Broad Crawls SCHEDULER_PRIORITY_QUEUE = "scrapy.pqueues.DownloaderAwarePriorityQueue" # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = "BacklinkFinder (+http://www.yourdomain.com)" # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 ##For Broad Crawls The Higher The Better CONCURRENT_REQUESTS = 50 ##Sets DNS Resolution Pool Size REACTOR_THREADPOOL_MAXSIZE = 10 ##INFO logs a lot less info than DEBUG LOG_LEVEL = "INFO" # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 ##This prevents us from overloading a single domain with requests DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False ##We don't need them and turning them off reduces the download size COOKIES_ENABLED = False ##So we don't get stuck retrying a broken domain and using up resources RETRY_ENABLED = False ##So we don't waste resources waiting forever for a domain to resolve DOWNLOAD_TIMEOUT = 15 ##Redirects can waste a lot of time so we abandon them to save resources REDIRECT_ENABLED = False ##If the site is AJAX crawlable access the plain html version provided AJAXCRAWL_ENABLED = True # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", # "Accept-Language": "en", #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # "BacklinkFinder.middlewares.BacklinkfinderSpiderMiddleware": 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # "BacklinkFinder.middlewares.BacklinkfinderDownloaderMiddleware": 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # "scrapy.extensions.telnet.TelnetConsole": None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html ##Include our pipelines ITEM_PIPELINES = { "BacklinkFinder.pipelines.BacklinkfinderPipeline": 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED 
= True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = "httpcache" #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" # Set settings whose default value is deprecated to a future-proof value REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" FEED_EXPORT_ENCODING = "utf-8" ##This forces us to crawl in BFO order which saves memory resources DEPTH_PRIORITY = 1 ##type of disk queue that will be used by scheduler. Set to FIFO SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue" ##type of in-memory queue used by scheduler. Set to FIFO SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
I made two basic web pages to display links. The first displays all the matches I haven't gone through yet, in a format that makes it easy to copy and paste many links at once: I add a line break every 400 links so I can copy a batch of 400 and run it through Majestic (an online SEO tool), which processes a maximum of 400 links at a time. The second page displays links that I have already filtered through Majestic and that Majestic rates as valuable pages to get a backlink from.
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <link rel="stylesheet" href="index.css"> <title>Link Finder</title> </head> <body> <?php $servername = "localhost"; $username = "root"; $password = ""; $database = "links"; $conn = new mysqli($servername, $username, $password, $database); if ($conn->connect_error) { die("Connection failed: " . $conn->connect_error); } $sql = "SELECT * FROM `matches` WHERE `citation-flow` is NULL"; $result = $conn->query($sql); if ($result->num_rows > 0) { echo $result->num_rows . " pages not run through majestic</br></br>"; $i = 1; while($row = $result->fetch_assoc()) { ?><a href="<?php echo $row["url"] ?>" target="_blank"><?php echo $row["url"] ?></a></br><?php if ($i % 400 == 0) { ?></br><?php } $i++; } } $conn->close(); ?> </body> </html>
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <link rel="stylesheet" href="index.css"> <title>Link Finder</title> </head> <body> <?php $servername = "localhost"; $username = "root"; $password = ""; $database = "links"; // Create connection $conn = new mysqli($servername, $username, $password, $database); // Check connection if ($conn->connect_error) { die("Connection failed: " . $conn->connect_error); } $sql = "SELECT * FROM `matches` WHERE `trust-flow` > 10 AND `citation-flow` > 10 AND (`language` = 'en' OR `language` is NULL)"; $result = $conn->query($sql); if ($result->num_rows > 0) { echo $result->num_rows . " matches with citation flow and trust flow greater than 10 and written in english or unknown.</br></br>"; $i = 1; while($row = $result->fetch_assoc()) { ?><a href="<?php echo $row["url"] ?>" target="_blank"><?php echo $row["url"] ?></a></br><?php if ($i % 400 == 0) { ?></br><?php } $i++; } } $conn->close(); ?> </body> </html>
Majestic provides a lot of useful metrics for a page, and I can download all of them as a CSV file. I made a tool that goes through those CSV files and uploads the metrics I care about for each page to my database.
import os
import csv

import mysql.connector

directory = 'files'

mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="links"
)
cursor = mydb.cursor()

# Walk every Majestic export in the files directory and copy its metrics into the matches table.
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f):
        print(f)
        with open(f, encoding='utf-8') as csvf:
            csvReader = csv.DictReader(csvf)
            dbvalues = []
            for row in csvReader:
                # The first column ("Item", the page URL) arrives with a UTF-8 BOM stuck to its header.
                dbrow = (row['TrustFlow'], row['CitationFlow'], row['Language'], row['\ufeff"Item"'])
                dbvalues.append(dbrow)
            cursor.executemany(
                'UPDATE `matches` SET `trust-flow`=%s,`citation-flow`=%s,`language`=%s WHERE `url`=%s',
                dbvalues
            )
            mydb.commit()

cursor.close()
mydb.close()
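The odd '\ufeff"Item"' key exists because the export starts with a UTF-8 byte-order mark, which keeps the csv module from recognizing the quoted Item header. Opening the files with the utf-8-sig codec should strip the BOM so the column can be read as plain Item; a small sketch of that variant of the read loop, under the same assumptions about the Majestic export:

# Variant of the with-block above; utf-8-sig strips the BOM so the first
# header parses as a normal quoted field named Item.
with open(f, encoding='utf-8-sig') as csvf:
    for row in csv.DictReader(csvf):
        dbrow = (row['TrustFlow'], row['CitationFlow'], row['Language'], row['Item'])
        dbvalues.append(dbrow)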