Jump to content
0xf

OB Config - Startpage.com Scraper

Recommended Posts

Hier mal meine Config für OpenBullet 2, um über Startpage Google-Resultate zu scrapen.
Unbedingt Proxies verwenden, da Startpage ansonsten schnell ein CAPTCHA verlangt.
Ebenso ist ein zusätzlicher Eintrag in eurer Environment.ini (C:\Users\Administrator\OpenBullet2\UserData) nötig:

[WORDLIST TYPE]
Name=Query
Regex=^.*$
Verify=True
Separator=
Slices=QUERY

Config als Lolicode:

// Replaces every space in the QUERY slice of the input line with "+" and stores
// the result in regexReplaceOutput, which the search requests embed in their POST body.
// NOTE(review): other reserved characters (e.g. "&" or "=") are not encoded here;
// confirm the wordlist never contains them.
BLOCK:RegexReplace
LABEL:replace space with plus in queries
  original = $"<input.QUERY>"
  pattern = "\\ "
  replacement = "+"
  => VAR @regexReplaceOutput
ENDBLOCK

// Requests result page 1 from Startpage by POSTing the search form with
// browser-like headers. The body is URL-encoded form data, so the content type
// of the block is declared as application/x-www-form-urlencoded, in line with
// the Content-Type entry of the custom headers.
BLOCK:HttpRequest
LABEL:Startpage Page 1
  url = "https://www.startpage.com/sp/search"
  method = POST
  customCookies = ${}
  customHeaders = {("Cache-Control", "max-age=0"), ("Sec-Ch-Ua", "\";Not A Brand\";v=\"99\", \"Chromium\";v=\"94\""), ("Sec-Ch-Ua-Mobile", "?0"), ("Sec-Ch-Ua-Platform", "\"Windows\""), ("Upgrade-Insecure-Requests", "1"), ("Origin", "https://www.startpage.com"), ("Content-Type", "application/x-www-form-urlencoded"), ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"), ("Sec-Fetch-Site", "same-origin"), ("Sec-Fetch-Mode", "navigate"), ("Sec-Fetch-User", "?1"), ("Sec-Fetch-Dest", "document"), ("Referer", "https://www.startpage.com/"), ("Accept-Encoding", "gzip, deflate"), ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"), ("Connection", "close")}
  TYPE:STANDARD
  $"language=deutsch&abp=-1&lui=deutsch&prfe=b126ccae90a4470f4209ffa74e981d8bfd424470b1bf0f689ab468f66412670df83212ec2f0ef7e712f6f1cd6ed58c4ba29355b7b9adb3db62bda232ca44ed1c9e362256615508d71bde1d474c481c38&t=default&query=<regexReplaceOutput>&cat=web&page=1"
  "application/x-www-form-urlencoded"
ENDBLOCK

// Sets the status to BAN when the response contains Startpage's CAPTCHA notice.
// banIfNoMatch = False: no ban is forced when the key does not match.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK

// Collects the innerHTML of every element matching the CSS selector
// ".w-gl__result-url" in the response (RECURSIVE) into the variable parsepage.
// NOTE(review): leftDelim/rightDelim, xPath and pattern look like leftovers from the
// other parse modes and are presumably ignored under MODE:CSS; confirm before removing.
BLOCK:Parse
LABEL:Parse Results
  input = @data.SOURCE
  leftDelim = "class=\"w-gl__result-url result-link\"\\nhref=\""
  rightDelim = "\""
  cssSelector = ".w-gl__result-url"
  attributeName = "innerHTML"
  xPath = "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"w-gl__result-url\", \" \" ))]"
  pattern = "class\\=\\\"w\\-gl\\_\\_result\\-url\\ result\\-link\\\"\\n(\\ )+href\\="
  multiLine = True
  RECURSIVE
  MODE:CSS
  => VAR @parsepage
ENDBLOCK

// Reduces each scraped entry to scheme://host[:port] (the full regex match, "[0]")
// and stores all matches in parseregex.
// The host part accepts any number of dot-separated, non-empty labels, so hosts
// such as www.example.co.uk are captured completely instead of being cut off
// after the third label.
BLOCK:Parse
LABEL:Parse clear Results
  input = @parsepage
  pattern = "(?<protocol>\\w*)\\:\\/\\/(?<host>[\\w\\-]+(?:\\.[\\w\\-]+)+)(?:\\:(?<port>\\d*))?"
  outputFormat = "[0]"
  multiLine = True
  RECURSIVE
  MODE:Regex
  => VAR @parseregex
ENDBLOCK

// Writes the page-1 matches to the temp file; the later pages use FileAppendLines
// on the same path, so this write presumably resets the file for the current input.
// NOTE(review): the path is fixed, so bots running in parallel would presumably
// share (and overwrite) this file; confirm, or run with a single bot.
BLOCK:FileWriteLines
LABEL:write to temp file
  path = "startpagescraper\\temp.txt"
  lines = @parseregex
ENDBLOCK

// Requests result page 2 from Startpage by POSTing the search form with
// browser-like headers. The body is URL-encoded form data, so the content type
// of the block is declared as application/x-www-form-urlencoded, in line with
// the Content-Type entry of the custom headers.
BLOCK:HttpRequest
LABEL:Startpage Page 2
  url = "https://www.startpage.com/sp/search"
  method = POST
  customCookies = ${}
  customHeaders = {("Cache-Control", "max-age=0"), ("Sec-Ch-Ua", "\";Not A Brand\";v=\"99\", \"Chromium\";v=\"94\""), ("Sec-Ch-Ua-Mobile", "?0"), ("Sec-Ch-Ua-Platform", "\"Windows\""), ("Upgrade-Insecure-Requests", "1"), ("Origin", "https://www.startpage.com"), ("Content-Type", "application/x-www-form-urlencoded"), ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"), ("Sec-Fetch-Site", "same-origin"), ("Sec-Fetch-Mode", "navigate"), ("Sec-Fetch-User", "?1"), ("Sec-Fetch-Dest", "document"), ("Referer", "https://www.startpage.com/"), ("Accept-Encoding", "gzip, deflate"), ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"), ("Connection", "close")}
  TYPE:STANDARD
  $"language=deutsch&abp=-1&lui=deutsch&prfe=b126ccae90a4470f4209ffa74e981d8bfd424470b1bf0f689ab468f66412670df83212ec2f0ef7e712f6f1cd6ed58c4ba29355b7b9adb3db62bda232ca44ed1c9e362256615508d71bde1d474c481c38&t=default&query=<regexReplaceOutput>&cat=web&page=2"
  "application/x-www-form-urlencoded"
ENDBLOCK

// Sets the status to BAN when the response contains Startpage's CAPTCHA notice.
// banIfNoMatch = False: no ban is forced when the key does not match.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK

// Collects the innerHTML of every element matching the CSS selector
// ".w-gl__result-url" in the response (RECURSIVE) into the variable parsepage.
// NOTE(review): leftDelim/rightDelim, xPath and pattern look like leftovers from the
// other parse modes and are presumably ignored under MODE:CSS; confirm before removing.
BLOCK:Parse
LABEL:Parse Results
  input = @data.SOURCE
  leftDelim = "class=\"w-gl__result-url result-link\"\\nhref=\""
  rightDelim = "\""
  cssSelector = ".w-gl__result-url"
  attributeName = "innerHTML"
  xPath = "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"w-gl__result-url\", \" \" ))]"
  pattern = "class\\=\\\"w\\-gl\\_\\_result\\-url\\ result\\-link\\\"\\n(\\ )+href\\="
  multiLine = True
  RECURSIVE
  MODE:CSS
  => VAR @parsepage
ENDBLOCK

// Reduces each scraped entry to scheme://host[:port] (the full regex match, "[0]")
// and stores all matches in parseregex.
// The host part accepts any number of dot-separated, non-empty labels, so hosts
// such as www.example.co.uk are captured completely instead of being cut off
// after the third label.
BLOCK:Parse
LABEL:Parse clear Results
  input = @parsepage
  pattern = "(?<protocol>\\w*)\\:\\/\\/(?<host>[\\w\\-]+(?:\\.[\\w\\-]+)+)(?:\\:(?<port>\\d*))?"
  outputFormat = "[0]"
  multiLine = True
  RECURSIVE
  MODE:Regex
  => VAR @parseregex
ENDBLOCK

// Appends the page-2 matches to the temp file.
// NOTE(review): the path is fixed, so bots running in parallel would presumably
// write into the same file; confirm, or run with a single bot.
BLOCK:FileAppendLines
LABEL:write to temp file
  path = "startpagescraper\\temp.txt"
  lines = @parseregex
ENDBLOCK

// Requests result page 3 from Startpage by POSTing the search form with
// browser-like headers. The body is URL-encoded form data, so the content type
// of the block is declared as application/x-www-form-urlencoded, in line with
// the Content-Type entry of the custom headers.
BLOCK:HttpRequest
LABEL:Startpage Page 3
  url = "https://www.startpage.com/sp/search"
  method = POST
  customCookies = ${}
  customHeaders = {("Cache-Control", "max-age=0"), ("Sec-Ch-Ua", "\";Not A Brand\";v=\"99\", \"Chromium\";v=\"94\""), ("Sec-Ch-Ua-Mobile", "?0"), ("Sec-Ch-Ua-Platform", "\"Windows\""), ("Upgrade-Insecure-Requests", "1"), ("Origin", "https://www.startpage.com"), ("Content-Type", "application/x-www-form-urlencoded"), ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"), ("Sec-Fetch-Site", "same-origin"), ("Sec-Fetch-Mode", "navigate"), ("Sec-Fetch-User", "?1"), ("Sec-Fetch-Dest", "document"), ("Referer", "https://www.startpage.com/"), ("Accept-Encoding", "gzip, deflate"), ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"), ("Connection", "close")}
  TYPE:STANDARD
  $"language=deutsch&abp=-1&lui=deutsch&prfe=b126ccae90a4470f4209ffa74e981d8bfd424470b1bf0f689ab468f66412670df83212ec2f0ef7e712f6f1cd6ed58c4ba29355b7b9adb3db62bda232ca44ed1c9e362256615508d71bde1d474c481c38&t=default&query=<regexReplaceOutput>&cat=web&page=3"
  "application/x-www-form-urlencoded"
ENDBLOCK

// Sets the status to BAN when the response contains Startpage's CAPTCHA notice.
// banIfNoMatch = False: no ban is forced when the key does not match.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK

// Collects the innerHTML of every element matching the CSS selector
// ".w-gl__result-url" in the response (RECURSIVE) into the variable parsepage.
// NOTE(review): leftDelim/rightDelim, xPath and pattern look like leftovers from the
// other parse modes and are presumably ignored under MODE:CSS; confirm before removing.
BLOCK:Parse
LABEL:Parse Results
  input = @data.SOURCE
  leftDelim = "class=\"w-gl__result-url result-link\"\\nhref=\""
  rightDelim = "\""
  cssSelector = ".w-gl__result-url"
  attributeName = "innerHTML"
  xPath = "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"w-gl__result-url\", \" \" ))]"
  pattern = "class\\=\\\"w\\-gl\\_\\_result\\-url\\ result\\-link\\\"\\n(\\ )+href\\="
  multiLine = True
  RECURSIVE
  MODE:CSS
  => VAR @parsepage
ENDBLOCK

// Reduces each scraped entry to scheme://host[:port] (the full regex match, "[0]")
// and stores all matches in parseregex.
// The host part accepts any number of dot-separated, non-empty labels, so hosts
// such as www.example.co.uk are captured completely instead of being cut off
// after the third label.
BLOCK:Parse
LABEL:Parse clear Results
  input = @parsepage
  pattern = "(?<protocol>\\w*)\\:\\/\\/(?<host>[\\w\\-]+(?:\\.[\\w\\-]+)+)(?:\\:(?<port>\\d*))?"
  outputFormat = "[0]"
  multiLine = True
  RECURSIVE
  MODE:Regex
  => VAR @parseregex
ENDBLOCK

// Appends the page-3 matches to the temp file.
// NOTE(review): the path is fixed, so bots running in parallel would presumably
// write into the same file; confirm, or run with a single bot.
BLOCK:FileAppendLines
LABEL:write to temp file
  path = "startpagescraper\\temp.txt"
  lines = @parseregex
ENDBLOCK

// Requests result page 4 from Startpage by POSTing the search form with
// browser-like headers. The body is URL-encoded form data, so the content type
// of the block is declared as application/x-www-form-urlencoded, in line with
// the Content-Type entry of the custom headers.
BLOCK:HttpRequest
LABEL:Startpage Page 4
  url = "https://www.startpage.com/sp/search"
  method = POST
  customCookies = ${}
  customHeaders = {("Cache-Control", "max-age=0"), ("Sec-Ch-Ua", "\";Not A Brand\";v=\"99\", \"Chromium\";v=\"94\""), ("Sec-Ch-Ua-Mobile", "?0"), ("Sec-Ch-Ua-Platform", "\"Windows\""), ("Upgrade-Insecure-Requests", "1"), ("Origin", "https://www.startpage.com"), ("Content-Type", "application/x-www-form-urlencoded"), ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"), ("Sec-Fetch-Site", "same-origin"), ("Sec-Fetch-Mode", "navigate"), ("Sec-Fetch-User", "?1"), ("Sec-Fetch-Dest", "document"), ("Referer", "https://www.startpage.com/"), ("Accept-Encoding", "gzip, deflate"), ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"), ("Connection", "close")}
  TYPE:STANDARD
  $"language=deutsch&abp=-1&lui=deutsch&prfe=b126ccae90a4470f4209ffa74e981d8bfd424470b1bf0f689ab468f66412670df83212ec2f0ef7e712f6f1cd6ed58c4ba29355b7b9adb3db62bda232ca44ed1c9e362256615508d71bde1d474c481c38&t=default&query=<regexReplaceOutput>&cat=web&page=4"
  "application/x-www-form-urlencoded"
ENDBLOCK

// Sets the status to BAN when the response contains Startpage's CAPTCHA notice.
// banIfNoMatch = False: no ban is forced when the key does not match.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK

// Collects the innerHTML of every element matching the CSS selector
// ".w-gl__result-url" in the response (RECURSIVE) into the variable parsepage.
// NOTE(review): leftDelim/rightDelim, xPath and pattern look like leftovers from the
// other parse modes and are presumably ignored under MODE:CSS; confirm before removing.
BLOCK:Parse
LABEL:Parse Results
  input = @data.SOURCE
  leftDelim = "class=\"w-gl__result-url result-link\"\\nhref=\""
  rightDelim = "\""
  cssSelector = ".w-gl__result-url"
  attributeName = "innerHTML"
  xPath = "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"w-gl__result-url\", \" \" ))]"
  pattern = "class\\=\\\"w\\-gl\\_\\_result\\-url\\ result\\-link\\\"\\n(\\ )+href\\="
  multiLine = True
  RECURSIVE
  MODE:CSS
  => VAR @parsepage
ENDBLOCK

// Reduces each scraped entry to scheme://host[:port] (the full regex match, "[0]")
// and stores all matches in parseregex.
// The host part accepts any number of dot-separated, non-empty labels, so hosts
// such as www.example.co.uk are captured completely instead of being cut off
// after the third label.
BLOCK:Parse
LABEL:Parse clear Results
  input = @parsepage
  pattern = "(?<protocol>\\w*)\\:\\/\\/(?<host>[\\w\\-]+(?:\\.[\\w\\-]+)+)(?:\\:(?<port>\\d*))?"
  outputFormat = "[0]"
  multiLine = True
  RECURSIVE
  MODE:Regex
  => VAR @parseregex
ENDBLOCK

// Appends the page-4 matches to the temp file.
// NOTE(review): the path is fixed, so bots running in parallel would presumably
// write into the same file; confirm, or run with a single bot.
BLOCK:FileAppendLines
LABEL:write to temp file
  path = "startpagescraper\\temp.txt"
  lines = @parseregex
ENDBLOCK

// Requests result page 5 from Startpage by POSTing the search form with
// browser-like headers. The body is URL-encoded form data, so the content type
// of the block is declared as application/x-www-form-urlencoded, in line with
// the Content-Type entry of the custom headers.
BLOCK:HttpRequest
LABEL:Startpage Page 5
  url = "https://www.startpage.com/sp/search"
  method = POST
  customCookies = ${}
  customHeaders = {("Cache-Control", "max-age=0"), ("Sec-Ch-Ua", "\";Not A Brand\";v=\"99\", \"Chromium\";v=\"94\""), ("Sec-Ch-Ua-Mobile", "?0"), ("Sec-Ch-Ua-Platform", "\"Windows\""), ("Upgrade-Insecure-Requests", "1"), ("Origin", "https://www.startpage.com"), ("Content-Type", "application/x-www-form-urlencoded"), ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"), ("Sec-Fetch-Site", "same-origin"), ("Sec-Fetch-Mode", "navigate"), ("Sec-Fetch-User", "?1"), ("Sec-Fetch-Dest", "document"), ("Referer", "https://www.startpage.com/"), ("Accept-Encoding", "gzip, deflate"), ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"), ("Connection", "close")}
  TYPE:STANDARD
  $"language=deutsch&abp=-1&lui=deutsch&prfe=b126ccae90a4470f4209ffa74e981d8bfd424470b1bf0f689ab468f66412670df83212ec2f0ef7e712f6f1cd6ed58c4ba29355b7b9adb3db62bda232ca44ed1c9e362256615508d71bde1d474c481c38&t=default&query=<regexReplaceOutput>&cat=web&page=5"
  "application/x-www-form-urlencoded"
ENDBLOCK

// Sets the status to BAN when the response contains Startpage's CAPTCHA notice.
// banIfNoMatch = False: no ban is forced when the key does not match.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK

// Collects the innerHTML of every element matching the CSS selector
// ".w-gl__result-url" in the response (RECURSIVE) into the variable parsepage.
// NOTE(review): leftDelim/rightDelim, xPath and pattern look like leftovers from the
// other parse modes and are presumably ignored under MODE:CSS; confirm before removing.
BLOCK:Parse
LABEL:Parse Results
  input = @data.SOURCE
  leftDelim = "class=\"w-gl__result-url result-link\"\\nhref=\""
  rightDelim = "\""
  cssSelector = ".w-gl__result-url"
  attributeName = "innerHTML"
  xPath = "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"w-gl__result-url\", \" \" ))]"
  pattern = "class\\=\\\"w\\-gl\\_\\_result\\-url\\ result\\-link\\\"\\n(\\ )+href\\="
  multiLine = True
  RECURSIVE
  MODE:CSS
  => VAR @parsepage
ENDBLOCK

// Reduces each scraped entry to scheme://host[:port] (the full regex match, "[0]")
// and stores all matches in parseregex.
// The host part accepts any number of dot-separated, non-empty labels, so hosts
// such as www.example.co.uk are captured completely instead of being cut off
// after the third label.
BLOCK:Parse
LABEL:Parse clear Results
  input = @parsepage
  pattern = "(?<protocol>\\w*)\\:\\/\\/(?<host>[\\w\\-]+(?:\\.[\\w\\-]+)+)(?:\\:(?<port>\\d*))?"
  outputFormat = "[0]"
  multiLine = True
  RECURSIVE
  MODE:Regex
  => VAR @parseregex
ENDBLOCK

// Appends the page-5 matches to the temp file.
// NOTE(review): the path is fixed, so bots running in parallel would presumably
// write into the same file; confirm, or run with a single bot.
BLOCK:FileAppendLines
LABEL:write to temp file
  path = "startpagescraper\\temp.txt"
  lines = @parseregex
ENDBLOCK

// Reads the lines collected in the temp file back into fileReadLinesOutput.
BLOCK:FileReadLines
LABEL:read from temp file
  path = "startpagescraper\\temp.txt"
  => VAR @fileReadLinesOutput
ENDBLOCK

// Removes repeated entries from the list read from the temp file; the result
// is stored in removeDuplicatesOutput.
BLOCK:RemoveDuplicates
  list = @fileReadLinesOutput
  => VAR @removeDuplicatesOutput
ENDBLOCK

// Appends the de-duplicated entries of the current input to domains.txt.
// NOTE(review): de-duplication only covers the entries of this run, so
// domains.txt may still accumulate duplicates across inputs; confirm whether
// that is acceptable.
BLOCK:FileAppendLines
  path = "startpagescraper\\domains.txt"
  lines = @removeDuplicatesOutput
ENDBLOCK

// Final status of the input:
//   SUCCESS - removeDuplicatesOutput matches the scheme://host pattern,
//   CUSTOM  - the response contains Startpage's German "no results" message,
//   BAN     - the response contains the CAPTCHA notice.
// NOTE(review): data.SOURCE at this point is presumably only the response of the
// last request (page 5); confirm.
// NOTE(review): removeDuplicatesOutput is the output of a list block but is used
// in a STRINGKEY; confirm how the list is converted for the regex comparison.
BLOCK:Keycheck
  banIfNoMatch = False
  KEYCHAIN SUCCESS OR
    STRINGKEY @removeDuplicatesOutput MatchesRegex "(?<protocol>\\w*)\\:\\/\\/(?:(?:(?<thld>[\\w\\-]*)(?:\\.))?(?<sld>[\\w\\-]*))\\.(?<tld>\\w*)(?:\\:(?<port>\\d*))?"
  KEYCHAIN CUSTOM OR
    STRINGKEY @data.SOURCE Contains "Oh, es gibt keine Ergebnisse für diese Suche."
  KEYCHAIN BAN OR
    STRINGKEY @data.SOURCE Contains "Please complete the CAPTCHA"
ENDBLOCK


 

Share this post


Link to post












Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
×