Here is a simple scraper written in Python.
The technique used in this scraper is called corpus...
The technique used in this scraper is called corpus...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Build a plain-text corpus from Simple English Wikipedia fruit articles.

The script reads the "List of fruits" page, collects the article links found
on it, downloads each linked article and stores the text of its ``<p>``
elements as one ``.txt`` file per article.
"""
# dependencies BeautifulSoup

import os
import ssl
import re
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup

# Matches bracketed numeric reference markers such as "[12]".
_CITATION_RE = re.compile(r"\[\d+\]")


class WikiFruitCorpus:
    """Collect fruit article links and save each article's text to disk."""

    # URL of the page the article links are read from (set in __init__).
    _url = ""
    # HTTP headers sent with every request (set through set_headers).
    __headers = ""
    # Directory containing this script, with a trailing slash.
    __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"

    # A not Recommended solution to SSL ERROR
    # WARNING: this replaces the default HTTPS context factory for the whole
    # process, so certificates are not verified for any urlopen() call made
    # after this class body has been executed.
    ssl._create_default_https_context = ssl._create_unverified_context

    def __init__(self, header="Mozilla/5.0"):
        """Point the scraper at the fruit list page.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self._url = "https://simple.wikipedia.org/wiki/List_of_fruits"
        self.set_headers(header)

    def set_headers(self, header):
        """Replace the request headers with a single ``User-Agent`` entry.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self.__headers = {"User-Agent": header}

    def get_fruit_links(self):
        """Return the ``/wiki/...`` hrefs taken from the list page.

        Every anchor whose ``href`` starts with ``/wiki/`` is collected in
        document order (non-ASCII characters removed), then the result is
        sliced to positions 2..86.

        Returns:
            A list of site-relative link strings.
        """
        req = Request(self._url, headers=self.__headers)
        with urlopen(req) as page:
            soup = BeautifulSoup(page, "html.parser")

        fruit_links = []
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            # Anchors without an href yield None and are skipped.
            if isinstance(href, str) and href.startswith("/wiki/"):
                fruit_links.append(self.__remove_non_ascii(href))

        # NOTE(review): the 2:87 window is hard-coded; presumably it selected
        # the fruit entries of the page as it looked when this was written.
        # Verify against the live page.
        return fruit_links[2:87]

    @staticmethod
    def __remove_non_ascii(text):
        """Return ``text`` with every character of code point >= 128 dropped."""
        return "".join(ch for ch in text if ord(ch) < 128)

    @staticmethod
    def __tidy_text(text):
        """Normalise period/line-break spacing and drop ``[n]`` markers."""
        # NOTE(review): the first literal contained a raw line break in the
        # published source; it is read here as period + space + newline.
        # Confirm it was not meant to be just ". ".
        text = text.replace(". \n", ".\n")
        # Remove the tags at the end of the line --> [x]
        return _CITATION_RE.sub("", text)

    def create_corpus(self, links_list):
        """Download every linked article and save its paragraph text.

        Files are written to ``fruit_corpus_simplewiki/`` next to this script,
        one ``<last path segment>.txt`` per link.

        Args:
            links_list: Site-relative links such as ``"/wiki/Apple"``.
        """
        dir_name = "fruit_corpus_simplewiki"
        dir_loc = self.__dir_location + dir_name + "/"
        base_url = "https://simple.wikipedia.org"

        # Create the directory at the location the files are written to, not
        # relative to the current working directory.
        try:
            os.makedirs(dir_loc, exist_ok=True)
        except OSError as e:
            print(e)

        for link in links_list:
            fruit_url = base_url + link
            print("Working with ", fruit_url)

            # Send the same headers as the list-page request.
            req = Request(fruit_url, headers=self.__headers)
            with urlopen(req) as page:
                soup = BeautifulSoup(page.read(), "html.parser")

            paragraphs = soup.find_all("p")
            raw_text = "\n".join(p.text for p in paragraphs)
            main_content = self.__tidy_text(self.__remove_non_ascii(raw_text))

            # Last path segment of the link, e.g. "/wiki/Apple" -> "Apple".
            filename = link.rsplit("/", 1)[-1]
            try:
                # opening file with 'wb' may causes the TypeError
                with open(dir_loc + filename + ".txt", "w",
                          encoding="utf-8") as f:
                    f.write(main_content)
            except IOError as e:
                print("Error in writing to file", e)
            else:
                # Report success only when the write actually succeeded.
                print("File \"%s.txt\" saved in %s" % (filename, dir_loc))


def main():
    """Scrape the fruit list and build the corpus with a browser-like UA."""
    corpus = WikiFruitCorpus()
    # NOTE(review): "Mo5zilla" and the missing space before "Gecko" look like
    # typos; the string is kept exactly as published.
    header = ("Mo5zilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3)"
              "Gecko/20090913 Firefox/3.5.3")
    corpus.set_headers(header)
    fruit_list = corpus.get_fruit_links()
    corpus.create_corpus(fruit_list)


if __name__ == "__main__":
    main()
No comments :
Post a Comment