Here is a simple scraper written in Python.
The technique used in this scraper is called corpus...
The technique used in this scraper is called corpus...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Dependencies: BeautifulSoup (bs4)
import os
import ssl
import re
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
class WikiFruitCorpus:
    """Build a plain-text corpus from the Simple English Wikipedia fruit list.

    ``get_fruit_links`` collects article paths from the list page, and
    ``create_corpus`` downloads each of those articles and stores the text
    of its ``<p>`` elements in one ``.txt`` file per article.
    """

    _url = ""
    __headers = ""
    # Directory containing this source file (with a trailing slash); the
    # corpus directory is created underneath it.
    __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"
    # Bracketed numeric citation markers such as "[12]".
    __citation_re = re.compile(r"\[\d+\]")

    # WARNING: not a recommended workaround for SSL errors. This swaps the
    # process-wide default HTTPS context factory for one that does not
    # verify certificates, so it affects every urllib HTTPS request made
    # after this class is defined, not only the requests made here.
    ssl._create_default_https_context = ssl._create_unverified_context

    def __init__(self, header="Mozilla/5.0"):
        """Target the fruit list page and set the User-Agent.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self._url = "https://simple.wikipedia.org/wiki/List_of_fruits"
        self.set_headers(header)

    def set_headers(self, header):
        """Replace the request headers with a single ``User-Agent`` entry.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self.__headers = {"User-Agent": header}

    def __fetch_soup(self, url):
        """Download *url* with the configured headers and parse its HTML."""
        request = Request(url, headers=self.__headers)
        # The context manager closes the HTTP response once it is read.
        with urlopen(request) as response:
            return BeautifulSoup(response.read(), "html.parser")

    def get_fruit_links(self):
        """Return the ``/wiki/...`` link paths found on the list page.

        Returns:
            A list of href strings with non-ASCII characters removed.

        Raises:
            urllib.error.URLError: If the list page cannot be downloaded.
        """
        soup = self.__fetch_soup(self._url)
        fruit_links = []
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            # Anchors without an href yield None and are skipped.
            if href and href.startswith("/wiki/"):
                fruit_links.append(self.__remove_non_ascii(href))
        # NOTE(review): the 2..87 window is hard-coded; presumably it trims
        # non-fruit links based on the page layout when this was written.
        # Verify against the current page before relying on it.
        return fruit_links[2:87]

    @staticmethod
    def __remove_non_ascii(text):
        """Return *text* with every character of code point >= 128 removed."""
        return "".join(ch for ch in text if ord(ch) < 128)

    def create_corpus(self, links_list):
        """Download each linked article and save its paragraph text.

        Files are written to ``fruit_corpus_simplewiki/`` inside the
        directory that contains this source file. The file name is the
        last path segment of the link plus ``.txt``.

        Args:
            links_list: Iterable of link paths such as ``"/wiki/Apple"``.

        Raises:
            urllib.error.URLError: If an article cannot be downloaded.
        """
        dir_name = "fruit_corpus_simplewiki"
        dir_loc = self.__dir_location + dir_name + "/"
        base_url = "https://simple.wikipedia.org"

        # Create the directory at the path the files are written to, not
        # relative to the current working directory.
        try:
            os.makedirs(dir_loc, exist_ok=True)
        except OSError as e:
            print(e)

        for link in links_list:
            fruit_url = base_url + link
            print("Working with ", fruit_url)
            # Send the same headers as the list-page request.
            soup = self.__fetch_soup(fruit_url)
            paragraphs = soup.find_all("p")
            main_content = self.__remove_non_ascii(
                "\n".join(p.text for p in paragraphs))
            # Break after each ". " so sentences start on their own line.
            main_content = main_content.replace('. ', '.\n')
            # Remove citation markers of the form [x].
            main_content = self.__citation_re.sub("", main_content)
            filename = link[link.rindex("/") + 1:]
            try:
                # Text mode: the content is str, so 'wb' would raise
                # TypeError. The content is ASCII-only at this point.
                with open(dir_loc + filename + ".txt", "w",
                          encoding="utf-8") as f:
                    f.write(main_content)
            except IOError as e:
                print("Error in writing to file", e)
            else:
                # Report success only when the write actually succeeded.
                print("File \"%s.txt\" saved in %s" % (filename, dir_loc))
def main():
    """Collect the fruit article links and write the corpus to disk."""
    # Browser-like User-Agent. The original literal misspelled "Mozilla"
    # and lost the space before "Gecko" in the implicit concatenation.
    header = ("Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) "
              "Gecko/20090913 Firefox/3.5.3")
    corpus = WikiFruitCorpus(header)
    fruit_list = corpus.get_fruit_links()
    corpus.create_corpus(fruit_list)


if __name__ == "__main__":
    main()
No comments :
Post a Comment