data
PaperItem
A structure that stores the data of a paper.
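| Attribute | Description |
|---|---|
| arxiv_id | The arXiv id of the paper. |
| ss_id | The Semantic Scholar id of the paper. |
| citations | Set of ss_ids of the papers that cite this paper. |
| references | Set of ss_ids of the papers cited by this paper. |
| authors | List of Semantic Scholar author data of this paper. |
| publication | The publication types of the paper (a list). |
| date | The publication date of the paper. |
| title | The title of the paper. |
| abstract | The abstract of this paper. |
| content | The text of this paper from arXiv. |
| logger | The logger used by this item. |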
Source code in src/racp/data.py
class PaperItem:
    '''A structure that stores the data of a paper.

    Attributes:
        arxiv_id: The arXiv id of the paper.
        ss_id: The Semantic Scholar id of the paper.
        citations: Set of ss_ids of the papers that cite this paper.
        references: Set of ss_ids of the papers cited by this paper.
        authors: List of Semantic Scholar author data of this paper.
        publication: The publication types of the paper (a list).
        date: The publication date, as found in the Semantic Scholar record.
        title: The title, as found in the Semantic Scholar record.
        abstract: The abstract, as found in the Semantic Scholar record.
        content: The text of this paper, fetched from arXiv.
        logger: Logger passed on to the crawl and save helpers.
    '''

    def __init__(
        self,
        arxiv_id=None,
        ss_id=None,
        data=None,
        logger=crawl.logger,
        key=""
    ) -> None:
        '''Initialize from an arXiv id and/or a json dictionary.

        Args:
            arxiv_id: If given, the paper is fetched via get_data_by_arxiv.
            ss_id: Accepted for interface compatibility; this constructor
                does not use it.
            data: If given, a dictionary loaded with load_json. It is loaded
                after the arXiv fetch, so it overwrites the fetched fields.
            logger: Logger passed on to the crawl and save helpers.
            key: API key forwarded to the crawl helpers, if you have one.
        '''
        self.arxiv_id = ""
        self.ss_id = ""
        self.citations = set()
        self.references = set()
        self.authors = []
        self.publication = []
        self.date = ""
        self.title = ""
        self.abstract = ""
        self.content = ""
        self.logger = logger
        self._quality = None  # lazily computed cache for the `quality` property
        if arxiv_id is not None:
            self.get_data_by_arxiv(arxiv_id, key)
        if data is not None:
            self.load_json(data)

    def __repr__(self) -> str:
        return json.dumps(self.to_json(), indent=2)

    def get_data_by_arxiv(self, arxiv_id, key):
        '''Fill this item with crawled data for the given arXiv id.

        Raises:
            ConnectionError: If one of the crawl helpers fails. The original
                exception is chained as the cause.
        '''
        try:
            data = crawl.get_ss_data_by_arxiv(arxiv_id, self.logger, key)
        except Exception as err:
            raise ConnectionError("Fail to get semantics scholar data.") from err
        self.arxiv_id = arxiv_id
        self.ss_id = data["paperId"]
        self.citations = {item["paperId"] for item in data["citations"]}
        self.references = {item["paperId"] for item in data["references"]}
        try:
            self.authors = crawl.get_author_info(
                [item["authorId"] for item in data["authors"]],
                self.logger,
                key,
            )
        except Exception as err:
            raise ConnectionError("Fail to get semantics scholar data.") from err
        self.publication = data["publicationTypes"]
        self.date = data["publicationDate"]
        self.title = data["title"]
        self.abstract = data["abstract"]
        try:
            self.content = crawl.get_arxiv_data(self.arxiv_id, self.logger)
        except Exception as err:
            raise ConnectionError("Fail to get arXiv data.") from err
        # The citations changed, so a previously cached quality is stale.
        self._quality = None

    def to_json(self):
        '''Convert to json format (a plain dictionary; sets become lists).'''
        return {
            "arxivId": self.arxiv_id,
            "paperId": self.ss_id,
            "citations": list(self.citations),
            "references": list(self.references),
            "authors": self.authors,
            "publication": self.publication,
            "date": self.date,
            "title": self.title,
            "abstract": self.abstract,
            "content": self.content
        }

    def to_Document(self):
        """Convert to a langchain Document.

        The page content is the abstract. When the abstract is None, the
        first 250 characters of the content are used instead, or an empty
        string when the content is missing too. The metadata holds the arXiv
        id ("source"), the title and the quality rendered as a string.
        """
        from langchain_core.documents.base import Document
        abstract = self.abstract
        if abstract is None:
            # TODO : content retrival
            # `or ""` keeps the fallback from failing on a None content.
            abstract = (self.content or "")[:250]
        metadata = {
            "source": self.arxiv_id,
            "title": self.title,
            "quality": str(self.quality),
        }
        return Document(metadata=metadata, page_content=abstract)

    def save_json(self, save_path):
        '''Save as a json file named "<arxiv_id>.json" inside save_path.'''
        filename = f"{self.arxiv_id}.json"
        # Calls the module-level save_json helper, not this method.
        save_json(self.to_json(), os.path.join(save_path, filename),
                  self.logger, filename)

    def load_json(self, json_data):
        '''Load from a json dictionary.

        Missing keys fall back to empty defaults. The item is only modified
        when every field could be read, so a failed load leaves it unchanged.

        Raises:
            ValueError: If json_data is not a dictionary, or if its
                "citations"/"references" values cannot be turned into sets.
        '''
        if not isinstance(json_data, dict):
            raise ValueError("Please pass in a dictionary")
        try:
            citations = set(json_data.get("citations", []))
            references = set(json_data.get("references", []))
        except TypeError as err:
            raise ValueError("Fail to load data, please check the items.") from err
        self.arxiv_id = json_data.get("arxivId", "")
        self.ss_id = json_data.get("paperId", "")
        self.citations = citations
        self.references = references
        self.authors = json_data.get("authors", [])
        self.publication = json_data.get("publication", [])
        self.date = json_data.get("date", "")
        self.title = json_data.get("title", "")
        self.abstract = json_data.get("abstract", "")
        self.content = json_data.get("content", "")
        # The citations changed, so a previously cached quality is stale.
        self._quality = None

    @property
    def quality(self):
        """Evaluate confidence quality.

        Currently log(number of citations + 1); the author and date terms
        are placeholders fixed at 0. The value is computed once and cached.
        """
        if self._quality is None:  # Calculate only if not computed yet
            # TODO: normalize citation
            cite_num = len(self.citations)
            # The log flattens the score: a few dozen citations are already
            # considered enough.
            cite_score = np.log(cite_num + 1)
            # TODO: Author score considering the history of publication
            author_score = 0
            # TODO: date score from the paper's age in days, sketched as
            # e * x * exp(-x) with x = days_since_publication / 225.
            dates_core = 0
            self._quality = dates_core + cite_score + author_score
        return self._quality
quality
property
Evaluate confidence quality.
__init__(arxiv_id=None, ss_id=None, data=None, logger=crawl.logger, key='')
Initialize with an arXiv id or an ss id. Pass in an API key if you have one.
Source code in src/racp/data.py
def __init__(
    self,
    arxiv_id=None,
    ss_id=None,
    data=None,
    logger=crawl.logger,
    key=""
) -> None:
    '''Create an empty item, then optionally fill it.

    When arxiv_id is given the data is fetched with get_data_by_arxiv
    (key is forwarded as the API key). When data is given it is then loaded
    with load_json. ss_id is accepted but not used by this constructor.
    '''
    # Bookkeeping first.
    self.logger = logger
    self._quality = None

    # Empty defaults for every stored field.
    self.arxiv_id, self.ss_id = "", ""
    self.citations, self.references = set(), set()
    self.authors, self.publication = [], []
    self.date, self.title = "", ""
    self.abstract, self.content = "", ""

    # Optional population, in this order: remote fetch, then dictionary.
    if arxiv_id is not None:
        self.get_data_by_arxiv(arxiv_id, key)
    if data is not None:
        self.load_json(data)
load_json(json_data)
Load from a json dictionary
Source code in src/racp/data.py
def load_json(self, json_data):
    '''Load from a json dictionary.

    Missing keys fall back to empty defaults. The item is only modified when
    every field could be read, so a failed load leaves it unchanged.

    Raises:
        ValueError: If json_data is not a dictionary, or if its
            "citations"/"references" values cannot be turned into sets.
    '''
    if not isinstance(json_data, dict):
        raise ValueError("Please pass in a dictionary")
    try:
        # set() is the only step that can fail on a dict; do it before
        # touching self so a bad record does not leave a half-loaded item.
        citations = set(json_data.get("citations", []))
        references = set(json_data.get("references", []))
    except TypeError as err:
        raise ValueError("Fail to load data, please check the items.") from err
    self.arxiv_id = json_data.get("arxivId", "")
    self.ss_id = json_data.get("paperId", "")
    self.citations = citations
    self.references = references
    self.authors = json_data.get("authors", [])
    self.publication = json_data.get("publication", [])
    self.date = json_data.get("date", "")
    self.title = json_data.get("title", "")
    self.abstract = json_data.get("abstract", "")
    self.content = json_data.get("content", "")
    # The citations changed, so a previously cached quality is stale.
    self._quality = None
save_json(save_path)
Save as a json file
Source code in src/racp/data.py
def save_json(self, save_path):
    '''Write this item to "<arxiv_id>.json" inside save_path.'''
    filename = f"{self.arxiv_id}.json"
    target = os.path.join(save_path, filename)
    # Delegates to the save_json helper; the file name is also passed as
    # that helper's last argument.
    save_json(self.to_json(), target, self.logger, filename)
to_Document()
Convert to document format
Source code in src/racp/data.py
def to_Document(self):
    """Convert to a langchain Document.

    The page content is the abstract. When the abstract is None, the first
    250 characters of the content are used instead, or an empty string when
    the content is missing too. The metadata holds the arXiv id ("source"),
    the title and the quality rendered as a string.
    """
    from langchain_core.documents.base import Document
    abstract = self.abstract
    if abstract is None:
        # TODO : content retrival
        # `or ""` keeps the fallback from failing on a None content.
        abstract = (self.content or "")[:250]
    metadata = {
        "source": self.arxiv_id,
        "title": self.title,
        "quality": str(self.quality),
    }
    return Document(metadata=metadata, page_content=abstract)
to_json()
Convert to json format
Source code in src/racp/data.py
def to_json(self):
    '''Return the item as a plain dictionary; the id sets become lists.'''
    record = {"arxivId": self.arxiv_id, "paperId": self.ss_id}
    record["citations"] = list(self.citations)
    record["references"] = list(self.references)
    record["authors"] = self.authors
    record["publication"] = self.publication
    record["date"] = self.date
    record["title"] = self.title
    record["abstract"] = self.abstract
    record["content"] = self.content
    return record
RawSet
Bases: Dataset
A torch Dataset storing raw data.
Source code in src/racp/data.py
class RawSet(Dataset):
    '''A torch Dataset storing raw data.

    Attributes:
        items: List of PaperItems.
        id2idx: Maps an item's arxiv_id to its position in items.
    '''

    def __init__(self, save_path=None, length=-1) -> None:
        '''Create the dataset, optionally loading json files from save_path.

        Args:
            save_path: Directory of json files to load; nothing is loaded
                when it is None.
            length: Maximum number of items to load; values <= 0 mean no limit.
        '''
        super().__init__()
        self.items = []  # List of PaperItems
        self.id2idx = {}
        if save_path is not None:
            self._load_from_directory(save_path, length)

    def _register(self, item):
        '''Append item and record its position under its arxiv_id.'''
        self.id2idx[item.arxiv_id] = len(self.items)
        self.items.append(item)

    def _load_from_directory(self, save_path, length=-1):
        '''Load json files from given directory.

        Files that cannot be opened or parsed are reported with print and
        skipped. Loading stops once `length` items are held (when length > 0).
        '''
        filenames = os.listdir(save_path)
        for file in tqdm(filenames, total=len(filenames)):
            path = os.path.join(save_path, file)
            try:
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError):
                # ValueError covers json and unicode decoding errors.
                print(path)
                continue
            item = PaperItem()
            item.load_json(data)
            # Index by position in self.items, not by the file counter:
            # skipped files would otherwise shift every later index.
            self._register(item)
            # TODO : remove for deveplop
            if length > 0 and len(self.items) >= length:
                break

    def add_item(self, item: "PaperItem"):
        '''Append one item and make it retrievable by arxiv id.'''
        self._register(item)

    def get_item_by_arxivid(self, arxiv_id):
        '''Return the item with the given arxiv id, or -1 when unknown.'''
        idx = self.id2idx.get(arxiv_id)
        if idx is None:
            return -1
        return self.items[idx]

    def __getitem__(self, index) -> "PaperItem":
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def save(self, filepath):
        '''Save as jsonl file.'''
        with jsonlines.open(filepath, "w") as f:
            for item in self.items:
                f.write(item.to_json())

    def load(self, filepath):
        '''Load from a jsonl file, appending to the items already held.'''
        with jsonlines.open(filepath, "r") as f:
            for record in f:
                self._register(PaperItem(data=record))

    def all_papers(self):
        '''Return a set of Semantic Scholar ids involved.

        This covers the items themselves, their citations and their references.
        '''
        papers = set()
        for item in self.items:
            papers.add(item.ss_id)
            papers.update(item.citations)
            papers.update(item.references)
        return papers

    def all_authors(self):
        '''Return a dictionary with all author ids in the dataset as keys.

        Each value holds the author's "name", "paperCount" and
        "citationCount". Malformed author entries are skipped individually.
        '''
        authors_by_id = {}
        for item in self.items:
            authors = item.authors
            if not authors:
                continue
            for author in authors:
                try:
                    authors_by_id[author["authorId"]] = {
                        "name": author["name"],
                        "paperCount": author["paperCount"],
                        "citationCount": author["citationCount"]
                    }
                except (KeyError, TypeError):
                    # Skip only this entry, not the rest of the paper's authors.
                    continue
        return authors_by_id

    def publication_types(self):
        '''Return a dictionary counting the papers of each publication type.'''
        type_count = {}
        for item in self.items:
            if item.publication is not None:
                for pub_type in item.publication:
                    type_count[pub_type] = type_count.get(pub_type, 0) + 1
        return type_count

    def publication_years(self):
        '''Return a dictionary counting papers each year.

        The year is the part of the date before the first "-". Items
        without a date (None or empty) are not counted.
        '''
        year_count = {}
        for item in self.items:
            if not item.date:
                continue
            year = item.date.split("-")[0]
            year_count[year] = year_count.get(year, 0) + 1
        return year_count

    def paper_citations(self):
        '''Return a dictionary of papers' citation counts, keyed by arxiv id.'''
        return {item.arxiv_id: len(item.citations) for item in self.items}

    def topk(self, paper, k=100):
        """Return the k items of the dataset most similar to `paper`.

        Similarity is the score returned by ccbc(paper, item); the items are
        returned in descending order of that score.
        """
        sim = np.zeros(len(self))
        for i, candidate in enumerate(self.items):
            sim[i] = ccbc(paper, candidate)
        # argsort is ascending, so reverse it before taking the first k.
        topk_indices = np.argsort(sim)[::-1][:k]
        return [self.items[i] for i in topk_indices]

    def load_from_papers(self, papers):
        """Build dataset from papers list (appends to the items already held)."""
        for paperitem in papers:
            self._register(paperitem)
all_authors()
Return a dictionary with all author ids in the dataset as keys.
Source code in src/racp/data.py
def all_authors(self):
    '''Return a dictionary with all author ids in the dataset as keys.

    Each value holds the author's "name", "paperCount" and "citationCount".
    Malformed author entries are skipped individually.
    '''
    authors_by_id = {}
    for item in self.items:
        authors = item.authors
        if not authors:
            continue
        for author in authors:
            try:
                authors_by_id[author["authorId"]] = {
                    "name": author["name"],
                    "paperCount": author["paperCount"],
                    "citationCount": author["citationCount"]
                }
            except (KeyError, TypeError):
                # Skip only this entry, not the rest of the paper's authors.
                continue
    return authors_by_id
all_papers()
Return the set of all Semantic Scholar ids involved.
Source code in src/racp/data.py
def all_papers(self):
    '''Collect every Semantic Scholar id that appears in the dataset.

    That is the id of each item plus the ids in its citations and references.
    '''
    involved = set()
    for paper in self.items:
        involved.add(paper.ss_id)
        involved.update(paper.citations)
        involved.update(paper.references)
    return involved
load(filepath)
Load from a jsonl file.
Source code in src/racp/data.py
def load(self, filepath):
    '''Load from a jsonl file, appending to the items already held.

    Each record read from the file becomes a PaperItem. The item is also
    registered in id2idx so get_item_by_arxivid can find it.
    '''
    with jsonlines.open(filepath, "r") as f:
        for record in f:
            item = PaperItem(data=record)
            # Record the position the item is about to take in self.items.
            self.id2idx[item.arxiv_id] = len(self.items)
            self.items.append(item)
load_from_papers(papers)
Build the dataset from a list of papers.
Source code in src/racp/data.py
def load_from_papers(self, papers):
    """Build dataset from papers list.

    The papers are appended to the items already held. Each one is also
    registered in id2idx so get_item_by_arxivid can find it.
    """
    for paperitem in papers:
        # Record the position the item is about to take in self.items.
        self.id2idx[paperitem.arxiv_id] = len(self.items)
        self.items.append(paperitem)
paper_citations()
Return a dictionary of the papers' citation counts.
Source code in src/racp/data.py
def paper_citations(self):
    '''Map each item's arxiv id to the number of papers citing it.'''
    counts = {}
    for paper in self.items:
        counts[paper.arxiv_id] = len(paper.citations)
    return counts
publication_types()
Return a dictionary counting the papers of each publication type.
Source code in src/racp/data.py
def publication_types(self):
    '''Count, for every publication type, how many items carry it.

    Items whose publication field is None are skipped.
    '''
    tally = {}
    for paper in self.items:
        kinds = paper.publication
        if kinds is None:
            continue
        for kind in kinds:
            if kind in tally:
                tally[kind] += 1
            else:
                tally[kind] = 1
    return tally
publication_years()
Return a dictionary counting the papers published each year.
Source code in src/racp/data.py
def publication_years(self):
    '''Return a dictionary counting papers each year.

    The year is the part of the date before the first "-". Items without a
    date (None or empty) are not counted, rather than being tallied under
    an empty-string "year".
    '''
    year_count = {}
    for item in self.items:
        if not item.date:
            continue
        year = item.date.split("-")[0]
        year_count[year] = year_count.get(year, 0) + 1
    return year_count
save(filepath)
Save as jsonl file.
Source code in src/racp/data.py
def save(self, filepath):
    '''Save the dataset as a jsonl file.

    The to_json() record of every item is written in turn.
    '''
    with jsonlines.open(filepath, "w") as writer:
        for paper in self.items:
            writer.write(paper.to_json())
topk(paper, k=100)
Return the top k most relevant papers.
Source code in src/racp/data.py
def topk(self, paper, k=100):
    """Return the k items of the dataset most similar to `paper`.

    Similarity is the score returned by ccbc(paper, item); the items are
    returned in descending order of that score.
    """
    sim = np.zeros(len(self))
    for i in range(len(self)):
        sim[i] = ccbc(paper, self[i])
    # argsort gives indices in ascending order of score, so reverse it
    # before taking the first k.
    topk_indices = np.argsort(sim)[::-1][:k]
    # Fetch the items matching the top-k indices.
    return [self[i] for i in topk_indices]